上篇 1_Project Overview, Data Wrangling and Exploratory Analysis

使用不同的机器学习方法进行预测

线性回归

在本笔记中,我们将训练一个线性回归模型,根据历史能耗数据、若干天气变量、一天中的小时、星期几、周末和节假日来预测能源消耗。

为此,我们将用2012-01-01至2014-10-31的每日和每小时能源与天气数据来拟合模型。

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# pandas removed the `display.mpl_style` option; the documented replacement
# is matplotlib's own "ggplot" style sheet.
plt.style.use('ggplot')


def _load_energy_data(path, period_columns):
    """Read one feature workbook and drop its period-boundary columns.

    NOTE(review): later cells read ``df.index.weekday``, so the index is
    expected to be datetime-like -- confirm ``read_excel`` yields that with
    the installed pandas (it may need ``index_col=0``).
    """
    # `axis` is passed by keyword: the positional form drop(label, 1) is no
    # longer accepted by current pandas.
    return pd.read_excel(path).drop(period_columns, axis=1)


_DAILY_PERIOD_COLUMNS = ['startDay', 'endDay']
_HOURLY_PERIOD_COLUMNS = ['startTime', 'endTime']

dailyElectricity = _load_energy_data(
    'Data/dailyElectricityWithFeatures.xlsx', _DAILY_PERIOD_COLUMNS)
dailyChilledWater = _load_energy_data(
    'Data/dailyChilledWaterWithFeatures.xlsx', _DAILY_PERIOD_COLUMNS)
dailySteam = _load_energy_data(
    'Data/dailySteamWithFeatures.xlsx', _DAILY_PERIOD_COLUMNS)

hourlyElectricity = _load_energy_data(
    'Data/hourlyElectricityWithFeatures.xlsx', _HOURLY_PERIOD_COLUMNS)
hourlyChilledWater = _load_energy_data(
    'Data/hourlyChilledWaterWithFeatures.xlsx', _HOURLY_PERIOD_COLUMNS)
hourlySteam = _load_energy_data(
    'Data/hourlySteamWithFeatures.xlsx', _HOURLY_PERIOD_COLUMNS)

# Preview the first rows of the daily electricity frame.
dailyElectricity.head()
2012-01-01 2800.244977 76.652174 7.173913 3.073913 1004.956522 95.260870 236.086957 4.118361 0.004796 0.086957 7.826087 0 0.0
2012-01-02 3168.974047 55.958333 5.833333 -2.937500 994.625000 87.333333 253.750000 5.914357 0.003415 0.000000 9.166667 0 0.3
2012-01-03 5194.533376 42.500000 -3.208333 -12.975000 1002.125000 95.708333 302.916667 6.250005 0.001327 0.000000 18.208333 0 0.3
2012-01-04 5354.861935 41.541667 -7.083333 -16.958333 1008.250000 98.750000 286.666667 5.127319 0.000890 0.000000 22.083333 0 0.3
2012-01-05 5496.223993 46.916667 -0.583333 -9.866667 1002.041667 90.750000 258.333333 5.162041 0.001746 0.000000 15.583333 0 0.3

每日预测

向dataframe添加新特征:星期几(weekday)、一年中的第几天(day)和一年中的第几周(week)。

def addDailyTimeFeatures(df):
    """Add calendar features taken from the frame's datetime index.

    Writes three columns into ``df`` in place -- ``weekday`` (day of the
    week), ``day`` (day of the year) and ``week`` (week of the year) -- and
    returns the same frame so the call can be used in an assignment.
    """
    idx = df.index
    df['weekday'] = idx.weekday
    df['day'] = idx.dayofyear
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar()
    # provides the same ISO week number. Older pandas without isocalendar()
    # still takes the original attribute.
    if hasattr(idx, 'isocalendar'):
        df['week'] = idx.isocalendar().week.astype('int64').values
    else:
        df['week'] = idx.weekofyear
    return df


dailyElectricity = addDailyTimeFeatures(dailyElectricity)
dailyChilledWater = addDailyTimeFeatures(dailyChilledWater)
dailySteam = addDailyTimeFeatures(dailySteam)
每日电力预测
# Model inputs: calendar features plus occupancy; target: daily electricity.
df_elect = dailyElectricity[['weekday', 'day', 'week',
                             'occupancy', 'electricity-kWh']]

# Align each split to a complete daily calendar, then drop every row that
# contains a NaN (days absent from the data or with missing values).
elect_train = df_elect.reindex(
    np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
elect_test = df_elect.reindex(
    np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

# Feature matrices carry a plain 0..n-1 index; the targets keep their dates.
XX_elect_train = elect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_elect_test = elect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

YY_elect_train = elect_train['electricity-kWh']
YY_elect_test = elect_test['electricity-kWh']

lr_elect = LinearRegression()
lr_elect.fit(XX_elect_train, YY_elect_train)

y_lr = lr_elect.predict(XX_elect_test)

print("The test score R2: ", lr_elect.score(XX_elect_test, YY_elect_test))

print("The Linear Regression coefficients are")
# zip() is lazy on Python 3, so it is materialised before building the table.
pd.DataFrame(list(zip(XX_elect_train.columns, lr_elect.coef_)),
             columns=['elect_features', 'linearRegr_Coefficients'])
The test score R2:  0.608937488563
The Linear Regression coefficients are
0 weekday -125.392163
1 day 0.550121
2 week -11.553215
3 occupancy 2830.298384
# Test period, daily electricity: observed values as black points and the
# linear-regression prediction as a green line. The x axis is the row
# position in the test set.
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(XX_elect_test.index, YY_elect_test, color='k', label='Observed')
ax.plot(XX_elect_test.index, y_lr, color='g', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted daily electricity for the linear regression.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_elect_test, YY_elect_test, c='k')
plt.scatter(YY_elect_test, y_lr, c='g')
plt.xlabel('Observed Elec. Usage (kWh)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$")
plt.title(r"Energy vs Predicted Elec.: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每日冷冻水预测
# Align each split to a complete daily calendar, then drop every row that
# contains a NaN. All remaining columns except the target are features.
chilledw_train = dailyChilledWater.reindex(
    np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
chilledw_test = dailyChilledWater.reindex(
    np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

XX_chilledw_train = chilledw_train.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)
XX_chilledw_test = chilledw_test.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)

YY_chilledw_train = chilledw_train['chilledWater-TonDays']
YY_chilledw_test = chilledw_test['chilledWater-TonDays']

lr_chilledw = LinearRegression()
lr_chilledw.fit(XX_chilledw_train, YY_chilledw_train)

print("The test score R2: ", lr_chilledw.score(XX_chilledw_test, YY_chilledw_test))

print("The Linear Regression coefficients are")
# zip() is lazy on Python 3, so it is materialised before building the table.
pd.DataFrame(list(zip(XX_chilledw_train.columns, lr_chilledw.coef_)),
             columns=['chilledw_features', 'linearRegr_Coefficients'])
The test score R2:  0.830709188732
The Linear Regression coefficients are
0 RH-% 0.464299
1 T-C 6.062113
2 Tdew-C -2.486768
3 pressure-mbar -0.095268
4 solarRadiation-W/m2 0.042885
5 windDirection -0.025036
6 windSpeed-m/s -1.166902
7 humidityRatio-kg/kg 1673.166705
8 coolingDegrees 2.853128
9 heatingDegrees 4.421394
10 dehumidification 2999.125771
11 occupancy 0.571356
12 weekday -2.461900
13 day -0.010718
14 week 0.122757
# Test period, daily chilled water: observed values as black points and the
# linear-regression prediction as a red line.
y_lr = lr_chilledw.predict(XX_chilledw_test)
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(XX_chilledw_test.index, YY_chilledw_test, color='k', label='Observed')
ax.plot(XX_chilledw_test.index, y_lr, color='r', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted daily chilled water for the linear regression.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_chilledw_test, YY_chilledw_test, c='k')
plt.scatter(YY_chilledw_test, y_lr, c='r')
plt.xlabel('Observed Chilled Water Usage (TonDays)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Chilled Water Usage (TonDays): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Chilled Water: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每日蒸汽预测
# Align each split to a complete daily calendar, then drop every row that
# contains a NaN. All remaining columns except the target are features.
steam_train = dailySteam.reindex(
    np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
steam_test = dailySteam.reindex(
    np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

XX_steam_train = steam_train.drop('steam-LBS', axis=1).reset_index(drop=True)
XX_steam_test = steam_test.drop('steam-LBS', axis=1).reset_index(drop=True)

YY_steam_train = steam_train['steam-LBS']
YY_steam_test = steam_test['steam-LBS']

lr_steam = LinearRegression()
lr_steam.fit(XX_steam_train, YY_steam_train)

print("The test score R2: ", lr_steam.score(XX_steam_test, YY_steam_test))

print("The Linear Regression coefficients are")
# zip() is lazy on Python 3, so it is materialised before building the table.
pd.DataFrame(list(zip(XX_steam_train.columns, lr_steam.coef_)),
             columns=['steam_features', 'linearRegr_Coefficients'])
The test score R2:  0.942276415896
The Linear Regression coefficients are
0 RH-% 66.535470
1 T-C 458.096751
2 Tdew-C -951.521615
3 pressure-mbar -30.891470
4 solarRadiation-W/m2 -18.446292
5 windDirection -7.828922
6 windSpeed-m/s 251.824413
7 humidityRatio-kg/kg 857001.445663
8 coolingDegrees -99.989152
9 heatingDegrees 1794.351286
10 dehumidification -482120.622688
11 occupancy 3150.501909
12 weekday -531.583401
13 day -1.499061
14 week -43.000664
# Test period, daily steam: observed values as black points and the
# linear-regression prediction as a green line.
y_lr = lr_steam.predict(XX_steam_test)
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(XX_steam_test.index, YY_steam_test, color='k', label='Observed')
ax.plot(XX_steam_test.index, y_lr, color='g', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted daily steam for the linear regression.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_steam_test, YY_steam_test, c='k')
plt.scatter(YY_steam_test, y_lr, c='g')
plt.xlabel('Observed Steam Usage (LBS)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Steam Usage (LBS): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Steam: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每小时预测

向dataframe添加新特征:小时(hour)、星期几(weekday)、一年中的第几天(day)和一年中的第几周(week)。

def addHourlyTimeFeatures(df):
    """Add clock and calendar features taken from the frame's datetime index.

    Writes four columns into ``df`` in place -- ``hour``, ``weekday`` (day of
    the week), ``day`` (day of the year) and ``week`` (week of the year) --
    and returns the same frame so the call can be used in an assignment.
    """
    idx = df.index
    df['hour'] = idx.hour
    df['weekday'] = idx.weekday
    df['day'] = idx.dayofyear
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar()
    # provides the same ISO week number. Older pandas without isocalendar()
    # still takes the original attribute.
    if hasattr(idx, 'isocalendar'):
        df['week'] = idx.isocalendar().week.astype('int64').values
    else:
        df['week'] = idx.weekofyear
    return df


hourlyElectricity = addHourlyTimeFeatures(hourlyElectricity)
每小时电力预测
# Model inputs: clock/calendar features, cosHour and occupancy;
# target: hourly electricity.
df_hourlyelect = hourlyElectricity[['hour', 'weekday', 'day', 'week', 'cosHour',
                                    'occupancy', 'electricity-kWh']]

# Align each split to a complete hourly calendar, then drop every row that
# contains a NaN (hours absent from the data or with missing values).
hourlyelect_train = df_hourlyelect.reindex(
    np.arange('2014-01-01 00:00:00', '2014-10-01 00:00:00',
              dtype='datetime64[h]')).dropna()
hourlyelect_test = df_hourlyelect.reindex(
    np.arange('2014-10-01 00:00:00', '2014-11-01 00:00:00',
              dtype='datetime64[h]')).dropna()

XX_hourlyelect_train = hourlyelect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_hourlyelect_test = hourlyelect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

YY_hourlyelect_train = hourlyelect_train['electricity-kWh']
YY_hourlyelect_test = hourlyelect_test['electricity-kWh']

lr_hourlyelect = LinearRegression()
lr_hourlyelect.fit(XX_hourlyelect_train, YY_hourlyelect_train)

y_hourlyelect_lr = lr_hourlyelect.predict(XX_hourlyelect_test)

print("The test score R2: ", lr_hourlyelect.score(XX_hourlyelect_test, YY_hourlyelect_test))

print("The Linear Regression coefficients are")
# zip() is lazy on Python 3, so it is materialised before building the table.
pd.DataFrame(list(zip(XX_hourlyelect_train.columns, lr_hourlyelect.coef_)),
             columns=['hourlyelect_features', 'linearRegr_Coefficients'])
The test score R2:  0.714713369958
The Linear Regression coefficients are
0 hour -0.287362
1 weekday -6.995868
2 day -0.309981
3 week 0.955127
4 cosHour -81.049080
5 occupancy 114.803110
# Test period, hourly electricity: observed values as black points and the
# linear-regression prediction as a red line.
fig, ax = plt.subplots(figsize=(25, 7))
ax.scatter(XX_hourlyelect_test.index, YY_hourlyelect_test, color='k', label='Observed')
ax.plot(XX_hourlyelect_test.index, y_hourlyelect_lr, color='r', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted hourly electricity for the linear regression.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_hourlyelect_test, YY_hourlyelect_test, c='k')
plt.scatter(YY_hourlyelect_test, y_hourlyelect_lr, c='r')
plt.xlabel('Observed Elec. Usage (kWh)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$")
plt.title(r"Energy vs Predicted Elec.: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每小时冷冻水预测
# Align each split to a complete hourly calendar, then drop every row that
# contains a NaN. All remaining columns except the target are features.
hourlychilledw_train = hourlyChilledWater.reindex(
    np.arange('2014-01-01 00:00:00', '2014-09-01 00:00:00',
              dtype='datetime64[h]')).dropna()
hourlychilledw_test = hourlyChilledWater.reindex(
    np.arange('2014-09-01 00:00:00', '2014-11-01 00:00:00',
              dtype='datetime64[h]')).dropna()

XX_hourlychilledw_train = hourlychilledw_train.drop(
    'chilledWater-TonDays', axis=1).reset_index(drop=True)
XX_hourlychilledw_test = hourlychilledw_test.drop(
    'chilledWater-TonDays', axis=1).reset_index(drop=True)

YY_hourlychilledw_train = hourlychilledw_train['chilledWater-TonDays']
YY_hourlychilledw_test = hourlychilledw_test['chilledWater-TonDays']

lr_hourlychilledw = LinearRegression()
lr_hourlychilledw.fit(XX_hourlychilledw_train, YY_hourlychilledw_train)

y_hourlychilledw_lr = lr_hourlychilledw.predict(XX_hourlychilledw_test)

print("The test score R2: ",
      lr_hourlychilledw.score(XX_hourlychilledw_test, YY_hourlychilledw_test))

print("The Linear Regression coefficients are")
# zip() is lazy on Python 3, so it is materialised before building the table.
pd.DataFrame(list(zip(XX_hourlychilledw_train.columns, lr_hourlychilledw.coef_)),
             columns=['hourlychilledw_features', 'linearRegr_Coefficients'])
The test score R2:  0.709930521875
The Linear Regression coefficients are
0 RH-% -0.028198
1 T-C 0.459533
2 Tdew-C 0.166999
3 pressure-mbar -0.007099
4 solarRadiation-W/m2 0.001003
5 windDirection -0.000382
6 windSpeed-m/s 0.004837
7 humidityRatio-kg/kg -91.425425
8 coolingDegrees -0.172407
9 heatingDegrees 0.603195
10 dehumidification 226.397306
11 occupancy 0.483000
12 cosHour -0.562715
# Test period, hourly chilled water: observed values as black points and the
# linear-regression prediction as a green line.
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(XX_hourlychilledw_test.index, YY_hourlychilledw_test,
           color='k', label='Observed')
ax.plot(XX_hourlychilledw_test.index, y_hourlychilledw_lr,
        color='g', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted hourly chilled water for the linear regression.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_hourlychilledw_test, YY_hourlychilledw_test, c='k')
plt.scatter(YY_hourlychilledw_test, y_hourlychilledw_lr, c='g')
plt.xlabel('Observed Chilled Water Usage (TonDays)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Chilled Water Usage (TonDays): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Chilled Water: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每小时蒸汽预测
# Align each split to a complete hourly calendar, then drop every row that
# contains a NaN. All remaining columns except the target are features.
hourlysteam_train = hourlySteam.reindex(
    np.arange('2012-01-01 00:00:00', '2014-02-01 00:00:00',
              dtype='datetime64[h]')).dropna()
hourlysteam_test = hourlySteam.reindex(
    np.arange('2014-02-01 00:00:00', '2014-11-01 00:00:00',
              dtype='datetime64[h]')).dropna()

XX_hourlysteam_train = hourlysteam_train.drop('steam-LBS', axis=1).reset_index(drop=True)
XX_hourlysteam_test = hourlysteam_test.drop('steam-LBS', axis=1).reset_index(drop=True)

YY_hourlysteam_train = hourlysteam_train['steam-LBS']
YY_hourlysteam_test = hourlysteam_test['steam-LBS']

lr_hourlysteam = LinearRegression()
lr_hourlysteam.fit(XX_hourlysteam_train, YY_hourlysteam_train)

y_hourlysteam_lr = lr_hourlysteam.predict(XX_hourlysteam_test)

print("The test score R2: ",
      lr_hourlysteam.score(XX_hourlysteam_test, YY_hourlysteam_test))

print("The coefficients Linear Regression are")
# zip() is lazy on Python 3, so it is materialised before building the table.
pd.DataFrame(list(zip(XX_hourlysteam_train.columns, lr_hourlysteam.coef_)),
             columns=['hourlysteam_features', 'linearRegr_Coefficients'])
The test score R2:  0.764295430491
The coefficients Linear Regression are
0 RH-% 5.367666
1 T-C 8.577206
2 Tdew-C -54.743326
3 pressure-mbar -0.279591
4 solarRadiation-W/m2 0.138138
5 windDirection 0.041451
6 windSpeed-m/s 13.943372
7 humidityRatio-kg/kg 75847.104324
8 coolingDegrees -31.597421
9 heatingDegrees 57.903822
10 dehumidification -8088.312347
11 occupancy 131.534596
12 cosHour -343.896782
# Test period, hourly steam: observed values as black points and the
# linear-regression prediction as a red line.
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(XX_hourlysteam_test.index, YY_hourlysteam_test,
           color='k', label='Observed')
ax.plot(XX_hourlysteam_test.index, y_hourlysteam_lr, color='r', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted hourly steam for the linear regression.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_hourlysteam_test, YY_hourlysteam_test, c='k')
plt.scatter(YY_hourlysteam_test, y_hourlysteam_lr, c='r')
plt.xlabel('Observed Steam Usage (LBS)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Steam Usage (LBS): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Steam: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

支持向量回归和交叉验证

在本笔记中,我们将训练一个支持向量回归(SVR)模型,根据历史能耗数据、若干天气变量、一天中的小时、星期几、周末和节假日来预测能源消耗。

为此,我们用2012年01月01日至2014年10月31日的每日和每小时能源与天气数据来拟合模型,并计算预测的均方误差(残差平方的平均值)。

在设计阶段,我们使用交叉验证来微调SVR参数。由于SVR的计算耗时很长,在最终版本的笔记中,直接把参数设置为交叉验证找到的最优值,但仍会列出作为交叉验证输入的参数范围。

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR

# GridSearchCV lives in sklearn.model_selection in current scikit-learn;
# the legacy sklearn.grid_search module is only a fallback for old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# These two legacy modules no longer exist in current scikit-learn. Nothing
# in this notebook uses them, so a missing module is tolerated.
try:
    from sklearn import cross_validation
    from sklearn import grid_search
except ImportError:
    pass

# pandas removed the `display.mpl_style` option; the documented replacement
# is matplotlib's own "ggplot" style sheet.
plt.style.use('ggplot')


def _load_energy_data(path, period_columns):
    """Read one feature workbook and drop its period-boundary columns.

    NOTE(review): later cells read ``df.index.weekday``, so the index is
    expected to be datetime-like -- confirm ``read_excel`` yields that with
    the installed pandas (it may need ``index_col=0``).
    """
    # `axis` is passed by keyword: the positional form drop(label, 1) is no
    # longer accepted by current pandas.
    return pd.read_excel(path).drop(period_columns, axis=1)


_DAILY_PERIOD_COLUMNS = ['startDay', 'endDay']
_HOURLY_PERIOD_COLUMNS = ['startTime', 'endTime']

dailyElectricity = _load_energy_data(
    'Data/dailyElectricityWithFeatures.xlsx', _DAILY_PERIOD_COLUMNS)
dailyChilledWater = _load_energy_data(
    'Data/dailyChilledWaterWithFeatures.xlsx', _DAILY_PERIOD_COLUMNS)
dailySteam = _load_energy_data(
    'Data/dailySteamWithFeatures.xlsx', _DAILY_PERIOD_COLUMNS)

hourlyElectricity = _load_energy_data(
    'Data/hourlyElectricityWithFeatures.xlsx', _HOURLY_PERIOD_COLUMNS)
hourlyChilledWater = _load_energy_data(
    'Data/hourlyChilledWaterWithFeatures.xlsx', _HOURLY_PERIOD_COLUMNS)
hourlySteam = _load_energy_data(
    'Data/hourlySteamWithFeatures.xlsx', _HOURLY_PERIOD_COLUMNS)

# Preview the first rows of the daily electricity frame.
dailyElectricity.head()
2012-01-01 2800.244977 76.652174 7.173913 3.073913 1004.956522 95.260870 236.086957 4.118361 0.004796 0.086957 7.826087 0 0.0
2012-01-02 3168.974047 55.958333 5.833333 -2.937500 994.625000 87.333333 253.750000 5.914357 0.003415 0.000000 9.166667 0 0.3
2012-01-03 5194.533376 42.500000 -3.208333 -12.975000 1002.125000 95.708333 302.916667 6.250005 0.001327 0.000000 18.208333 0 0.3
2012-01-04 5354.861935 41.541667 -7.083333 -16.958333 1008.250000 98.750000 286.666667 5.127319 0.000890 0.000000 22.083333 0 0.3
2012-01-05 5496.223993 46.916667 -0.583333 -9.866667 1002.041667 90.750000 258.333333 5.162041 0.001746 0.000000 15.583333 0 0.3

每日预测

向dataframe添加新特征:星期几(weekday)、一年中的第几天(day)和一年中的第几周(week)。

def addDailyTimeFeatures(df):
    """Add calendar features taken from the frame's datetime index.

    Writes three columns into ``df`` in place -- ``weekday`` (day of the
    week), ``day`` (day of the year) and ``week`` (week of the year) -- and
    returns the same frame so the call can be used in an assignment.
    """
    idx = df.index
    df['weekday'] = idx.weekday
    df['day'] = idx.dayofyear
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar()
    # provides the same ISO week number. Older pandas without isocalendar()
    # still takes the original attribute.
    if hasattr(idx, 'isocalendar'):
        df['week'] = idx.isocalendar().week.astype('int64').values
    else:
        df['week'] = idx.weekofyear
    return df


dailyElectricity = addDailyTimeFeatures(dailyElectricity)
dailyChilledWater = addDailyTimeFeatures(dailyChilledWater)
dailySteam = addDailyTimeFeatures(dailySteam)
每日电力预测
# Model inputs: calendar features plus occupancy; target: daily electricity.
df_elect = dailyElectricity[['weekday', 'day', 'week',
                             'occupancy', 'electricity-kWh']]

# Align each split to a complete daily calendar, then drop every row that
# contains a NaN (days absent from the data or with missing values).
elect_train = df_elect.reindex(
    np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
elect_test = df_elect.reindex(
    np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

XX_elect_train = elect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_elect_test = elect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

YY_elect_train = elect_train['electricity-kWh']
YY_elect_test = elect_test['electricity-kWh']

# Parameter ranges that were fed to cross-validation.
gamma_range = [0.01, 0.001, 0.0001]
epsilon_range = [x * 0.1 for x in range(0, 2)]
C_range = list(range(1, 2500, 500))

# To save time, C and gamma were tuned first and are now fixed on the
# estimator; only kernel and epsilon remain in the searched grid.
tuned_parameters = [{
    'kernel': ['rbf', 'linear'],
    # 'C': C_range,
    # 'gamma': gamma_range,
    'epsilon': epsilon_range}]

# Cross-validated search for the best remaining parameters. cv=3 pins the
# 3-fold default this notebook was written against (scikit-learn 0.22
# changed the default to 5-fold).
svr_elect = GridSearchCV(SVR(C=2000, gamma=0.01),
                         param_grid=tuned_parameters, cv=3, verbose=0)

# Fit on the training split and predict the test split.
y_elect = svr_elect.fit(XX_elect_train, YY_elect_train).predict(XX_elect_test)

print('Optimum parameters C=2000 and gamma=0.01 for SVR')
print('Optimum parameters epsilon and kernel for SVR: ', svr_elect.best_params_)

print("The test score R2 for SVR: ", svr_elect.score(XX_elect_test, YY_elect_test))

# y_elect already holds the test predictions, so they are not recomputed.
print("SVR mean squared error: %.2f"
      % np.mean((YY_elect_test - y_elect) ** 2))
Optimum parameters C=2000 and gamma=0.01 for SVR
Optimum parameters epsilon and kernel for SVR: {'epsilon': 0.1, 'kernel': 'rbf'}
The test score R2 for SVR: 0.691753544427
SVR mean squared error: 391582.21
# Test period, daily electricity: observed values as black points and the
# SVR prediction as a red line.
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(XX_elect_test.index, YY_elect_test, c='k', label='Observed')
ax.plot(XX_elect_test.index, y_elect, c='r', label='Predicted')
ax.set_xlabel('data')
ax.set_ylabel('target')
ax.set_title('Support Vector Regression')
ax.legend()
plt.show()

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted daily electricity for the SVR model.
# The black points plot the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.scatter(YY_elect_test, YY_elect_test, c='k')
plt.scatter(YY_elect_test, y_elect, c='r')
plt.xlabel('Observed Elec. Usage (kWh)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$")
plt.title(r"Energy vs Predicted Energy: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每日冷冻水预测
# Align each split to a complete daily calendar, then drop every row that
# contains a NaN. All remaining columns except the target are features.
chilledw_train = dailyChilledWater.reindex(
    np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
chilledw_test = dailyChilledWater.reindex(
    np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

XX_chilledw_train = chilledw_train.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)
XX_chilledw_test = chilledw_test.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)

YY_chilledw_train = chilledw_train['chilledWater-TonDays']
YY_chilledw_test = chilledw_test['chilledWater-TonDays']

# Parameter ranges that were fed to cross-validation.
gamma_range = [0.1, 0.01, 0.001, 0.0001]
epsilon_range = [x * 0.1 for x in range(0, 3)]
C_range = list(range(1, 5, 2))

# C and gamma are fixed on the estimator; only kernel and epsilon are searched.
tuned_parameters = [{
    'kernel': ['rbf', 'linear'],
    # 'C': C_range,
    # 'gamma': gamma_range,
    'epsilon': epsilon_range}]

# The fixed values are named once so the estimator and the report below
# cannot disagree (the message used to say gamma=0.1 while the estimator
# was built with gamma=0.0001).
chilledw_C = 3
chilledw_gamma = 0.0001

# cv=3 pins the 3-fold default this notebook was written against
# (scikit-learn 0.22 changed the default to 5-fold).
svr_chilledw = GridSearchCV(SVR(C=chilledw_C, gamma=chilledw_gamma),
                            param_grid=tuned_parameters, cv=3, verbose=0)

# Fit on the training split and predict the test split.
y_chilledw = svr_chilledw.fit(XX_chilledw_train, YY_chilledw_train).predict(XX_chilledw_test)

print('Optimum parameters C=%s and gamma=%s for SVR' % (chilledw_C, chilledw_gamma))
print('Optimum epsilon and kernel: ', svr_chilledw.best_params_)

print("The test score R2 for SVR: ", svr_chilledw.score(XX_chilledw_test, YY_chilledw_test))

# y_chilledw already holds the test predictions, so they are not recomputed.
print("SVR mean squared error: %.2f"
      % np.mean((YY_chilledw_test - y_chilledw) ** 2))
Optimum parameters C=3 and gamma=0.1 for SVR
Optimum epsilon and kernel: {'epsilon': 0.1, 'kernel': 'linear'}
The test score R2 for SVR: 0.764904375769
SVR mean squared error: 443.92
# Test period, daily chilled water: observed values as black points and the
# SVR prediction as a green line.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(XX_chilledw_test.index, YY_chilledw_test, c='k', label='Observed')
ax.plot(XX_chilledw_test.index, y_chilledw, c='g', label='Predicted')
ax.set_xlabel('data')
ax.set_ylabel('target')
ax.set_title('Support Vector Regression')
ax.legend()
plt.show()

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted daily chilled water for the SVR model.
# The black points plot the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.scatter(YY_chilledw_test, YY_chilledw_test, c='k')
plt.scatter(YY_chilledw_test, y_chilledw, c='g')
plt.xlabel('Observed Chilled Water Usage (TonDays)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Chilled Water Usage (TonDays): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Energy: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每日蒸汽预测
# Align each split to a complete daily calendar, then drop every row that
# contains a NaN. All remaining columns except the target are features.
steam_train = dailySteam.reindex(
    np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
steam_test = dailySteam.reindex(
    np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

XX_steam_train = steam_train.drop('steam-LBS', axis=1).reset_index(drop=True)
XX_steam_test = steam_test.drop('steam-LBS', axis=1).reset_index(drop=True)

YY_steam_train = steam_train['steam-LBS']
YY_steam_test = steam_test['steam-LBS']

# Parameter ranges that were fed to cross-validation.
gamma_range = [0.1, 0.01, 0.001, 0.0001]
epsilon_range = [x * 0.1 for x in range(0, 3)]
C_range = list(range(1, 500, 50))

# C is fixed on the estimator; kernel, gamma and epsilon are searched.
tuned_parameters = [{
    'kernel': ['rbf', 'linear'],
    # 'C': C_range,
    'gamma': gamma_range,
    'epsilon': epsilon_range}]

# cv=3 pins the 3-fold default this notebook was written against
# (scikit-learn 0.22 changed the default to 5-fold).
svr_steam = GridSearchCV(SVR(C=50), param_grid=tuned_parameters, cv=3, verbose=0)

# Fit on the training split and predict the test split.
y_steam = svr_steam.fit(XX_steam_train, YY_steam_train).predict(XX_steam_test)

print('Optimum parameters C=50 for SVR')
print('Optimum epsilon, gamma and kernel: ', svr_steam.best_params_)

print("The test score R2 for SVR: ", svr_steam.score(XX_steam_test, YY_steam_test))

# y_steam already holds the test predictions, so they are not recomputed.
print("SVR mean squared error: %.2f"
      % np.mean((YY_steam_test - y_steam) ** 2))
Optimum parameters C=50 for SVR
Optimum epsilon, gamma and kernel: {'epsilon': 0.2, 'gamma': 0.1, 'kernel': 'linear'}
The test score R2 for SVR: 0.938467924325
SVR mean squared error: 20632451.77
# Test period, daily steam: observed values as black points and the SVR
# prediction as a red line.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.scatter(XX_steam_test.index, YY_steam_test, c='k', label='Observed')
ax.plot(XX_steam_test.index, y_steam, c='r', label='Predicted')
ax.set_xlabel('data')
ax.set_ylabel('target')
ax.set_title('Support Vector Regression')
ax.legend()
plt.show()

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted daily steam for the SVR model.
# The black points plot the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.scatter(YY_steam_test, YY_steam_test, c='k')
plt.scatter(YY_steam_test, y_steam, c='r')
plt.xlabel('Observed Steam Usage (LBS)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Steam Usage (LBS): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Energy: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

每小时预测

每小时电力预测

向dataframe添加新特征:小时(hour)、星期几(weekday)、一年中的第几天(day)和一年中的第几周(week)。

def addHourlyTimeFeatures(df):
    """Add clock and calendar features taken from the frame's datetime index.

    Writes four columns into ``df`` in place -- ``hour``, ``weekday`` (day of
    the week), ``day`` (day of the year) and ``week`` (week of the year) --
    and returns the same frame so the call can be used in an assignment.
    """
    idx = df.index
    df['hour'] = idx.hour
    df['weekday'] = idx.weekday
    df['day'] = idx.dayofyear
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar()
    # provides the same ISO week number. Older pandas without isocalendar()
    # still takes the original attribute.
    if hasattr(idx, 'isocalendar'):
        df['week'] = idx.isocalendar().week.astype('int64').values
    else:
        df['week'] = idx.weekofyear
    return df


hourlyElectricity = addHourlyTimeFeatures(hourlyElectricity)
# Model inputs: clock/calendar features, cosHour and occupancy;
# target: hourly electricity.
df_hourlyelect = hourlyElectricity[['hour', 'weekday', 'day', 'week',
                                    'cosHour', 'occupancy', 'electricity-kWh']]

# Align each split to a complete hourly calendar, then drop every row that
# contains a NaN (hours absent from the data or with missing values).
hourlyelect_train = df_hourlyelect.reindex(
    np.arange('2014-01-01 00:00:00', '2014-10-01 00:00:00',
              dtype='datetime64[h]')).dropna()
hourlyelect_test = df_hourlyelect.reindex(
    np.arange('2014-10-01 00:00:00', '2014-11-01 00:00:00',
              dtype='datetime64[h]')).dropna()

XX_hourlyelect_train = hourlyelect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_hourlyelect_test = hourlyelect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

YY_hourlyelect_train = hourlyelect_train['electricity-kWh']
YY_hourlyelect_test = hourlyelect_test['electricity-kWh']

# Parameter ranges that were fed to cross-validation.
gamma_range = [0.01, 0.001, 0.0001]
epsilon_range = [x * 0.1 for x in range(0, 2)]
C_range = list(range(1, 5, 1))

# C and gamma are fixed on the estimator; only kernel and epsilon are searched.
tuned_parameters = [{
    'kernel': ['rbf', 'linear'],
    # 'C': C_range,
    # 'gamma': gamma_range,
    'epsilon': epsilon_range}]

# cv=3 pins the 3-fold default this notebook was written against
# (scikit-learn 0.22 changed the default to 5-fold).
svr_hourlyelect = GridSearchCV(SVR(C=1, gamma=0.01),
                               param_grid=tuned_parameters, cv=3, verbose=0)

# Fit on the training split and predict the test split.
y_hourlyelect = svr_hourlyelect.fit(
    XX_hourlyelect_train, YY_hourlyelect_train).predict(XX_hourlyelect_test)

print('Optimum parameters C=1 and gamma=0.01 for SVR')
print('Optimum epsilon and kernel for SVR: ', svr_hourlyelect.best_params_)

print("The test score R2: ", svr_hourlyelect.score(XX_hourlyelect_test, YY_hourlyelect_test))

# y_hourlyelect already holds the test predictions, so they are not recomputed.
print("SVR mean squared error: %.2f"
      % np.mean((YY_hourlyelect_test - y_hourlyelect) ** 2))
Optimum parameters C=1 and gamma=0.01 for SVR
Optimum epsilon and kernel for SVR: {'epsilon': 0.1, 'kernel': 'linear'}
The test score R2: 0.747282383852
SVR mean squared error: 1561.24
# Test period, hourly electricity: observed values as black points and the
# SVR prediction as a green line.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(XX_hourlyelect_test.index, YY_hourlyelect_test,
           color='k', label='Observed')
ax.plot(XX_hourlyelect_test.index, y_hourlyelect, color='g', label='Predicted')
ax.legend(loc='upper right')

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

# Observed vs. predicted hourly electricity for the SVR model.
# The black line plots the observations against themselves (y = x reference).
fig = plt.figure(figsize=(6, 6))
plt.plot(YY_hourlyelect_test, YY_hourlyelect_test, c='k')
plt.scatter(YY_hourlyelect_test, y_hourlyelect, c='g')
plt.xlabel('Observed Elec. Usage (kWh)')
# Raw strings: "\h" is not a valid Python escape sequence, so the mathtext
# backslash must not go through escape processing.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Elec.: $Y_i$ vs $\hat{Y}_i$")

翻译——2_Linear Regression and Support Vector Regression-LMLPHP

下篇3_Gaussian Process Regression

05-14 18:13