Working through the time series forecasting YouTube video (Time Series Forecasting with XGBoost - Use python and machine learning to predict energy consumption): using machine learning to predict energy consumption data.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xgboost as xgb
import io
from google.colab import files
from sklearn.metrics import mean_squared_error
uploaded = files.upload()
Saving AEP_hourly.csv to AEP_hourly.csv
df = pd.read_csv(io.BytesIO(uploaded['AEP_hourly.csv']))
df['Datetime'] = pd.to_datetime(df['Datetime'])
df.sort_values('Datetime', inplace=True) #The data is not sorted in the csv file (discovered during later analysis). This will clean it up
df = df.set_index('Datetime')
df.head(10)
| Datetime | AEP_MW |
|---|---|
| 2004-10-01 01:00:00 | 12379.0 |
| 2004-10-01 02:00:00 | 11935.0 |
| 2004-10-01 03:00:00 | 11692.0 |
| 2004-10-01 04:00:00 | 11597.0 |
| 2004-10-01 05:00:00 | 11681.0 |
| 2004-10-01 06:00:00 | 12280.0 |
| 2004-10-01 07:00:00 | 13692.0 |
| 2004-10-01 08:00:00 | 14618.0 |
| 2004-10-01 09:00:00 | 14903.0 |
| 2004-10-01 10:00:00 | 15118.0 |
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')
df.plot(style='.',
figsize=(15, 5),
color=color_pal[0],
title='AEP Energy Use in MW')
plt.show()
Train/Test Split
train = df.loc[df.index < '01-01-2015']
test = df.loc[df.index >= '01-01-2015']
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test.plot(ax=ax, label='Test Set')
ax.axvline('01-01-2015', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
df.loc[(df.index > '01-01-2010') & (df.index < '01-08-2010')] \
.plot(figsize=(15, 5), title='Week Of Data')
plt.show()
Feature Creation
def create_features(df):
"""
Create time series features based on time series index.
"""
df = df.copy()
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['quarter'] = df.index.quarter
df['month'] = df.index.month
df['year'] = df.index.year
df['dayofyear'] = df.index.dayofyear
df['dayofmonth'] = df.index.day
df['weekofyear'] = df.index.isocalendar().week
return df
df = create_features(df)
df.head()
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear |
|---|---|---|---|---|---|---|---|---|---|
| 2004-10-01 01:00:00 | 12379.0 | 1 | 4 | 4 | 10 | 2004 | 275 | 1 | 40 |
| 2004-10-01 02:00:00 | 11935.0 | 2 | 4 | 4 | 10 | 2004 | 275 | 1 | 40 |
| 2004-10-01 03:00:00 | 11692.0 | 3 | 4 | 4 | 10 | 2004 | 275 | 1 | 40 |
| 2004-10-01 04:00:00 | 11597.0 | 4 | 4 | 4 | 10 | 2004 | 275 | 1 | 40 |
| 2004-10-01 05:00:00 | 11681.0 | 5 | 4 | 4 | 10 | 2004 | 275 | 1 | 40 |
Visualize Features
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='hour', y='AEP_MW')
ax.set_title('MW by Hour')
plt.show()
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='month', y='AEP_MW', hue='month', palette='Blues', legend=False)
ax.set_title('MW by Month')
plt.show()
Create Model
train = create_features(train)
test = create_features(test)
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
TARGET = 'AEP_MW'
X_train = train[FEATURES]
y_train = train[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]
reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50, learning_rate = 0.01)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
verbose=100)
[0]   validation_0-rmse:2557.05852  validation_1-rmse:2669.66874
[100] validation_0-rmse:1667.99304  validation_1-rmse:1906.43321
[200] validation_0-rmse:1388.41609  validation_1-rmse:1728.03964
[300] validation_0-rmse:1260.06288  validation_1-rmse:1670.94367
[400] validation_0-rmse:1170.54347  validation_1-rmse:1651.32523
[500] validation_0-rmse:1113.48343  validation_1-rmse:1643.19503
[545] validation_0-rmse:1091.20163  validation_1-rmse:1646.54135
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=50,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.01, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=1000,
n_jobs=None, num_parallel_tree=None, ...)
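Training stopped early around round 545 because the validation RMSE had stopped improving for 50 rounds. To confirm which boosting round was actually kept, the sklearn wrapper records it when early stopping is used (a quick check, assuming a reasonably recent xgboost version that sets these attributes):
# Best boosting round found by early stopping and its validation RMSE
print(reg.best_iteration, reg.best_score)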
Feature Importance
fi = pd.DataFrame(reg.feature_importances_,
index = reg.feature_names_in_,
columns=['importance']).sort_values('importance')
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()
As expected, the most important features are hour, month, and day of week. Day of year and quarter are probably redundant variables that could be inferred from the first three.
Forecast on Test
test['prediction'] = reg.predict(X_test)
test.head()
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | prediction |
|---|---|---|---|---|---|---|---|---|---|---|
| 2015-01-01 00:00:00 | 16375.0 | 0 | 3 | 1 | 1 | 2015 | 1 | 1 | 1 | 16409.544922 |
| 2015-01-01 01:00:00 | 16172.0 | 1 | 3 | 1 | 1 | 2015 | 1 | 1 | 1 | 15945.625000 |
| 2015-01-01 02:00:00 | 15968.0 | 2 | 3 | 1 | 1 | 2015 | 1 | 1 | 1 | 15893.853516 |
| 2015-01-01 03:00:00 | 15749.0 | 3 | 3 | 1 | 1 | 2015 | 1 | 1 | 1 | 15935.795898 |
| 2015-01-01 04:00:00 | 15727.0 | 4 | 3 | 1 | 1 | 2015 | 1 | 1 | 1 | 15935.795898 |
df = df.merge(test[['prediction']], how='left', left_index=True, right_index=True)
df.tail()
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | prediction |
|---|---|---|---|---|---|---|---|---|---|---|
| 2018-08-02 20:00:00 | 17673.0 | 20 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 18112.775391 |
| 2018-08-02 21:00:00 | 17303.0 | 21 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 18112.775391 |
| 2018-08-02 22:00:00 | 17001.0 | 22 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 17780.941406 |
| 2018-08-02 23:00:00 | 15964.0 | 23 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 16259.825195 |
| 2018-08-03 00:00:00 | 14809.0 | 0 | 4 | 3 | 8 | 2018 | 215 | 3 | 31 | 14421.719727 |
ax = df[['AEP_MW']].plot(figsize=(15, 5))
df['prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Data and Prediction')
plt.show()
ax = df.loc[(df.index > '04-01-2018') & (df.index < '04-08-2018')]['AEP_MW'] \
.plot(figsize=(15, 5), title='Week Of Data')
df.loc[(df.index > '04-01-2018') & (df.index < '04-08-2018')]['prediction'] \
.plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()
score = np.sqrt(mean_squared_error(test['AEP_MW'], test['prediction']))
print(f'RMSE Score on Test set: {score:0.2f}')
RMSE Score on Test set: 1643.09
Aggregate the energy usage error by date and report the most inaccurate dates.
test['error'] = np.abs(test[TARGET] - test['prediction'])
test['date'] = test.index.date
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(10)
| date | error |
|---|---|
| 2017-01-22 | 4872.108968 |
| 2015-02-20 | 4861.401571 |
| 2015-02-19 | 4769.359985 |
| 2018-01-06 | 4403.549683 |
| 2017-02-07 | 4290.881022 |
| 2018-01-07 | 4202.593099 |
| 2017-01-03 | 4168.695475 |
| 2016-02-03 | 4092.213298 |
| 2017-01-23 | 4076.364746 |
| 2017-02-12 | 4060.571086 |
At this point, I will begin exploring on my own. All of the worst-performing dates are in January and February. Let's look at January and February of 2017, which contains five of the top ten.
ax = df.loc[(df.index > '01-01-2017') & (df.index < '02-28-2017')]['AEP_MW'] \
.plot(figsize=(15, 5), title='Jan-Feb 2017')
df.loc[(df.index > '01-01-2017') & (df.index < '02-28-2017')]['prediction'] \
.plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()
Hmm, it looks like most of the errors are overpredictions (predicting higher energy usage than actually occurred). Seeing that AEP is the electric utility for Ohio, I would hypothesize that a significant amount of the energy usage in January and February is related to heating homes. If there is unseasonably warm weather in January and February, we could see a reduction in energy usage that would not be accurately predicted. I seem to remember (I was living in Chicago at the time) that the 2017 winter was relatively mild compared to other years.
Looking at the weather data (weather.gov) from January 2017 for Columbus, OH, the average high in January 2017 was 7 degrees greater than normal, and the average low was 8 degrees greater than normal. Diving into the data further, we can see that on January 22, 2017 (the highest-error date), the daily high was 64 deg F, 27 deg above normal. This seems to confirm my hypothesis that the error is due to abnormally warm winter weather.
In terms of improving the prediction, the easiest addition would be a variable incorporating weather data. Based on the exploration above, a variable tracking the difference between the day's high and the normal high for that date would supply the most information.
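As a rough sketch of what that feature could look like (not implemented in the video; `wx`, `tmax`, and `normal_high` are hypothetical names for a daily weather table, its observed high, and the climatological normal high for that calendar day):
# Hedged sketch: deviation of the daily high from the normal high, joined onto the hourly data
wx['tmax_dev_from_normal'] = wx['tmax'] - wx['normal_high']
df['date'] = df.index.normalize()
df = df.merge(wx[['tmax_dev_from_normal']], left_on='date', right_index=True, how='left')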
Part 2 Follow-up
There was a follow-up YouTube video where some of the most-requested suggestions from the comments were implemented. I shall do the same: Time Series Forecasting with XGBoost - Advanced Methods, https://www.youtube.com/watch?v=z3ZnOW-S550&list=WL&index=13
1) Outlier Removal
df['AEP_MW'].plot(kind='hist', bins=500)
plt.show()
df.query('AEP_MW > 25000')
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | prediction |
|---|---|---|---|---|---|---|---|---|---|---|
| 2007-08-08 15:00:00 | 25164.0 | 15 | 2 | 3 | 8 | 2007 | 220 | 8 | 32 | NaN |
| 2007-08-08 16:00:00 | 25056.0 | 16 | 2 | 3 | 8 | 2007 | 220 | 8 | 32 | NaN |
| 2007-08-08 17:00:00 | 25140.0 | 17 | 2 | 3 | 8 | 2007 | 220 | 8 | 32 | NaN |
| 2007-08-09 17:00:00 | 25035.0 | 17 | 3 | 3 | 8 | 2007 | 221 | 9 | 32 | NaN |
| 2008-10-20 14:00:00 | 25695.0 | 14 | 0 | 4 | 10 | 2008 | 294 | 20 | 43 | NaN |
Looking at the histogram, we can see the data has a bell curve shape. However, looking back at the full plot, there seems to be one data point around 2008-2009 that is much higher than the surrounding points. The month box plot suggests it falls in October, and the query above shows it is 2008-10-20. Let's look at October of 2008.
ax = df.loc[(df.index > '10-01-2008') & (df.index < '10-31-2008')]['AEP_MW'] \
.plot(figsize=(15, 5), title='October 2008 Data', marker='o', linestyle='')
plt.show()
Let's remove that data point, which is more than 33% larger than the surrounding data. The other points above 25,000 MW all fall within the same few days in August 2007, so I will keep them.
df = df.query('AEP_MW < 25_500').copy()
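The hard threshold above works because there is a single obvious spike. A more general check (purely optional, not from the video) would be to flag values that sit far above a trailing median before deciding what to drop:
# Optional sketch: flag hourly values more than 30% above the trailing 7-day median
rolling_med = df['AEP_MW'].rolling('7D').median()
suspects = df.loc[df['AEP_MW'] > 1.3 * rolling_med, ['AEP_MW']]
print(suspects.head())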
2) Time Series Cross-Validation
from sklearn.model_selection import TimeSeriesSplit
# 5 expanding folds, each validating on one year of hourly data (24*365 samples),
# with a 24-hour gap between the end of training and the start of validation.
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)
fold = 0
for train_idx, val_idx in tss.split(df):
train = df.iloc[train_idx]
test = df.iloc[val_idx]
train['AEP_MW'].plot(ax=axs[fold],
label='Training Set',
title=f'Data Train/Test Split Fold {fold}')
test['AEP_MW'].plot(ax=axs[fold],
label='Test Set')
axs[fold].axvline(test.index.min(), color='black', ls='--')
fold += 1
plt.show()
3) Lag Features
def add_lags(df):
target_map = df['AEP_MW'].to_dict()
df['lag1'] = (df.index - pd.Timedelta('364 days')).map(target_map)
df['lag2'] = (df.index - pd.Timedelta('728 days')).map(target_map)
df['lag3'] = (df.index - pd.Timedelta('1092 days')).map(target_map)
return df
df = add_lags(df)
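The lags are multiples of 364 days rather than 365 because 364 is exactly 52 weeks, so each lagged value comes from the same hour and the same day of week roughly one, two, and three years earlier. A quick illustrative check:
# 364 days = 52 weeks, so the day of week is preserved
ts = pd.Timestamp('2018-08-02 20:00')
print(ts.dayofweek, (ts - pd.Timedelta('364 days')).dayofweek)  # 3 3 (both Thursdays)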
Training
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
train = df.iloc[train_idx]
test = df.iloc[val_idx]
train = create_features(train)
test = create_features(test)
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
'lag1','lag2','lag3']
TARGET = 'AEP_MW'
X_train = train[FEATURES]
y_train = train[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
n_estimators=1000,
early_stopping_rounds=50,
objective='reg:squarederror',
max_depth=3,
learning_rate=0.01)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
verbose=100)
y_pred = reg.predict(X_test)
preds.append(y_pred)
score = np.sqrt(mean_squared_error(y_test, y_pred))
scores.append(score)
[0] validation_0-rmse:15898.03113 validation_1-rmse:15263.64389 [100] validation_0-rmse:6031.10029 validation_1-rmse:5645.59194 [200] validation_0-rmse:2653.98825 validation_1-rmse:2426.91728 [300] validation_0-rmse:1707.08329 validation_1-rmse:1644.72726 [400] validation_0-rmse:1489.80648 validation_1-rmse:1551.11461 [462] validation_0-rmse:1446.49648 validation_1-rmse:1553.94478 [0] validation_0-rmse:15834.99651 validation_1-rmse:15222.40367 [100] validation_0-rmse:6007.58253 validation_1-rmse:5715.60355 [200] validation_0-rmse:2645.16579 validation_1-rmse:2504.38734 [300] validation_0-rmse:1707.50979 validation_1-rmse:1616.55439 [400] validation_0-rmse:1500.85941 validation_1-rmse:1452.26766 [500] validation_0-rmse:1442.28701 validation_1-rmse:1428.93763 [600] validation_0-rmse:1410.76972 validation_1-rmse:1428.32590 [627] validation_0-rmse:1404.56989 validation_1-rmse:1428.86707 [0] validation_0-rmse:15778.90821 validation_1-rmse:14549.79005 [100] validation_0-rmse:5985.23715 validation_1-rmse:5101.69865 [200] validation_0-rmse:2635.68625 validation_1-rmse:2054.50316 [300] validation_0-rmse:1704.31663 validation_1-rmse:1509.71998 [370] validation_0-rmse:1533.48604 validation_1-rmse:1518.78834 [0] validation_0-rmse:15679.14337 validation_1-rmse:14632.02881 [100] validation_0-rmse:5950.24136 validation_1-rmse:5461.94157 [200] validation_0-rmse:2627.49815 validation_1-rmse:2469.52470 [300] validation_0-rmse:1707.47979 validation_1-rmse:1780.09008 [400] validation_0-rmse:1506.35016 validation_1-rmse:1662.99200 [500] validation_0-rmse:1449.65781 validation_1-rmse:1647.25522 [526] validation_0-rmse:1440.26842 validation_1-rmse:1646.97284 [0] validation_0-rmse:15600.47280 validation_1-rmse:15012.06541 [100] validation_0-rmse:5924.68051 validation_1-rmse:5860.61123 [200] validation_0-rmse:2626.42679 validation_1-rmse:2778.27809 [300] validation_0-rmse:1717.69377 validation_1-rmse:1903.05223 [400] validation_0-rmse:1514.79626 validation_1-rmse:1713.95049 [500] validation_0-rmse:1458.23547 validation_1-rmse:1698.37746 [509] validation_0-rmse:1454.78082 validation_1-rmse:1700.80224
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
Score across folds 1564.0351
Fold scores:[np.float64(1550.4603694031152), np.float64(1427.667855316764), np.float64(1499.9665274669187), np.float64(1645.4343196934153), np.float64(1696.646615156534)]
3.1) Weather Data
To see how the addition of weather data affects performance, I will add it in place of the lag features and compare the results.
pip install meteostat
Collecting meteostat
  Downloading meteostat-1.7.6-py3-none-any.whl.metadata (4.6 kB)
Requirement already satisfied: pandas>=2 in /usr/local/lib/python3.12/dist-packages (from meteostat) (2.2.2)
Requirement already satisfied: pytz in /usr/local/lib/python3.12/dist-packages (from meteostat) (2025.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from meteostat) (2.0.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas>=2->meteostat) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=2->meteostat) (2025.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas>=2->meteostat) (1.17.0)
Downloading meteostat-1.7.6-py3-none-any.whl (33 kB)
Installing collected packages: meteostat
Successfully installed meteostat-1.7.6
from datetime import datetime
from meteostat import Point, Daily, units
# Define the time period
start = datetime(2004, 1, 1)
end = datetime(2018, 8, 3)
# Latitude and Longitude of Columbus, OH, USA
location = Point(40, -83)
# Get daily data for the specified location and time period
data = Daily(location, start, end)
# Convert data to imperial units if desired (optional)
data = data.convert(units.imperial)
# Fetch the data
daily_temp_df = data.fetch()
daily_temp_df.plot(y=['tavg'])
plt.show()
Warning: Cannot load daily/2004/KOSU0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2005/KOSU0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2004/KTZR0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2005/KTZR0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2004/KLCK0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2004/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2005/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2006/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2007/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2008/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2009/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2010/KDLZ0.csv.gz from https://data.meteostat.net/
print(daily_temp_df.head(1))
print(daily_temp_df.tail(1))
tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
time
2005-01-01 50.2 48.2 51.8 NaN NaN <NA> 3.9 NaN <NA> <NA>
tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
time
2018-08-03 75.0 64.0 84.9 0.0 NaN <NA> 4.2 NaN 1018.1 <NA>
Looks like the weather data only goes back to January 1, 2005. That's OK; I will just use data from 2005 onward for this test.
df_weather = df.loc[df.index >= '01-01-2005']
df_weather.head()
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | prediction | lag1 | lag2 | lag3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2005-01-01 00:00:00 | 12892.0 | 0 | 5 | 1 | 1 | 2005 | 1 | 1 | 53 | NaN | NaN | NaN | NaN |
| 2005-01-01 01:00:00 | 12316.0 | 1 | 5 | 1 | 1 | 2005 | 1 | 1 | 53 | NaN | NaN | NaN | NaN |
| 2005-01-01 02:00:00 | 11890.0 | 2 | 5 | 1 | 1 | 2005 | 1 | 1 | 53 | NaN | NaN | NaN | NaN |
| 2005-01-01 03:00:00 | 11579.0 | 3 | 5 | 1 | 1 | 2005 | 1 | 1 | 53 | NaN | NaN | NaN | NaN |
| 2005-01-01 04:00:00 | 11461.0 | 4 | 5 | 1 | 1 | 2005 | 1 | 1 | 53 | NaN | NaN | NaN | NaN |
Now we need to add the average daily temperature to the energy data. Since the energy data is hourly, I will resample the daily temperature to hourly and forward-fill, so every hour of a day carries that day's average temperature.
# Resample daily temperature data to hourly using forward-fill
hourly_temp_df = daily_temp_df['tavg'].resample('h').ffill()
print("\nUpsampled Hourly Temperature Data (First 5 rows):")
print(hourly_temp_df.head())
Upsampled Hourly Temperature Data (First 5 rows):
time
2005-01-01 00:00:00    50.2
2005-01-01 01:00:00    50.2
2005-01-01 02:00:00    50.2
2005-01-01 03:00:00    50.2
2005-01-01 04:00:00    50.2
Freq: h, Name: tavg, dtype: float64
merged_df = pd.merge(df_weather, hourly_temp_df, left_index=True, right_index=True)
merged_df.tail()
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | prediction | lag1 | lag2 | lag3 | tavg |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-08-02 20:00:00 | 17673.0 | 20 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 18112.775391 | 18705.0 | 20140.0 | 16078.0 | 72.7 |
| 2018-08-02 21:00:00 | 17303.0 | 21 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 18112.775391 | 18108.0 | 19616.0 | 15908.0 | 72.7 |
| 2018-08-02 22:00:00 | 17001.0 | 22 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 17780.941406 | 17544.0 | 19246.0 | 15783.0 | 72.7 |
| 2018-08-02 23:00:00 | 15964.0 | 23 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 16259.825195 | 16262.0 | 17943.0 | 14808.0 | 72.7 |
| 2018-08-03 00:00:00 | 14809.0 | 0 | 4 | 3 | 8 | 2018 | 215 | 3 | 31 | 14421.719727 | 15045.0 | 16491.0 | 13669.0 | 75.0 |
Let's run the training on the data. I will train the first model on only the date-time features, then add the average temperature to the second model to see its impact.
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
merged_df = merged_df.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(merged_df):
train = merged_df.iloc[train_idx]
test = merged_df.iloc[val_idx]
train = create_features(train)
test = create_features(test)
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year']
TARGET = 'AEP_MW'
X_train = train[FEATURES]
y_train = train[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
n_estimators=1000,
early_stopping_rounds=50,
objective='reg:squarederror',
max_depth=3,
learning_rate=0.01)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
verbose=100)
y_pred = reg.predict(X_test)
preds.append(y_pred)
score = np.sqrt(mean_squared_error(y_test, y_pred))
scores.append(score)
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
[0] validation_0-rmse:15919.32529 validation_1-rmse:15260.09387 [100] validation_0-rmse:6074.77797 validation_1-rmse:5468.97507 [200] validation_0-rmse:2708.94876 validation_1-rmse:2331.44146 [300] validation_0-rmse:1734.90622 validation_1-rmse:1690.16608 [400] validation_0-rmse:1494.47518 validation_1-rmse:1637.37576 [500] validation_0-rmse:1412.82265 validation_1-rmse:1631.22793 [600] validation_0-rmse:1370.01671 validation_1-rmse:1615.88885 [700] validation_0-rmse:1341.55411 validation_1-rmse:1603.03996 [800] validation_0-rmse:1317.63729 validation_1-rmse:1581.00706 [900] validation_0-rmse:1298.33594 validation_1-rmse:1566.90554 [999] validation_0-rmse:1284.71780 validation_1-rmse:1562.21287 [0] validation_0-rmse:15852.55221 validation_1-rmse:15217.40241 [100] validation_0-rmse:6054.82674 validation_1-rmse:5450.63577 [200] validation_0-rmse:2713.59735 validation_1-rmse:2315.61490 [300] validation_0-rmse:1760.13594 validation_1-rmse:1622.84965 [400] validation_0-rmse:1522.47684 validation_1-rmse:1513.38632 [500] validation_0-rmse:1437.02999 validation_1-rmse:1487.58200 [600] validation_0-rmse:1394.83465 validation_1-rmse:1467.37681 [700] validation_0-rmse:1366.24235 validation_1-rmse:1450.46670 [800] validation_0-rmse:1341.70354 validation_1-rmse:1438.83801 [900] validation_0-rmse:1322.74958 validation_1-rmse:1423.02636 [999] validation_0-rmse:1306.77534 validation_1-rmse:1415.46878 [0] validation_0-rmse:15793.57570 validation_1-rmse:14544.80747 [100] validation_0-rmse:6034.36341 validation_1-rmse:4859.34313 [200] validation_0-rmse:2708.30119 validation_1-rmse:1926.42047 [300] validation_0-rmse:1763.84178 validation_1-rmse:1557.96818 [357] validation_0-rmse:1593.22044 validation_1-rmse:1583.09576 [0] validation_0-rmse:15690.55320 validation_1-rmse:14623.39203 [100] validation_0-rmse:6004.14878 validation_1-rmse:5104.87467 [200] validation_0-rmse:2709.79669 validation_1-rmse:2239.81896 [300] validation_0-rmse:1772.34647 validation_1-rmse:1735.14351 [400] validation_0-rmse:1535.35824 validation_1-rmse:1661.81061 [500] validation_0-rmse:1455.10409 validation_1-rmse:1634.60631 [600] validation_0-rmse:1408.09365 validation_1-rmse:1606.04842 [700] validation_0-rmse:1375.80033 validation_1-rmse:1581.94460 [800] validation_0-rmse:1353.22099 validation_1-rmse:1567.60810 [900] validation_0-rmse:1336.10302 validation_1-rmse:1549.79120 [999] validation_0-rmse:1321.78842 validation_1-rmse:1531.91717 [0] validation_0-rmse:15609.55353 validation_1-rmse:15005.47437 [100] validation_0-rmse:5980.05887 validation_1-rmse:5571.88600 [200] validation_0-rmse:2709.83896 validation_1-rmse:2607.11561 [300] validation_0-rmse:1778.68455 validation_1-rmse:1872.76565 [400] validation_0-rmse:1541.39039 validation_1-rmse:1719.25684 [500] validation_0-rmse:1458.66181 validation_1-rmse:1699.53719 [600] validation_0-rmse:1417.05157 validation_1-rmse:1685.07744 [682] validation_0-rmse:1390.10489 validation_1-rmse:1684.16992 Score across folds 1549.5958 Fold scores:[np.float64(1562.0637012201323), np.float64(1414.7445059544784), np.float64(1557.0306011532384), np.float64(1531.9171680488605), np.float64(1682.2230640312666)]
fi = pd.DataFrame(reg.feature_importances_,
index = reg.feature_names_in_,
columns=['importance']).sort_values('importance')
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
merged_df = merged_df.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(merged_df):
train = merged_df.iloc[train_idx]
test = merged_df.iloc[val_idx]
train = create_features(train)
test = create_features(test)
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
'tavg']
TARGET = 'AEP_MW'
X_train = train[FEATURES]
y_train = train[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
n_estimators=1000,
early_stopping_rounds=50,
objective='reg:squarederror',
max_depth=3,
learning_rate=0.01)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
verbose=100)
y_pred = reg.predict(X_test)
preds.append(y_pred)
score = np.sqrt(mean_squared_error(y_test, y_pred))
scores.append(score)
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
[0] validation_0-rmse:15918.68939 validation_1-rmse:15259.40198 [100] validation_0-rmse:5990.80751 validation_1-rmse:5289.59651 [200] validation_0-rmse:2491.29233 validation_1-rmse:1906.08593 [300] validation_0-rmse:1398.80560 validation_1-rmse:1179.41119 [400] validation_0-rmse:1103.08122 validation_1-rmse:1123.56388 [500] validation_0-rmse:1006.90660 validation_1-rmse:1104.13707 [600] validation_0-rmse:952.06116 validation_1-rmse:1078.57792 [700] validation_0-rmse:912.94214 validation_1-rmse:1051.89889 [800] validation_0-rmse:885.61288 validation_1-rmse:1028.67841 [900] validation_0-rmse:865.04234 validation_1-rmse:1008.45931 [999] validation_0-rmse:848.67355 validation_1-rmse:988.21122 [0] validation_0-rmse:15851.94636 validation_1-rmse:15216.31490 [100] validation_0-rmse:5973.14599 validation_1-rmse:5286.63352 [200] validation_0-rmse:2494.59685 validation_1-rmse:1929.88132 [300] validation_0-rmse:1415.77456 validation_1-rmse:1176.70316 [400] validation_0-rmse:1126.87075 validation_1-rmse:1083.33471 [500] validation_0-rmse:1028.85917 validation_1-rmse:1046.48953 [600] validation_0-rmse:975.47998 validation_1-rmse:1010.46774 [700] validation_0-rmse:935.97506 validation_1-rmse:979.72659 [800] validation_0-rmse:909.95368 validation_1-rmse:956.55169 [900] validation_0-rmse:888.58142 validation_1-rmse:937.04480 [999] validation_0-rmse:872.40503 validation_1-rmse:922.35614 [0] validation_0-rmse:15793.07026 validation_1-rmse:14545.90643 [100] validation_0-rmse:5954.57507 validation_1-rmse:4844.21554 [200] validation_0-rmse:2492.72274 validation_1-rmse:1680.18132 [300] validation_0-rmse:1417.58814 validation_1-rmse:1134.11459 [387] validation_0-rmse:1148.85464 validation_1-rmse:1125.51557 [0] validation_0-rmse:15690.00576 validation_1-rmse:14623.69791 [100] validation_0-rmse:5922.40903 validation_1-rmse:4953.44560 [200] validation_0-rmse:2487.16266 validation_1-rmse:1826.09771 [300] validation_0-rmse:1421.21121 validation_1-rmse:1171.75880 [400] validation_0-rmse:1131.02612 validation_1-rmse:1099.84963 [500] validation_0-rmse:1033.92959 validation_1-rmse:1063.16529 [600] validation_0-rmse:977.53804 validation_1-rmse:1023.84641 [700] validation_0-rmse:941.23097 validation_1-rmse:988.34928 [800] validation_0-rmse:913.94865 validation_1-rmse:959.44708 [900] validation_0-rmse:893.15006 validation_1-rmse:937.61593 [999] validation_0-rmse:877.32479 validation_1-rmse:920.09351 [0] validation_0-rmse:15608.88568 validation_1-rmse:15003.83467 [100] validation_0-rmse:5894.59674 validation_1-rmse:5297.31304 [200] validation_0-rmse:2482.96049 validation_1-rmse:2108.48200 [300] validation_0-rmse:1423.92894 validation_1-rmse:1257.93833 [400] validation_0-rmse:1130.66224 validation_1-rmse:1072.57476 [500] validation_0-rmse:1036.60259 validation_1-rmse:1019.01514 [600] validation_0-rmse:983.36312 validation_1-rmse:976.64898 [700] validation_0-rmse:947.21354 validation_1-rmse:951.13684 [800] validation_0-rmse:921.53074 validation_1-rmse:934.76627 [900] validation_0-rmse:900.64403 validation_1-rmse:923.56716 [999] validation_0-rmse:884.12595 validation_1-rmse:917.23955 Score across folds 973.7605 Fold scores:[np.float64(988.2112207690716), np.float64(922.3561394856883), np.float64(1120.9018307931397), np.float64(920.0935129271969), np.float64(917.2395482985938)]
fi = pd.DataFrame(reg.feature_importances_,
index = reg.feature_names_in_,
columns=['importance']).sort_values('importance')
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()
As expected, the average temperature was an important feature, reducing the RMSE by roughly 37% (from ~1550 to ~974). It also became the most important feature, displacing hour of day. Let's see the final prediction.
test['prediction'] = reg.predict(X_test)
merged_df = merged_df.merge(test[['prediction']], how='left', left_index=True, right_index=True)
merged_df.tail()
| Datetime | AEP_MW | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | prediction_x | lag1 | lag2 | lag3 | tavg | prediction_y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-08-02 20:00:00 | 17673.0 | 20 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 18112.775391 | 18705.0 | 20140.0 | 16078.0 | 72.7 | 17761.755859 |
| 2018-08-02 21:00:00 | 17303.0 | 21 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 18112.775391 | 18108.0 | 19616.0 | 15908.0 | 72.7 | 17746.158203 |
| 2018-08-02 22:00:00 | 17001.0 | 22 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 17780.941406 | 17544.0 | 19246.0 | 15783.0 | 72.7 | 17422.464844 |
| 2018-08-02 23:00:00 | 15964.0 | 23 | 3 | 3 | 8 | 2018 | 214 | 2 | 31 | 16259.825195 | 16262.0 | 17943.0 | 14808.0 | 72.7 | 16203.428711 |
| 2018-08-03 00:00:00 | 14809.0 | 0 | 4 | 3 | 8 | 2018 | 215 | 3 | 31 | 14421.719727 | 15045.0 | 16491.0 | 13669.0 | 75.0 | 15054.704102 |
ax = merged_df[['AEP_MW']].plot(figsize=(15, 5))
merged_df['prediction_y'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Data and Prediction')
plt.show()
ax = merged_df.loc[(merged_df.index > '04-01-2018') & (merged_df.index < '04-08-2018')]['AEP_MW'] \
.plot(figsize=(15, 5), title='Week Of Data')
merged_df.loc[(merged_df.index > '04-01-2018') & (merged_df.index < '04-08-2018')]['prediction_y'] \
.plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()
test['error'] = np.abs(test[TARGET] - test['prediction'])
test['date'] = test.index.date
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(10)
| date | error |
|---|---|
| 2017-09-04 | 1932.311523 |
| 2018-01-05 | 1882.234863 |
| 2017-12-25 | 1881.235270 |
| 2018-01-08 | 1874.117350 |
| 2018-01-07 | 1865.508382 |
| 2017-11-23 | 1720.531982 |
| 2018-04-24 | 1702.335693 |
| 2018-07-01 | 1604.843709 |
| 2018-01-18 | 1557.129801 |
| 2018-06-18 | 1525.826904 |
4) Future Prediction
# Retrain on all data
df = create_features(df)
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
'lag1','lag2','lag3']
TARGET = 'AEP_MW'
X_all = df[FEATURES]
y_all = df[TARGET]
reg = xgb.XGBRegressor(base_score=0.5,
booster='gbtree',
n_estimators=500,
objective='reg:squarederror',
max_depth=3,
learning_rate=0.01)
reg.fit(X_all, y_all,
eval_set=[(X_all, y_all)],
verbose=100)
[0]   validation_0-rmse:15558.69835
[100] validation_0-rmse:5908.65263
[200] validation_0-rmse:2622.39482
[300] validation_0-rmse:1721.87521
[400] validation_0-rmse:1523.74705
[499] validation_0-rmse:1472.30123
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.01, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=500,
n_jobs=None, num_parallel_tree=None, ...)
df.index.max()
Timestamp('2018-08-03 00:00:00')
# Create a future dataframe covering the next year of hourly timestamps
future = pd.date_range('2018-08-03', '2019-08-01', freq='1h')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
# Concatenate history and future so add_lags can look back into observed values
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
future_w_features = df_and_future.query('isFuture').copy()
Prediction
future_w_features['pred'] = reg.predict(future_w_features[FEATURES])
future_w_features['pred'].plot(figsize=(10, 5),
color=color_pal[4],
ms=1,
lw=1,
title='Future Predictions')
plt.show()
future_w_features['pred'].loc[(future_w_features.index > '12-15-2018') & (future_w_features.index < '01-15-2019')] \
.plot(figsize=(15, 5), title='New Year Data')
plt.show()