Working through the time series forecasting YouTube video "Time Series Forecasting with XGBoost - Use python and machine learning to predict energy consumption": using machine learning to predict hourly energy consumption data.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xgboost as xgb
import io
from google.colab import files

from sklearn.metrics import mean_squared_error
In [2]:
uploaded = files.upload()
Saving AEP_hourly.csv to AEP_hourly.csv
In [3]:
df = pd.read_csv(io.BytesIO(uploaded['AEP_hourly.csv']))
df['Datetime'] = pd.to_datetime(df['Datetime'])
df.sort_values('Datetime', inplace=True) #The data is not sorted in the csv file (discovered during later analysis). This will clean it up
df = df.set_index('Datetime')
df.head(10)
Out[3]:
AEP_MW
Datetime
2004-10-01 01:00:00 12379.0
2004-10-01 02:00:00 11935.0
2004-10-01 03:00:00 11692.0
2004-10-01 04:00:00 11597.0
2004-10-01 05:00:00 11681.0
2004-10-01 06:00:00 12280.0
2004-10-01 07:00:00 13692.0
2004-10-01 08:00:00 14618.0
2004-10-01 09:00:00 14903.0
2004-10-01 10:00:00 15118.0
In [4]:
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

df.plot(style='.',
        figsize=(15, 5),
        color=color_pal[0],
        title='AEP Energy Use in MW')
plt.show()

Train/Test Split

In [5]:
train = df.loc[df.index < '01-01-2015']
test = df.loc[df.index >= '01-01-2015']

fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test.plot(ax=ax, label='Test Set')
ax.axvline('01-01-2015', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
In [6]:
df.loc[(df.index > '01-01-2010') & (df.index < '01-08-2010')] \
    .plot(figsize=(15, 5), title='Week Of Data')
plt.show()

Feature Creation

In [7]:
def create_features(df):
  """
  Create time series features based on time series index.
  """
  df = df.copy()

  df['hour'] = df.index.hour
  df['dayofweek'] = df.index.dayofweek
  df['quarter'] = df.index.quarter
  df['month'] = df.index.month
  df['year'] = df.index.year
  df['dayofyear'] = df.index.dayofyear
  df['dayofmonth'] = df.index.day
  df['weekofyear'] = df.index.isocalendar().week

  return df

df = create_features(df)
df.head()
Out[7]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear
Datetime
2004-10-01 01:00:00 12379.0 1 4 4 10 2004 275 1 40
2004-10-01 02:00:00 11935.0 2 4 4 10 2004 275 1 40
2004-10-01 03:00:00 11692.0 3 4 4 10 2004 275 1 40
2004-10-01 04:00:00 11597.0 4 4 4 10 2004 275 1 40
2004-10-01 05:00:00 11681.0 5 4 4 10 2004 275 1 40

Visualize Features

In [8]:
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='hour', y='AEP_MW')
ax.set_title('MW by Hour')
plt.show()
In [9]:
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='month', y='AEP_MW', hue='month', palette='Blues', legend=False)
ax.set_title('MW by Month')
plt.show()

Create Model

In [10]:
train = create_features(train)
test = create_features(test)

FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
TARGET = 'AEP_MW'

X_train = train[FEATURES]
y_train = train[TARGET]

X_test = test[FEATURES]
y_test = test[TARGET]
In [11]:
reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50, learning_rate = 0.01)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)
[0]	validation_0-rmse:2557.05852	validation_1-rmse:2669.66874
[100]	validation_0-rmse:1667.99304	validation_1-rmse:1906.43321
[200]	validation_0-rmse:1388.41609	validation_1-rmse:1728.03964
[300]	validation_0-rmse:1260.06288	validation_1-rmse:1670.94367
[400]	validation_0-rmse:1170.54347	validation_1-rmse:1651.32523
[500]	validation_0-rmse:1113.48343	validation_1-rmse:1643.19503
[545]	validation_0-rmse:1091.20163	validation_1-rmse:1646.54135
Out[11]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=50,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=1000,
             n_jobs=None, num_parallel_tree=None, ...)

Feature Importance

In [12]:
fi = pd.DataFrame(reg.feature_importances_,
             index = reg.feature_names_in_,
             columns=['importance']).sort_values('importance')

fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()

As expected, the most important features are hour, month, and day of week. Day of year and quarter are probably redundant variables whose information could be inferred from the other date features (see the quick check below).
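
As a quick sanity check (my own addition, not from the video), quarter really is just a coarser version of month, so a one-liner like this should hold on the dataframe built above:

assert (df['quarter'] == (df['month'] - 1) // 3 + 1).all()  # quarter is fully determined by month

Day of year is similarly (leap years aside) determined by month and day of month, so dropping these from FEATURES would probably not cost the model much.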

Forecast on Test

In [13]:
test['prediction'] = reg.predict(X_test)
test.head()
Out[13]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear prediction
Datetime
2015-01-01 00:00:00 16375.0 0 3 1 1 2015 1 1 1 16409.544922
2015-01-01 01:00:00 16172.0 1 3 1 1 2015 1 1 1 15945.625000
2015-01-01 02:00:00 15968.0 2 3 1 1 2015 1 1 1 15893.853516
2015-01-01 03:00:00 15749.0 3 3 1 1 2015 1 1 1 15935.795898
2015-01-01 04:00:00 15727.0 4 3 1 1 2015 1 1 1 15935.795898
In [14]:
df = df.merge(test[['prediction']], how='left', left_index=True, right_index=True)
df.tail()
Out[14]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear prediction
Datetime
2018-08-02 20:00:00 17673.0 20 3 3 8 2018 214 2 31 18112.775391
2018-08-02 21:00:00 17303.0 21 3 3 8 2018 214 2 31 18112.775391
2018-08-02 22:00:00 17001.0 22 3 3 8 2018 214 2 31 17780.941406
2018-08-02 23:00:00 15964.0 23 3 3 8 2018 214 2 31 16259.825195
2018-08-03 00:00:00 14809.0 0 4 3 8 2018 215 3 31 14421.719727
In [15]:
ax = df[['AEP_MW']].plot(figsize=(15, 5))
df['prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Data and Prediction')
plt.show()
In [16]:
ax = df.loc[(df.index > '04-01-2018') & (df.index < '04-08-2018')]['AEP_MW'] \
    .plot(figsize=(15, 5), title='Week Of Data')
df.loc[(df.index > '04-01-2018') & (df.index < '04-08-2018')]['prediction'] \
    .plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()
In [17]:
score = np.sqrt(mean_squared_error(test['AEP_MW'], test['prediction']))
print(f'RMSE Score on Test set: {score:0.2f}')
RMSE Score on Test set: 1643.09

Aggregate the energy usage error by date and report the most inaccurate dates.

In [18]:
test['error'] = np.abs(test[TARGET] - test['prediction'])
test['date'] = test.index.date
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(10)
Out[18]:
error
date
2017-01-22 4872.108968
2015-02-20 4861.401571
2015-02-19 4769.359985
2018-01-06 4403.549683
2017-02-07 4290.881022
2018-01-07 4202.593099
2017-01-03 4168.695475
2016-02-03 4092.213298
2017-01-23 4076.364746
2017-02-12 4060.571086

At this point, I will begin exploring on my own. All of the worst-performing days are in January and February. Let's look at January and February of 2017, which contain 5 of the top-10 dates.

In [19]:
ax = df.loc[(df.index > '01-01-2017') & (df.index < '02-28-2017')]['AEP_MW'] \
    .plot(figsize=(15, 5), title='Jan-Feb 2017')
df.loc[(df.index > '01-01-2017') & (df.index < '02-28-2017')]['prediction'] \
    .plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()

Hmm, it looks like most of the errors are overpredictions (predicting higher energy usage than actually occurred). Seeing that AEP is the electric utility for Ohio, I would hypothesize that a significant amount of the energy usage in January and February is related to heating homes. If there is unseasonably warm weather in January and February, we would possibly see a reduction in energy usage that would not be accurately predicted. I seem to remember (I was living in Chicago at the time) that the 2017 winter was relatively mild compared to other years.

Looking at the weather data (weather.gov) from January 2017 for Columbus, OH, the average high in January 2017 was 7 degrees greater than normal, and the average low was 8 degrees greater than normal. Diving into the data further, on January 22, 2017 (the highest-error date), the daily high was 64 deg F, 27 deg above normal. This would seem to confirm my hypothesis that the errors are due to abnormally warm winter weather.

In terms of improving the prediction, the easiest addition would be a variable incorporating weather data. Based on the exploration above, a variable tracking the difference between the day's high and the normal high for that date would supply the most information.
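
A minimal sketch of what that feature could look like (my own addition; it assumes a daily weather dataframe with a DatetimeIndex and a tmax column, like the meteostat data pulled in Part 2 below, the helper and column names are just placeholders, and pandas is already imported as pd at the top of the notebook):

def add_high_temp_anomaly(df, daily_weather):
    """
    Add how far each day's high was above or below the normal high for that calendar day.
    """
    highs = daily_weather['tmax']
    # climatological normal: mean high for each (month, day) across all years of weather data
    normal_high = highs.groupby([highs.index.month, highs.index.day]).transform('mean')
    anomaly = highs - normal_high
    anomaly.index = anomaly.index.date  # key the anomaly by calendar date
    df = df.copy()
    df['high_temp_anomaly'] = pd.Series(df.index.date, index=df.index).map(anomaly.to_dict())
    return df

The resulting column could then be appended to FEATURES in the same way tavg is added in section 3.1 below.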

Part 2 Follow-Up

There was a follow-up YouTube video where some of the most-requested suggestions from the comments were implemented. I shall do the same: Time Series Forecasting with XGBoost - Advanced Methods, https://www.youtube.com/watch?v=z3ZnOW-S550&list=WL&index=13

1) Outlier Removal

In [20]:
df['AEP_MW'].plot(kind='hist', bins=500)
Out[20]:
<Axes: ylabel='Frequency'>
In [21]:
df.query('AEP_MW > 25000')
Out[21]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear prediction
Datetime
2007-08-08 15:00:00 25164.0 15 2 3 8 2007 220 8 32 NaN
2007-08-08 16:00:00 25056.0 16 2 3 8 2007 220 8 32 NaN
2007-08-08 17:00:00 25140.0 17 2 3 8 2007 220 8 32 NaN
2007-08-09 17:00:00 25035.0 17 3 3 8 2007 221 9 32 NaN
2008-10-20 14:00:00 25695.0 14 0 4 10 2008 294 20 43 NaN

Looking at the histogram, we can see the data has a bell-curve shape. However, looking back at the full plot, there seems to be one data point around 2008 that is much higher than the surrounding data points. Looking at the month box plot, the same point shows up as a high outlier in October, and the query above places it on October 20, 2008. Let's look at October of 2008.

In [22]:
ax = df.loc[(df.index > '10-01-2008') & (df.index < '10-31-2008')]['AEP_MW'] \
    .plot(figsize=(15, 5), title='October 2008 Data', marker='o', linestyle='')
plt.show()

Let's remove that data point, which is more than 33% larger than the surrounding data. Besides that, the other outliers all fall within the same date range (August 2007), so I will keep them.

In [23]:
df = df.query('AEP_MW < 25_500').copy()

2) Time Series Cross-Validation

In [24]:
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()
In [25]:
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)

fold = 0
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['AEP_MW'].plot(ax=axs[fold],
                          label='Training Set',
                          title=f'Data Train/Test Split Fold {fold}')
    test['AEP_MW'].plot(ax=axs[fold],
                         label='Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
    fold += 1
plt.show()

3) Lag Features

In [26]:
def add_lags(df):
    target_map = df['AEP_MW'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(target_map)
    df['lag2'] = (df.index - pd.Timedelta('728 days')).map(target_map)
    df['lag3'] = (df.index - pd.Timedelta('1092 days')).map(target_map)
    return df
In [27]:
df = add_lags(df)

Training

In [28]:
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()


fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]

    train = create_features(train)
    test = create_features(test)

    FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
                'lag1','lag2','lag3']
    TARGET = 'AEP_MW'

    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
[0]	validation_0-rmse:15898.03113	validation_1-rmse:15263.64389
[100]	validation_0-rmse:6031.10029	validation_1-rmse:5645.59194
[200]	validation_0-rmse:2653.98825	validation_1-rmse:2426.91728
[300]	validation_0-rmse:1707.08329	validation_1-rmse:1644.72726
[400]	validation_0-rmse:1489.80648	validation_1-rmse:1551.11461
[462]	validation_0-rmse:1446.49648	validation_1-rmse:1553.94478
[0]	validation_0-rmse:15834.99651	validation_1-rmse:15222.40367
[100]	validation_0-rmse:6007.58253	validation_1-rmse:5715.60355
[200]	validation_0-rmse:2645.16579	validation_1-rmse:2504.38734
[300]	validation_0-rmse:1707.50979	validation_1-rmse:1616.55439
[400]	validation_0-rmse:1500.85941	validation_1-rmse:1452.26766
[500]	validation_0-rmse:1442.28701	validation_1-rmse:1428.93763
[600]	validation_0-rmse:1410.76972	validation_1-rmse:1428.32590
[627]	validation_0-rmse:1404.56989	validation_1-rmse:1428.86707
[0]	validation_0-rmse:15778.90821	validation_1-rmse:14549.79005
[100]	validation_0-rmse:5985.23715	validation_1-rmse:5101.69865
[200]	validation_0-rmse:2635.68625	validation_1-rmse:2054.50316
[300]	validation_0-rmse:1704.31663	validation_1-rmse:1509.71998
[370]	validation_0-rmse:1533.48604	validation_1-rmse:1518.78834
[0]	validation_0-rmse:15679.14337	validation_1-rmse:14632.02881
[100]	validation_0-rmse:5950.24136	validation_1-rmse:5461.94157
[200]	validation_0-rmse:2627.49815	validation_1-rmse:2469.52470
[300]	validation_0-rmse:1707.47979	validation_1-rmse:1780.09008
[400]	validation_0-rmse:1506.35016	validation_1-rmse:1662.99200
[500]	validation_0-rmse:1449.65781	validation_1-rmse:1647.25522
[526]	validation_0-rmse:1440.26842	validation_1-rmse:1646.97284
[0]	validation_0-rmse:15600.47280	validation_1-rmse:15012.06541
[100]	validation_0-rmse:5924.68051	validation_1-rmse:5860.61123
[200]	validation_0-rmse:2626.42679	validation_1-rmse:2778.27809
[300]	validation_0-rmse:1717.69377	validation_1-rmse:1903.05223
[400]	validation_0-rmse:1514.79626	validation_1-rmse:1713.95049
[500]	validation_0-rmse:1458.23547	validation_1-rmse:1698.37746
[509]	validation_0-rmse:1454.78082	validation_1-rmse:1700.80224
In [29]:
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
Score across folds 1564.0351
Fold scores:[np.float64(1550.4603694031152), np.float64(1427.667855316764), np.float64(1499.9665274669187), np.float64(1645.4343196934153), np.float64(1696.646615156534)]

3.1) Weather Data

In order to see how the addition of weather data affects performance, I will add it in place of the lag features and compare the results.

In [30]:
pip install meteostat
Collecting meteostat
  Downloading meteostat-1.7.6-py3-none-any.whl.metadata (4.6 kB)
Requirement already satisfied: pandas>=2 in /usr/local/lib/python3.12/dist-packages (from meteostat) (2.2.2)
Requirement already satisfied: pytz in /usr/local/lib/python3.12/dist-packages (from meteostat) (2025.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from meteostat) (2.0.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas>=2->meteostat) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=2->meteostat) (2025.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas>=2->meteostat) (1.17.0)
Downloading meteostat-1.7.6-py3-none-any.whl (33 kB)
Installing collected packages: meteostat
Successfully installed meteostat-1.7.6
In [31]:
from datetime import datetime
from meteostat import Point, Daily, units

# Define the time period
start = datetime(2004, 1, 1)
end = datetime(2018, 8, 3)

# Latitude and Longitude of Columbus, OH, USA
location = Point(40, -83)

# Get daily data for the specified location and time period
data = Daily(location, start, end)

# Convert data to imperial units if desired (optional)
data = data.convert(units.imperial)

# Fetch the data
daily_temp_df = data.fetch()

daily_temp_df.plot(y=['tavg'])
plt.show()
Warning: Cannot load daily/2004/KOSU0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2005/KOSU0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2004/KTZR0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2005/KTZR0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2004/KLCK0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2004/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2005/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2006/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2007/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2008/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2009/KDLZ0.csv.gz from https://data.meteostat.net/
Warning: Cannot load daily/2010/KDLZ0.csv.gz from https://data.meteostat.net/
In [32]:
print(daily_temp_df.head(1))
print(daily_temp_df.tail(1))
            tavg  tmin  tmax  prcp  snow  wdir  wspd  wpgt  pres  tsun
time                                                                  
2005-01-01  50.2  48.2  51.8   NaN   NaN  <NA>   3.9   NaN  <NA>  <NA>
            tavg  tmin  tmax  prcp  snow  wdir  wspd  wpgt    pres  tsun
time                                                                    
2018-08-03  75.0  64.0  84.9   0.0   NaN  <NA>   4.2   NaN  1018.1  <NA>

Looks like the weather data only goes back to January 1, 2005. That's OK; I will just use data from 2005 onward for this test.

In [33]:
df_weather = df.loc[df.index >= '01-01-2005']
df_weather.head()
Out[33]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear prediction lag1 lag2 lag3
Datetime
2005-01-01 00:00:00 12892.0 0 5 1 1 2005 1 1 53 NaN NaN NaN NaN
2005-01-01 01:00:00 12316.0 1 5 1 1 2005 1 1 53 NaN NaN NaN NaN
2005-01-01 02:00:00 11890.0 2 5 1 1 2005 1 1 53 NaN NaN NaN NaN
2005-01-01 03:00:00 11579.0 3 5 1 1 2005 1 1 53 NaN NaN NaN NaN
2005-01-01 04:00:00 11461.0 4 5 1 1 2005 1 1 53 NaN NaN NaN NaN

Now we need to add the average daily temperature data to the energy data. Since the energy data is hourly, I will resample the daily temperature series to hourly and forward-fill it, so every hour of a day uses that day's average temperature.

In [34]:
# Resample daily temperature data to hourly using forward-fill
hourly_temp_df = daily_temp_df['tavg'].resample('h').ffill()

print("\nUpsampled Hourly Temperature Data (First 5 rows):")
print(hourly_temp_df.head())
Upsampled Hourly Temperature Data (First 5 rows):
time
2005-01-01 00:00:00    50.2
2005-01-01 01:00:00    50.2
2005-01-01 02:00:00    50.2
2005-01-01 03:00:00    50.2
2005-01-01 04:00:00    50.2
Freq: h, Name: tavg, dtype: float64
In [35]:
merged_df = pd.merge(df_weather, hourly_temp_df, left_index=True, right_index=True)
merged_df.tail()
#merged_df = df_weather.merge(hourly_temp_df, on='datetime', how='left')
Out[35]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear prediction lag1 lag2 lag3 tavg
2018-08-02 20:00:00 17673.0 20 3 3 8 2018 214 2 31 18112.775391 18705.0 20140.0 16078.0 72.7
2018-08-02 21:00:00 17303.0 21 3 3 8 2018 214 2 31 18112.775391 18108.0 19616.0 15908.0 72.7
2018-08-02 22:00:00 17001.0 22 3 3 8 2018 214 2 31 17780.941406 17544.0 19246.0 15783.0 72.7
2018-08-02 23:00:00 15964.0 23 3 3 8 2018 214 2 31 16259.825195 16262.0 17943.0 14808.0 72.7
2018-08-03 00:00:00 14809.0 0 4 3 8 2018 215 3 31 14421.719727 15045.0 16491.0 13669.0 75.0

Let's run the training on the data. I will train the first model on only the datetime features. I will then add the average temperature to the second model to see its impact.

In [36]:
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
merged_df = merged_df.sort_index()


fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(merged_df):
    train = merged_df.iloc[train_idx]
    test = merged_df.iloc[val_idx]

    train = create_features(train)
    test = create_features(test)

    FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year']
    TARGET = 'AEP_MW'

    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
[0]	validation_0-rmse:15919.32529	validation_1-rmse:15260.09387
[100]	validation_0-rmse:6074.77797	validation_1-rmse:5468.97507
[200]	validation_0-rmse:2708.94876	validation_1-rmse:2331.44146
[300]	validation_0-rmse:1734.90622	validation_1-rmse:1690.16608
[400]	validation_0-rmse:1494.47518	validation_1-rmse:1637.37576
[500]	validation_0-rmse:1412.82265	validation_1-rmse:1631.22793
[600]	validation_0-rmse:1370.01671	validation_1-rmse:1615.88885
[700]	validation_0-rmse:1341.55411	validation_1-rmse:1603.03996
[800]	validation_0-rmse:1317.63729	validation_1-rmse:1581.00706
[900]	validation_0-rmse:1298.33594	validation_1-rmse:1566.90554
[999]	validation_0-rmse:1284.71780	validation_1-rmse:1562.21287
[0]	validation_0-rmse:15852.55221	validation_1-rmse:15217.40241
[100]	validation_0-rmse:6054.82674	validation_1-rmse:5450.63577
[200]	validation_0-rmse:2713.59735	validation_1-rmse:2315.61490
[300]	validation_0-rmse:1760.13594	validation_1-rmse:1622.84965
[400]	validation_0-rmse:1522.47684	validation_1-rmse:1513.38632
[500]	validation_0-rmse:1437.02999	validation_1-rmse:1487.58200
[600]	validation_0-rmse:1394.83465	validation_1-rmse:1467.37681
[700]	validation_0-rmse:1366.24235	validation_1-rmse:1450.46670
[800]	validation_0-rmse:1341.70354	validation_1-rmse:1438.83801
[900]	validation_0-rmse:1322.74958	validation_1-rmse:1423.02636
[999]	validation_0-rmse:1306.77534	validation_1-rmse:1415.46878
[0]	validation_0-rmse:15793.57570	validation_1-rmse:14544.80747
[100]	validation_0-rmse:6034.36341	validation_1-rmse:4859.34313
[200]	validation_0-rmse:2708.30119	validation_1-rmse:1926.42047
[300]	validation_0-rmse:1763.84178	validation_1-rmse:1557.96818
[357]	validation_0-rmse:1593.22044	validation_1-rmse:1583.09576
[0]	validation_0-rmse:15690.55320	validation_1-rmse:14623.39203
[100]	validation_0-rmse:6004.14878	validation_1-rmse:5104.87467
[200]	validation_0-rmse:2709.79669	validation_1-rmse:2239.81896
[300]	validation_0-rmse:1772.34647	validation_1-rmse:1735.14351
[400]	validation_0-rmse:1535.35824	validation_1-rmse:1661.81061
[500]	validation_0-rmse:1455.10409	validation_1-rmse:1634.60631
[600]	validation_0-rmse:1408.09365	validation_1-rmse:1606.04842
[700]	validation_0-rmse:1375.80033	validation_1-rmse:1581.94460
[800]	validation_0-rmse:1353.22099	validation_1-rmse:1567.60810
[900]	validation_0-rmse:1336.10302	validation_1-rmse:1549.79120
[999]	validation_0-rmse:1321.78842	validation_1-rmse:1531.91717
[0]	validation_0-rmse:15609.55353	validation_1-rmse:15005.47437
[100]	validation_0-rmse:5980.05887	validation_1-rmse:5571.88600
[200]	validation_0-rmse:2709.83896	validation_1-rmse:2607.11561
[300]	validation_0-rmse:1778.68455	validation_1-rmse:1872.76565
[400]	validation_0-rmse:1541.39039	validation_1-rmse:1719.25684
[500]	validation_0-rmse:1458.66181	validation_1-rmse:1699.53719
[600]	validation_0-rmse:1417.05157	validation_1-rmse:1685.07744
[682]	validation_0-rmse:1390.10489	validation_1-rmse:1684.16992
Score across folds 1549.5958
Fold scores:[np.float64(1562.0637012201323), np.float64(1414.7445059544784), np.float64(1557.0306011532384), np.float64(1531.9171680488605), np.float64(1682.2230640312666)]
In [37]:
fi = pd.DataFrame(reg.feature_importances_,
             index = reg.feature_names_in_,
             columns=['importance']).sort_values('importance')

fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()
In [38]:
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
merged_df = merged_df.sort_index()


fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(merged_df):
    train = merged_df.iloc[train_idx]
    test = merged_df.iloc[val_idx]

    train = create_features(train)
    test = create_features(test)

    FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
                'tavg']
    TARGET = 'AEP_MW'

    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
[0]	validation_0-rmse:15918.68939	validation_1-rmse:15259.40198
[100]	validation_0-rmse:5990.80751	validation_1-rmse:5289.59651
[200]	validation_0-rmse:2491.29233	validation_1-rmse:1906.08593
[300]	validation_0-rmse:1398.80560	validation_1-rmse:1179.41119
[400]	validation_0-rmse:1103.08122	validation_1-rmse:1123.56388
[500]	validation_0-rmse:1006.90660	validation_1-rmse:1104.13707
[600]	validation_0-rmse:952.06116	validation_1-rmse:1078.57792
[700]	validation_0-rmse:912.94214	validation_1-rmse:1051.89889
[800]	validation_0-rmse:885.61288	validation_1-rmse:1028.67841
[900]	validation_0-rmse:865.04234	validation_1-rmse:1008.45931
[999]	validation_0-rmse:848.67355	validation_1-rmse:988.21122
[0]	validation_0-rmse:15851.94636	validation_1-rmse:15216.31490
[100]	validation_0-rmse:5973.14599	validation_1-rmse:5286.63352
[200]	validation_0-rmse:2494.59685	validation_1-rmse:1929.88132
[300]	validation_0-rmse:1415.77456	validation_1-rmse:1176.70316
[400]	validation_0-rmse:1126.87075	validation_1-rmse:1083.33471
[500]	validation_0-rmse:1028.85917	validation_1-rmse:1046.48953
[600]	validation_0-rmse:975.47998	validation_1-rmse:1010.46774
[700]	validation_0-rmse:935.97506	validation_1-rmse:979.72659
[800]	validation_0-rmse:909.95368	validation_1-rmse:956.55169
[900]	validation_0-rmse:888.58142	validation_1-rmse:937.04480
[999]	validation_0-rmse:872.40503	validation_1-rmse:922.35614
[0]	validation_0-rmse:15793.07026	validation_1-rmse:14545.90643
[100]	validation_0-rmse:5954.57507	validation_1-rmse:4844.21554
[200]	validation_0-rmse:2492.72274	validation_1-rmse:1680.18132
[300]	validation_0-rmse:1417.58814	validation_1-rmse:1134.11459
[387]	validation_0-rmse:1148.85464	validation_1-rmse:1125.51557
[0]	validation_0-rmse:15690.00576	validation_1-rmse:14623.69791
[100]	validation_0-rmse:5922.40903	validation_1-rmse:4953.44560
[200]	validation_0-rmse:2487.16266	validation_1-rmse:1826.09771
[300]	validation_0-rmse:1421.21121	validation_1-rmse:1171.75880
[400]	validation_0-rmse:1131.02612	validation_1-rmse:1099.84963
[500]	validation_0-rmse:1033.92959	validation_1-rmse:1063.16529
[600]	validation_0-rmse:977.53804	validation_1-rmse:1023.84641
[700]	validation_0-rmse:941.23097	validation_1-rmse:988.34928
[800]	validation_0-rmse:913.94865	validation_1-rmse:959.44708
[900]	validation_0-rmse:893.15006	validation_1-rmse:937.61593
[999]	validation_0-rmse:877.32479	validation_1-rmse:920.09351
[0]	validation_0-rmse:15608.88568	validation_1-rmse:15003.83467
[100]	validation_0-rmse:5894.59674	validation_1-rmse:5297.31304
[200]	validation_0-rmse:2482.96049	validation_1-rmse:2108.48200
[300]	validation_0-rmse:1423.92894	validation_1-rmse:1257.93833
[400]	validation_0-rmse:1130.66224	validation_1-rmse:1072.57476
[500]	validation_0-rmse:1036.60259	validation_1-rmse:1019.01514
[600]	validation_0-rmse:983.36312	validation_1-rmse:976.64898
[700]	validation_0-rmse:947.21354	validation_1-rmse:951.13684
[800]	validation_0-rmse:921.53074	validation_1-rmse:934.76627
[900]	validation_0-rmse:900.64403	validation_1-rmse:923.56716
[999]	validation_0-rmse:884.12595	validation_1-rmse:917.23955
Score across folds 973.7605
Fold scores:[np.float64(988.2112207690716), np.float64(922.3561394856883), np.float64(1120.9018307931397), np.float64(920.0935129271969), np.float64(917.2395482985938)]
In [39]:
fi = pd.DataFrame(reg.feature_importances_,
             index = reg.feature_names_in_,
             columns=['importance']).sort_values('importance')

fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()

As expected, the average temperature was an important feature, reducing the cross-validated RMSE by roughly 37% (from about 1550 to about 974). It also became the most important feature, replacing hour of day. Let's see the final prediction.

In [40]:
test['prediction'] = reg.predict(X_test)
merged_df = merged_df.merge(test[['prediction']], how='left', left_index=True, right_index=True)
merged_df.tail()
Out[40]:
AEP_MW hour dayofweek quarter month year dayofyear dayofmonth weekofyear prediction_x lag1 lag2 lag3 tavg prediction_y
2018-08-02 20:00:00 17673.0 20 3 3 8 2018 214 2 31 18112.775391 18705.0 20140.0 16078.0 72.7 17761.755859
2018-08-02 21:00:00 17303.0 21 3 3 8 2018 214 2 31 18112.775391 18108.0 19616.0 15908.0 72.7 17746.158203
2018-08-02 22:00:00 17001.0 22 3 3 8 2018 214 2 31 17780.941406 17544.0 19246.0 15783.0 72.7 17422.464844
2018-08-02 23:00:00 15964.0 23 3 3 8 2018 214 2 31 16259.825195 16262.0 17943.0 14808.0 72.7 16203.428711
2018-08-03 00:00:00 14809.0 0 4 3 8 2018 215 3 31 14421.719727 15045.0 16491.0 13669.0 75.0 15054.704102
In [41]:
ax = merged_df[['AEP_MW']].plot(figsize=(15, 5))
merged_df['prediction_y'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Data and Prediction')
plt.show()
In [42]:
ax = merged_df.loc[(merged_df.index > '04-01-2018') & (merged_df.index < '04-08-2018')]['AEP_MW'] \
    .plot(figsize=(15, 5), title='Week Of Data')
merged_df.loc[(merged_df.index > '04-01-2018') & (merged_df.index < '04-08-2018')]['prediction_y'] \
    .plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()
In [43]:
test['error'] = np.abs(test[TARGET] - test['prediction'])
test['date'] = test.index.date
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(10)
Out[43]:
error
date
2017-09-04 1932.311523
2018-01-05 1882.234863
2017-12-25 1881.235270
2018-01-08 1874.117350
2018-01-07 1865.508382
2017-11-23 1720.531982
2018-04-24 1702.335693
2018-07-01 1604.843709
2018-01-18 1557.129801
2018-06-18 1525.826904

4) Future Prediction

In [44]:
# Retrain on all data
df = create_features(df)

FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
            'lag1','lag2','lag3']
TARGET = 'AEP_MW'

X_all = df[FEATURES]
y_all = df[TARGET]

reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=500,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_all, y_all,
        eval_set=[(X_all, y_all)],
        verbose=100)
[0]	validation_0-rmse:15558.69835
[100]	validation_0-rmse:5908.65263
[200]	validation_0-rmse:2622.39482
[300]	validation_0-rmse:1721.87521
[400]	validation_0-rmse:1523.74705
[499]	validation_0-rmse:1472.30123
Out[44]:
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=500,
             n_jobs=None, num_parallel_tree=None, ...)
In [45]:
df.index.max()
Out[45]:
Timestamp('2018-08-03 00:00:00')
In [46]:
# Create future dataframe
future = pd.date_range('2018-08-03','2019-08-01', freq='1h')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
In [47]:
future_w_features = df_and_future.query('isFuture').copy()

Prediction

In [48]:
future_w_features['pred'] = reg.predict(future_w_features[FEATURES])
In [49]:
future_w_features['pred'].plot(figsize=(10, 5),
                               color=color_pal[4],
                               ms=1,
                               lw=1,
                               title='Future Predictions')
plt.show()
In [50]:
future_w_features['pred'].loc[(future_w_features.index > '12-15-2018') & (future_w_features.index < '01-15-2019')] \
    .plot(figsize=(15, 5), title='New year Data')
Out[50]:
<Axes: title={'center': 'New year Data'}>
In [ ]: