# Import libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data into a DataFrame; the CSV is expected in the current
# working directory.
raw = pd.read_csv('Metro_Interstate_Traffic_Volume.csv')

# Display the first five rows. The value is only rendered when this is the
# last expression of a notebook cell; in a plain script it is discarded.
raw.head()

# Display details (dtype, non-null count) for each column.
raw.info()
1
2
3
4
5
6
7
8
9
10
raw.head()
raw.info()
查看 `info` 信息,我们发现 `date_time` 这一列是 `object` 类型,所以我们需要将其转化为 `datetime` 类型:
# Replace the `date_time` column with its parsed pandas datetime
# equivalent (the column is overwritten in place on `raw`).
raw['date_time'] = pd.to_datetime(raw['date_time'])
1
2
特征工程
从上面的 `info` 方法的输出中,我们知道除了 `date_time` 列之外还有其他的分类特征。但是由于本文的主要主题是处理时间序列数据,我们将重点关注针对 `date_time` 的特征工程。
获取 `Day name` 的方式和上面几个特征有所不同。我们想要确定 `raw.date_time` 序列中关于星期几的信息,需要以下两个步骤。首先,通过 `pd.Series.dt.day_name()` 生成 `day name` 序列。然后,我们需要通过 `pd.get_dummies()` 进行独热编码(one-hot encode)。
# Step 1: extract the weekday name of every timestamp as a string Series.
to_one_hot = raw['date_time'].dt.day_name()

# Step 2: one-hot encode the names; this yields one indicator column per
# distinct weekday present in the data (seven when all weekdays occur).
days = pd.get_dummies(to_one_hot)

# Display the encoded table (rendered when run as a notebook cell).
days
# is_holiday flag: 1 when the row names a holiday, 0 otherwise.
# The original comparison against the string "None" implies that is how the
# CSV marks non-holiday rows. Recent pandas versions list "None" among the
# default NA markers of `read_csv`, in which case those rows arrive as NaN
# instead; both representations are therefore treated as "no holiday".
is_holiday = (raw.holiday.notna() & (raw.holiday != "None")).astype(int)
1
2
我们需要考虑的最后一个分类特征是天气。我们只对该特征进行如下独热编码。
# One-hot encode the weather category: one indicator column per distinct
# value found in `weather_main`.
weathers = pd.get_dummies(raw['weather_main'])

# Display the encoded table (rendered when run as a notebook cell).
weathers
1
2
3
4
独热编码后的Weather信息
特征处理后的数据
现在,我们终于有了最终的可用于训练的数据!让我们创建一个名为 `features` 的全新数据集,它包含所有的特征,包括数值型特征(我们从原始数据中按原样放置)和分类特征(我们设计的特征)。
# Features table.
# Step 1: gather every feature that occupies a single column -- the raw
# readings copied unchanged from `raw`, followed by the engineered values
# (`months`, `day_of_months`, `hours`, `is_holiday`, `is_weekend`), which
# are expected to be defined earlier in the file.
features = pd.DataFrame(
    {
        'temp': raw['temp'],
        'rain_1h': raw['rain_1h'],
        'snow_1h': raw['snow_1h'],
        'clouds_all': raw['clouds_all'],
        'month': months,
        'day_of_month': day_of_months,
        'hour': hours,
        'is_holiday': is_holiday,
        'is_weekend': is_weekend,
    }
)

# Step 2: append the one-hot encoded tables side by side (column-wise).
features = pd.concat([features, days, dayparts, weathers], axis=1)

# Target column: the value the model is trained to predict.
target = raw['traffic_volume']
from sklearn import datasets, ensemble

# Define the model parameters.
# `loss` is intentionally left at the estimator default (least squares):
# the old spelling 'ls' was deprecated and later removed from scikit-learn
# in favour of 'squared_error', so omitting the key keeps the same
# objective on both old and new releases.
params = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.01,
}

# Instantiate and train the model.
# NOTE(review): X_train / y_train are not defined in the visible part of
# the file -- presumably produced by a train/test split of `features` and
# `target`; confirm.
gb_reg = ensemble.GradientBoostingRegressor(**params)
gb_reg.fit(X_train, y_train)
fig, ax = plt.subplots(figsize=(12, 6))

# x-axis values: timestamps (as strings) of the rows being plotted.
# NOTE(review): this assumes the test rows are the trailing len(X_test)
# rows of `raw`, i.e. an unshuffled chronological split -- confirm.
index_ordered = raw.date_time.astype('str').tolist()[-len(X_test):][-100:]
ax.set_xlabel('Date')
ax.set_ylabel('Traffic Volume')

# The actual values.
ax.plot(index_ordered, y_test[-100:].to_numpy(),
        color='k', ls='-', label='actual')

# Predictions of the model with engineered features.
ax.plot(index_ordered, gb_reg.predict(X_test)[-100:],
        color='b', ls='--', label='predicted; with date-time features')

# Predictions of the model without engineered features.
# NOTE(review): gb_reg_lite / X_test_lite are not defined in the visible
# part of the file -- expected to come from an earlier section.
ax.plot(index_ordered, gb_reg_lite.predict(X_test_lite)[-100:],
        color='r', ls='--', label='predicted; w/o date-time features')

# Keep only every 5th tick label visible so the string timestamps stay
# readable; the remaining labels are hidden, not removed.
every_nth = 5
for n, label in enumerate(ax.xaxis.get_ticklabels()):
    if n % every_nth != 0:
        label.set_visible(False)

ax.tick_params(axis='x', labelrotation=90)
plt.legend()
plt.title('Actual vs predicted on the last 100 data points')
plt.draw()