%reload_ext autoreload
%autoreload 2
from fastai.tabular import *
To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run rossman_data_clean.ipynb. One important step for handling the time series is this:
add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)
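add_datepart expands the single Date column into a family of component features (Year, Month, Week, Day, Dayofweek, the Is_month_end-style flags, and an Elapsed Unix timestamp), which is where the extra date columns in the table below come from. A minimal pandas-only sketch of the idea (illustrative; the real fastai function generates more attributes than shown here):

import pandas as pd

# Illustrative sketch of the date expansion add_datepart performs
df = pd.DataFrame({'Date': pd.to_datetime(['2015-07-31'])})
df['Year']         = df.Date.dt.year
df['Month']        = df.Date.dt.month
df['Day']          = df.Date.dt.day
df['Dayofweek']    = df.Date.dt.dayofweek
df['Is_month_end'] = df.Date.dt.is_month_end
df['Elapsed']      = df.Date.astype('int64') // 10**9  # seconds since the epoch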
path = Config().data_path()/'rossmann'
train_df = pd.read_pickle(path/'train_clean')
train_df.head().T
| | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| index | 0 | 1 | 2 | 3 | 4 |
| Store | 1 | 2 | 3 | 4 | 5 |
| DayOfWeek | 5 | 5 | 5 | 5 | 5 |
| Date | 2015-07-31 | 2015-07-31 | 2015-07-31 | 2015-07-31 | 2015-07-31 |
| Sales | 5263 | 6064 | 8314 | 13995 | 4822 |
| Customers | 555 | 625 | 821 | 1498 | 559 |
| Open | 1 | 1 | 1 | 1 | 1 |
| Promo | 1 | 1 | 1 | 1 | 1 |
| StateHoliday | False | False | False | False | False |
| SchoolHoliday | 1 | 1 | 1 | 1 | 1 |
| Year | 2015 | 2015 | 2015 | 2015 | 2015 |
| Month | 7 | 7 | 7 | 7 | 7 |
| Week | 31 | 31 | 31 | 31 | 31 |
| Day | 31 | 31 | 31 | 31 | 31 |
| Dayofweek | 4 | 4 | 4 | 4 | 4 |
| Dayofyear | 212 | 212 | 212 | 212 | 212 |
| Is_month_end | True | True | True | True | True |
| Is_month_start | False | False | False | False | False |
| Is_quarter_end | False | False | False | False | False |
| Is_quarter_start | False | False | False | False | False |
| Is_year_end | False | False | False | False | False |
| Is_year_start | False | False | False | False | False |
| Elapsed | 1438300800 | 1438300800 | 1438300800 | 1438300800 | 1438300800 |
| StoreType | c | a | a | c | a |
| Assortment | a | a | a | c | a |
| CompetitionDistance | 1270 | 570 | 14130 | 620 | 29910 |
| CompetitionOpenSinceMonth | 9 | 11 | 12 | 9 | 4 |
| CompetitionOpenSinceYear | 2008 | 2007 | 2006 | 2009 | 2015 |
| Promo2 | 0 | 1 | 1 | 0 | 0 |
| Promo2SinceWeek | 1 | 13 | 14 | 1 | 1 |
| ... | ... | ... | ... | ... | ... |
| Min_Sea_Level_PressurehPa | 1015 | 1017 | 1017 | 1014 | 1016 |
| Max_VisibilityKm | 31 | 10 | 31 | 10 | 10 |
| Mean_VisibilityKm | 15 | 10 | 14 | 10 | 10 |
| Min_VisibilitykM | 10 | 10 | 10 | 10 | 10 |
| Max_Wind_SpeedKm_h | 24 | 14 | 14 | 23 | 14 |
| Mean_Wind_SpeedKm_h | 11 | 11 | 5 | 16 | 11 |
| Max_Gust_SpeedKm_h | NaN | NaN | NaN | NaN | NaN |
| Precipitationmm | 0 | 0 | 0 | 0 | 0 |
| CloudCover | 1 | 4 | 2 | 6 | 4 |
| Events | Fog | Fog | Fog | NaN | NaN |
| WindDirDegrees | 13 | 309 | 354 | 282 | 290 |
| StateName | Hessen | Thueringen | NordrheinWestfalen | Berlin | Sachsen |
| CompetitionOpenSince | 2008-09-15 | 2007-11-15 | 2006-12-15 | 2009-09-15 | 2015-04-15 |
| CompetitionDaysOpen | 2510 | 2815 | 3150 | 2145 | 107 |
| CompetitionMonthsOpen | 24 | 24 | 24 | 24 | 3 |
| Promo2Since | 1900-01-01 | 2010-03-29 | 2011-04-04 | 1900-01-01 | 1900-01-01 |
| Promo2Days | 0 | 1950 | 1579 | 0 | 0 |
| Promo2Weeks | 0 | 25 | 25 | 0 | 0 |
| AfterSchoolHoliday | 0 | 0 | 0 | 0 | 0 |
| BeforeSchoolHoliday | 0 | 0 | 0 | 0 | 0 |
| AfterStateHoliday | 57 | 67 | 57 | 67 | 57 |
| BeforeStateHoliday | 0 | 0 | 0 | 0 | 0 |
| AfterPromo | 0 | 0 | 0 | 0 | 0 |
| BeforePromo | 0 | 0 | 0 | 0 | 0 |
| SchoolHoliday_bw | 5 | 5 | 5 | 5 | 5 |
| StateHoliday_bw | 0 | 0 | 0 | 0 | 0 |
| Promo_bw | 5 | 5 | 5 | 5 | 5 |
| SchoolHoliday_fw | 7 | 1 | 5 | 1 | 1 |
| StateHoliday_fw | 0 | 0 | 0 | 0 | 0 |
| Promo_fw | 5 | 1 | 5 | 1 | 1 |
93 rows × 5 columns
n = len(train_df); n
844338
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]
small_train_df.head()
| | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales |
|---|---|---|---|---|---|---|
| 267 | 268 | 5 | NaN | 4520.0 | 67 | 7492 |
| 604 | 606 | 5 | NaN | 2260.0 | 61 | 7187 |
| 983 | 986 | 5 | Feb,May,Aug,Nov | 620.0 | 61 | 7051 |
| 1636 | 525 | 4 | NaN | 1870.0 | 55 | 9673 |
| 2348 | 123 | 3 | NaN | 16760.0 | 50 | 10007 |
small_test_df.head()
| | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales |
|---|---|---|---|---|---|---|
| 420510 | 829 | 3 | NaN | 110.0 | 55 | 6802 |
| 420654 | 973 | 3 | Jan,Apr,Jul,Oct | 330.0 | 59 | 6644 |
| 420990 | 194 | 2 | Feb,May,Aug,Nov | 16970.0 | 55 | 4720 |
| 421308 | 512 | 2 | Mar,Jun,Sept,Dec | 590.0 | 72 | 6248 |
| 421824 | 1029 | 2 | NaN | 1590.0 | 64 | 8004 |
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
small_test_df.head()
| | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales |
|---|---|---|---|---|---|---|
| 420510 | NaN | 3 | NaN | 110.0 | 55 | 6802 |
| 420654 | 973.0 | 3 | Jan,Apr,Jul,Oct | 330.0 | 59 | 6644 |
| 420990 | NaN | 2 | Feb,May,Aug,Nov | 16970.0 | 55 | 4720 |
| 421308 | 512.0 | 2 | Mar,Jun,Sept,Dec | 590.0 | 72 | 6248 |
| 421824 | 1029.0 | 2 | NaN | 1590.0 | 64 | 8004 |
small_train_df.PromoInterval.cat.categories
Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')
small_train_df['PromoInterval'].cat.codes[:5]
267    -1
604    -1
983     0
1636   -1
2348   -1
dtype: int8
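Categorify records the category levels it sees on the training set and, with test=True, reuses those same levels on the test set; any value unseen at training time (such as the Store ids above) becomes NaN rather than a new code. Internally each level maps to an integer code, with -1 reserved for missing values, as the codes above show; fastai later shifts all codes by +1 so that index 0 in each embedding is the "missing" slot. A rough pandas equivalent (a sketch, not the fastai source):

train_levels = small_train_df['PromoInterval'].cat.categories
# re-encode new data using the train-time levels; unseen values map to NaN (code -1)
pd.Categorical(small_test_df['PromoInterval'], categories=train_levels).codes[:5]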
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
small_train_df[small_train_df['CompetitionDistance_na'] == True]
| | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | CompetitionDistance_na |
|---|---|---|---|---|---|---|---|
| 185749 | 622 | 2 | NaN | 2300.0 | 93 | 4508 | True |
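FillMissing replaces missing continuous values with the training-set median by default and adds a boolean companion column (CompetitionDistance_na here) flagging which rows were filled, so the model still sees that the value was originally absent; with test=True the medians computed on the training set are reused. A plain-pandas sketch of the same transform, using a hypothetical DataFrame df with the same columns:

median = df['CompetitionDistance'].median()              # computed on train only
df['CompetitionDistance_na'] = df['CompetitionDistance'].isna()
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(median)
# the same train-derived median is then reused to fill the test set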
train_df = pd.read_pickle(path/'train_clean')
test_df = pd.read_pickle(path/'test_clean')
len(train_df),len(test_df)
(844338, 41088)
procs=[FillMissing, Categorify, Normalize]
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
'SchoolHoliday_fw', 'SchoolHoliday_bw']
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
dep_var = 'Sales'
df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()
test_df['Date'].min(), test_df['Date'].max()
('2015-08-01', '2015-09-17')
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
41395
valid_idx = range(cut)
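train_clean is sorted newest-first (the head() above starts at 2015-07-31, the last training date), so the row at position len(test_df) sits one test-set's worth of records back in time; cut extends that to the last row sharing the same date so whole days stay together, and range(cut) then takes the most recent rows as the validation set. This mirrors how Kaggle scores the competition: the test period (2015-08-01 to 2015-09-17) immediately follows the training data, so validating on the final weeks of training is the honest analogue. A quick sanity check (illustrative):

# the validation rows should span roughly the same number of days as the
# 48-day test period
valid_dates = train_df['Date'].iloc[:cut]
print(valid_dates.min(), valid_dates.max())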
df[dep_var].head()
0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int64
data = (TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
.split_by_idx(valid_idx)
.label_from_df(cols=dep_var, label_cls=FloatList, log=True)
.add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
.databunch())
doc(FloatList)
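label_cls=FloatList marks this as a regression problem, and log=True trains the model on log(Sales). That makes the MSE loss operate in log space, which is a close proxy for the competition's RMSPE metric: for small relative errors, log(ŷ) − log(y) ≈ (ŷ − y)/y. It is also why the metric used below is exp_rmspe, which exponentiates predictions back to the sales scale before computing RMSPE. A quick numeric check of the approximation (illustrative):

import numpy as np
y, y_hat = 100.0, 105.0  # a 5% over-prediction
print(np.log(y_hat) - np.log(y), (y_hat - y) / y)  # ~0.0488 vs 0.0500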
max_log_y = np.log(np.max(train_df['Sales'])*1.2)
y_range = torch.tensor([0, max_log_y], device=defaults.device)
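Inside the model, y_range is applied as a scaled sigmoid on the final activation, so predictions are bounded between 0 and max_log_y. Multiplying the maximum observed sales by 1.2 before taking the log leaves headroom: real targets then sit away from the sigmoid's flat upper end, where gradients vanish. A simplified sketch of the scaling (this mirrors the fastai trick):

import torch

def scale_to_range(x, y_min, y_max):
    # squash the raw activation into (y_min, y_max)
    return torch.sigmoid(x) * (y_max - y_min) + y_min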
learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04,
y_range=y_range, metrics=exp_rmspe)
learn.model
TabularModel(
(embeds): ModuleList(
(0): Embedding(1116, 81)
(1): Embedding(8, 5)
(2): Embedding(4, 3)
(3): Embedding(13, 7)
(4): Embedding(32, 11)
(5): Embedding(3, 3)
(6): Embedding(26, 10)
(7): Embedding(27, 10)
(8): Embedding(5, 4)
(9): Embedding(4, 3)
(10): Embedding(4, 3)
(11): Embedding(24, 9)
(12): Embedding(9, 5)
(13): Embedding(13, 7)
(14): Embedding(53, 15)
(15): Embedding(22, 9)
(16): Embedding(7, 5)
(17): Embedding(7, 5)
(18): Embedding(4, 3)
(19): Embedding(4, 3)
(20): Embedding(9, 5)
(21): Embedding(9, 5)
(22): Embedding(3, 3)
(23): Embedding(3, 3)
)
(emb_drop): Dropout(p=0.04)
(bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(layers): Sequential(
(0): Linear(in_features=233, out_features=1000, bias=True)
(1): ReLU(inplace)
(2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): Dropout(p=0.001)
(4): Linear(in_features=1000, out_features=500, bias=True)
(5): ReLU(inplace)
(6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): Dropout(p=0.01)
(8): Linear(in_features=500, out_features=1, bias=True)
)
)
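The embedding widths above follow fastai's default heuristic for categorical cardinality; the rule below (a sketch of that heuristic) reproduces every size in the printout. Note that each cardinality includes one extra slot for missing values, e.g. 1,116 = 1,115 stores + 1. The concatenated embedding widths (217) plus the 16 continuous features give the 233 inputs to the first linear layer.

def emb_sz_rule(n_cat: int) -> int:
    # default embedding width as a function of cardinality, capped at 600
    return min(600, round(1.6 * n_cat**0.56))

emb_sz_rule(1116)  # 81 -> matches Embedding(1116, 81) for Store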
len(data.train_ds.cont_names)
16
learn.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
learn.recorder.plot()
learn.fit_one_cycle(5, 1e-3, wd=0.2)
| epoch | train_loss | valid_loss | exp_rmspe |
|---|---|---|---|
| 1 | 0.023587 | 0.020941 | 0.140551 |
| 2 | 0.017678 | 0.023431 | 0.132211 |
| 3 | 0.017453 | 0.016929 | 0.120169 |
| 4 | 0.012608 | 0.016296 | 0.109245 |
| 5 | 0.010222 | 0.011238 | 0.105433 |
learn.save('1')
learn.recorder.plot_losses(last=-1)
learn.load('1');
learn.fit_one_cycle(5, 3e-4)
| epoch | train_loss | valid_loss | exp_rmspe |
|---|---|---|---|
| 1 | 0.012223 | 0.014312 | 0.116988 |
| 2 | 0.012001 | 0.017789 | 0.117619 |
| 3 | 0.011402 | 0.035596 | 0.114396 |
| 4 | 0.010067 | 0.015125 | 0.113652 |
| 5 | 0.009148 | 0.031326 | 0.116344 |
learn.fit_one_cycle(5, 3e-4)
| epoch | train_loss | valid_loss | exp_rmspe |
|---|---|---|---|
| 1 | 0.011840 | 0.013236 | 0.110483 |
| 2 | 0.010765 | 0.057664 | 0.129586 |
| 3 | 0.010101 | 0.042744 | 0.111584 |
| 4 | 0.008820 | 0.116893 | 0.135458 |
| 5 | 0.009144 | 0.017969 | 0.126323 |
(10th place in the competition was 0.108)
# Predictions come back in log space (we trained on log Sales), so invert with exp
test_preds = learn.get_preds(DatasetType.Test)
test_df["Sales"] = np.exp(test_preds[0].data).numpy().T[0]
# Kaggle expects integer Id and Sales columns
test_df[["Id", "Sales"]] = test_df[["Id", "Sales"]].astype("int")
test_df[["Id", "Sales"]].to_csv("rossmann_submission.csv", index=False)