Hello. I have a problem with implementing the approach, which Jeremy used for the 3-rd lesson, for another competition: https://www.kaggle.com/c/competitive-data-science-predict-future-sales
I made a feature engineering and data preprocessing. Everything seems fine. The loss is falling, and the validation is good. But the prediction is terrible (like 100 times more terrible than the validation)
Here is the code:
from fastai.structured import *
from fastai.column_data import *
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import numpy as np
import datetime
from dateutil import relativedelta
# reading all the data
PATH = 'data/avito/'
def join_df(left, right, left_on, right_on=None, suffix='_y'):
if right_on is None: right_on = left_on
return left.merge(right, how='left', left_on=left_on, right_on=right_on, suffixes=("", suffix))
def rmse(y_pred, targ):
return sqrt(mean_squared_error(targ, y_pred))
cat_vars = ["shop_id", "item_id", "item_category_id"]
dep = "item_cnt_day"
table_names = ['train', 'items', 'item_categories', 'shops', 'test']
tables = [pd.read_csv(f'{PATH}{fname}.csv', low_memory=False) for fname in table_names]
train, items, item_categories, shops, test_table = tables
# no need a date. all dates will be grouped by its month
train = train.drop('date', axis=1)
# concatenating the tables
sales_table = join_df(train, items, "item_id", "item_id")
sales_table = join_df(sales_table, test_table, ["item_id", "shop_id"], ["item_id", "shop_id"])
# preprocessing sales
sales_table[dep] = sales_table[dep].astype(np.float32)
sales_table[dep] = sales_table[dep].fillna(0).astype('float32')
sales_table[dep][sales_table[dep] < 0] = 0
sales_table[dep][sales_table[dep] > 20] = 20
sales_table['item_price'][sales_table['item_price'] < 0] = 0
df = sales_table[cat_vars + [dep, "ID", "date_block_num", 'item_price']]
# dropping unnecessary data (we don't need data without ID)
df[dep] = df[dep].fillna(0).astype('float32')
df["ID"] = df["ID"].fillna(-1)
df = df[df.ID != -1]
df = df.reset_index(drop=True)
# putting prices into bins. these bins will be used in train and interpolated into test data
price_bins = [0, 100, 300, 500, 1000, 1500, 2000, 2500, 3000, 4000, 5000, 100000]
price_labels = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4', 'C1', 'C2', 'D']
train_df = df.groupby(['date_block_num','ID']+cat_vars, as_index=False).agg({'item_cnt_day':'sum', 'item_price':'mean'})
train_df['price_bin'] = pd.cut(train_df['item_price'], price_bins, labels=price_labels)
temp_price_id_buckets_df = train_df[['ID', 'price_bin']].drop_duplicates().reset_index(drop=True)
price_id_buckets_df = temp_price_id_buckets_df.groupby('ID').nth(0).reset_index()
# grouping train data into months (because we need the forecast for a month). each bin is the first date of each month
train_df['tdate'] = datetime.date(2013, 1, 1)
train_df = train_df.reset_index(drop=True)
for i_ in range(0, 36):
target_date = datetime.date(2013, 1, 1) + relativedelta.relativedelta(months=i_)
train_df['tdate'][train_df['date_block_num'] == i_] = target_date
train_df["tdate"] = pd.to_datetime(train_df.tdate)
train_df = train_df.drop('date_block_num', axis=1)
add_datepart(train_df, "tdate", drop=True)
train_df = train_df.reset_index(drop=True)
cat_vars = ["shop_id", "item_id", "item_category_id", "tIs_quarter_end", "tIs_quarter_start", "tIs_year_end", "tIs_year_start", "tMonth", "tYear", 'price_bin']
for v in cat_vars:
train_df[v] = train_df[v].astype('category').cat.as_ordered()
print("tables processed")
# constructing test table
test_df = join_df(test_table, items, "item_id", "item_id")
test_df = test_df.drop('item_name', axis=1)
test_df = test_df.reset_index(drop=True)
test_df = join_df(test_df, price_id_buckets_df, "ID", "ID")
test_df['price_bin'] = test_df['price_bin'].fillna('A1')
test_df[dep] = 0
test_df["tdate"] = datetime.date(2015, 11, 1)
add_datepart(test_df, "tdate", drop=True)
for v in cat_vars:
test_df[v] = test_df[v].astype('category').cat.as_ordered()
test_df[dep] = test_df[dep].astype(np.float32)
train_df[dep] = train_df[dep].astype(np.float32)
# preparing data for processing and forecasting
train_df.reset_index(inplace=True)
df_train, y_tr, nas, mapper = proc_df(train_df, dep, do_scale=True, ignore_flds='ID')
df_train = df_train.drop('index', axis=1)
test_df.reset_index(inplace=True)
for v in cat_vars:
df_train[v] = df_train[v].astype('category').cat.as_ordered()
df_test, y_ts, nas, mapper = proc_df(test_df, dep, do_scale=True, ignore_flds='ID')
df_test = df_test.drop('index', axis=1)
for v in cat_vars:
df_test[v] = df_test[v].astype('category').cat.as_ordered()
# processing and forecasting
samp_size = len(train_df)
train_ratio = 0.75
train_size = int(samp_size * train_ratio)
val_idx = list(range(train_size, len(df_train)))
yl = np.log(y_tr)
yl[yl == -np.inf] = 0
y_range = (0, np.log(20))
md = ColumnarModelData.from_data_frame(PATH, val_idx, df_train[cat_vars], yl.astype(np.float32), cat_flds=cat_vars, bs=128, test_df=df_test[cat_vars])
cat_sz = [(c, len(df_train[c].astype('category').cat.categories)+1) for c in cat_vars]
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
m = md.get_learner(emb_szs, 1, 0.04, 1, [1000, 500], [0.001, 0.01], y_range=y_range)
# m.lr_find()
#
# n_skip = 800
# n_skip_end = 5
#
# plt.ylabel("loss")
# plt.xlabel("learning rate (log scale)")
# plt.plot(m.sched.lrs[n_skip:-(n_skip_end + 1)], m.sched.losses[n_skip:-(n_skip_end + 1)])
# plt.xscale('log')
# plt.legend()
# plt.show()
lr = 1e-3
m.fit(lr, 1, metrics=[rmse], cycle_len=1)
pred_test = m.predict(True)
pred_test_e = np.exp(pred_test)
df_test['item_cnt_month'] = pred_test_e
csv_fn = f'{PATH}sub1.csv'
df_test[['ID', 'item_cnt_month']].to_csv(csv_fn, index=False)
I understand that the problem can be in test data I am using for prediction. But I have tested the approach for ‘sliding back’ prediction frame with something like this (1 date_block_num = 1 month):
...
train_df_t = train_df[train_df['date_block_num'] != 32]
test_df = train_df[train_df['date_block_num'] == 32]
...
with the same (bad) results…
I’m stuck. Can you help me? What can be the problem here?