To better understand classification of structured data, I was looking for a rather simple Kaggle Competition and choose Forest Cover Type.
This is a classification problem and therefore different from the regression problem shown in class. As described within the forum, classification for structured data doesn’t appear to work out-of-the-box
like regression does. I have read through the fourms, have tried the various suggestions including: changing the metric function and switching the final layer to softmax. Unfortunately, nothing has worked.
Hopefully the many gurus here can help walk me through what I need to in order to get this working and provide a roadmap for others.
Attached are my attempts to solve the problem. The only real feature engineering I have done has been to convert the One-Hot-Encoded features across multiple columns to single category features.
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
PATH = 'data/forest-cover-type/'
f = pd.read_csv(f'{PATH}train.csv')
Convert One-Hot-Encodings for ‘Soil Type’ and ‘Wilderness Area’ each to a single category column.
hot = f.columns[f.columns.str.startswith('Soil_Type')].tolist()
f['Soil_Type'] = (np.arange(1,len(hot) + 1) * f[hot]).sum(axis=1).astype('category').cat.as_ordered()
f = f.drop(columns=hot)
hot = f.columns[f.columns.str.startswith('Wilderness_Area')].tolist()
f['Wilderness_Area'] = (np.arange(1,len(hot) + 1) * f[hot]).sum(axis=1).astype('category').cat.as_ordered()
f = f.drop(columns=hot)
f['Cover_Type'] = f['Cover_Type'].astype('category')
f = f.set_index('Id')
cats = ['Soil_Type','Wilderness_Area']
conts = list(set(f.columns) - set(cats))
cat_sz = [(c, len(f[c].cat.categories)+1) for c in cats]
cat_sz
[('Soil_Type', 39), ('Wilderness_Area', 5)]
emb_szs = [(c, min(50, (c+1)//2)) for (_,c) in cat_sz]
emb_szs
[(39, 20), (5, 3)]
x,y,nas,mapper = proc_df(f ,y_fld='Cover_Type', do_scale=True)
y
array([4, 4, 1, 1, 4, 1, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, ..., 2, 5, 2, 5, 2, 5, 2, 2, 2, 5, 5, 5,
5, 2, 2, 2, 2, 2, 2, 2], dtype=int8)
x.head()
Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | Soil_Type | Wilderness_Area | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | ||||||||||||
1 | -0.367095 | -0.959980 | -1.597132 | 0.146639 | -0.834074 | -0.908681 | 0.271454 | 0.571653 | 0.281259 | 4.334805 | 27 | 1 |
2 | -0.381461 | -0.914559 | -1.715424 | -0.072337 | -0.932054 | -0.999246 | 0.238732 | 0.703225 | 0.346627 | 4.285710 | 27 | 1 |
3 | 0.130912 | -0.160577 | -0.887379 | 0.194243 | 0.227369 | 1.106379 | 0.696843 | 0.834797 | -0.002005 | 4.191156 | 11 | 1 |
4 | 0.085421 | -0.015231 | 0.177250 | 0.070474 | 1.092853 | 1.038455 | 0.827731 | 0.834797 | -0.285268 | 4.272981 | 28 | 1 |
5 | -0.369489 | -1.014485 | -1.715424 | -0.353198 | -0.850404 | -0.998491 | 0.238732 | 0.659368 | 0.324838 | 4.237524 | 27 | 1 |
x.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15120 entries, 1 to 15120
Data columns (total 12 columns):
Elevation 15120 non-null float64
Aspect 15120 non-null float64
Slope 15120 non-null float64
Horizontal_Distance_To_Hydrology 15120 non-null float64
Vertical_Distance_To_Hydrology 15120 non-null float64
Horizontal_Distance_To_Roadways 15120 non-null float64
Hillshade_9am 15120 non-null float64
Hillshade_Noon 15120 non-null float64
Hillshade_3pm 15120 non-null float64
Horizontal_Distance_To_Fire_Points 15120 non-null float64
Soil_Type 15120 non-null int8
Wilderness_Area 15120 non-null int8
dtypes: float64(10), int8(2)
memory usage: 1.3 MB
cats
['Soil_Type', 'Wilderness_Area']
conts
['Hillshade_3pm',
'Horizontal_Distance_To_Fire_Points',
'Horizontal_Distance_To_Roadways',
'Hillshade_9am',
'Aspect',
'Vertical_Distance_To_Hydrology',
'Horizontal_Distance_To_Hydrology',
'Cover_Type',
'Elevation',
'Hillshade_Noon',
'Slope']
val_idxs = get_cv_idxs(len(x),val_pct=.2)
As a regression problem (is_reg=True) this works as expected.
data = ColumnarModelData.from_data_frame(PATH, val_idxs, x, y.astype(np.float64), cat_flds=cats,
bs=128, test_df=None, is_reg=True)
learn = data.get_learner(emb_szs, len(x.columns)-len(cats),
0.04, 1, [1000,500], [0.001,0.01])
learn.lr_find()
HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))
56%|█████▌ | 53/95 [00:01<00:00, 51.84it/s, loss=5.47]
As a classification problem (is_reg=False) this fails.
data = ColumnarModelData.from_data_frame(PATH, val_idxs, x, y.astype('int'), cat_flds=cats,
bs=128, test_df=None, is_reg=False)
learn = data.get_learner(emb_szs, len(x.columns)-len(cats),
0.04, 1, [1000,500], [0.001,0.01])
learn.lr_find()
HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))
0%| | 0/95 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-24-d81c6bd29d71> in <module>()
----> 1 learn.lr_find()
~/projects/forest-cover-type/fastai/learner.py in lr_find(self, start_lr, end_lr, wds, linear, **kwargs)
328 layer_opt = self.get_layer_opt(start_lr, wds)
329 self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), end_lr, linear=linear)
--> 330 self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs)
331 self.load('tmp')
332
~/projects/forest-cover-type/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
232 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
233 swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234 swa_eval_freq=swa_eval_freq, **kwargs)
235
236 def get_layer_groups(self): return self.models.get_layer_groups()
~/projects/forest-cover-type/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
138 batch_num += 1
139 for cb in callbacks: cb.on_batch_begin()
--> 140 loss = model_stepper.step(V(x),V(y), epoch)
141 avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
142 debias_loss = avg_loss / (1 - avg_mom**batch_num)
~/projects/forest-cover-type/fastai/model.py in step(self, xs, y, epoch)
55 if self.loss_scale != 1: assert(self.fp16); loss = loss*self.loss_scale
56 if self.reg_fn: loss = self.reg_fn(output, xtra, raw_loss)
---> 57 loss.backward()
58 if self.fp16: update_fp32_grads(self.fp32_params, self.m)
59 if self.loss_scale != 1:
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
165 Variable.
166 """
--> 167 torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
168
169 def register_hook(self, hook):
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
97
98 Variable._execution_engine.run_backward(
---> 99 variables, grad_variables, retain_graph)
100
101
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/function.py in apply(self, *args)
89
90 def apply(self, *args):
---> 91 return self._forward_cls.backward(self, *args)
92
93
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/_functions/dropout.py in backward(ctx, grad_output)
46 def backward(ctx, grad_output):
47 if ctx.p > 0 and ctx.train:
---> 48 return grad_output.mul(Variable(ctx.noise)), None, None, None
49 else:
50 return grad_output, None, None, None
RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generated/../generic/THCTensorMathPointwise.cu:367