Structured Data Classification of a Kaggle Competition:Forest Cover Type

To better understand classification of structured data, I was looking for a rather simple Kaggle Competition and choose Forest Cover Type.

This is a classification problem and therefore different from the regression problem shown in class. As described within the forum, classification for structured data doesn’t appear to work out-of-the-box like regression does. I have read through the fourms, have tried the various suggestions including: changing the metric function and switching the final layer to softmax. Unfortunately, nothing has worked.

Hopefully the many gurus here can help walk me through what I need to in order to get this working and provide a roadmap for others.

Attached are my attempts to solve the problem. The only real feature engineering I have done has been to convert the One-Hot-Encoded features across multiple columns to single category features.

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
PATH = 'data/forest-cover-type/'
f = pd.read_csv(f'{PATH}train.csv')

Convert One-Hot-Encodings for ‘Soil Type’ and ‘Wilderness Area’ each to a single category column.

hot = f.columns[f.columns.str.startswith('Soil_Type')].tolist()
f['Soil_Type'] = (np.arange(1,len(hot) + 1) * f[hot]).sum(axis=1).astype('category').cat.as_ordered()
f = f.drop(columns=hot)
hot = f.columns[f.columns.str.startswith('Wilderness_Area')].tolist()
f['Wilderness_Area'] = (np.arange(1,len(hot) + 1) * f[hot]).sum(axis=1).astype('category').cat.as_ordered()
f = f.drop(columns=hot)
f['Cover_Type'] = f['Cover_Type'].astype('category')
f = f.set_index('Id')
cats = ['Soil_Type','Wilderness_Area']
conts = list(set(f.columns) - set(cats))
cat_sz = [(c, len(f[c].cat.categories)+1) for c in cats]
cat_sz
[('Soil_Type', 39), ('Wilderness_Area', 5)]
emb_szs = [(c, min(50, (c+1)//2)) for (_,c) in cat_sz]
emb_szs
[(39, 20), (5, 3)]
x,y,nas,mapper = proc_df(f ,y_fld='Cover_Type', do_scale=True)
y
array([4, 4, 1, 1, 4, 1, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, ..., 2, 5, 2, 5, 2, 5, 2, 2, 2, 5, 5, 5,
       5, 2, 2, 2, 2, 2, 2, 2], dtype=int8)
x.head()
Elevation Aspect Slope Horizontal_Distance_To_Hydrology Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways Hillshade_9am Hillshade_Noon Hillshade_3pm Horizontal_Distance_To_Fire_Points Soil_Type Wilderness_Area
Id
1 -0.367095 -0.959980 -1.597132 0.146639 -0.834074 -0.908681 0.271454 0.571653 0.281259 4.334805 27 1
2 -0.381461 -0.914559 -1.715424 -0.072337 -0.932054 -0.999246 0.238732 0.703225 0.346627 4.285710 27 1
3 0.130912 -0.160577 -0.887379 0.194243 0.227369 1.106379 0.696843 0.834797 -0.002005 4.191156 11 1
4 0.085421 -0.015231 0.177250 0.070474 1.092853 1.038455 0.827731 0.834797 -0.285268 4.272981 28 1
5 -0.369489 -1.014485 -1.715424 -0.353198 -0.850404 -0.998491 0.238732 0.659368 0.324838 4.237524 27 1
x.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15120 entries, 1 to 15120
Data columns (total 12 columns):
Elevation                             15120 non-null float64
Aspect                                15120 non-null float64
Slope                                 15120 non-null float64
Horizontal_Distance_To_Hydrology      15120 non-null float64
Vertical_Distance_To_Hydrology        15120 non-null float64
Horizontal_Distance_To_Roadways       15120 non-null float64
Hillshade_9am                         15120 non-null float64
Hillshade_Noon                        15120 non-null float64
Hillshade_3pm                         15120 non-null float64
Horizontal_Distance_To_Fire_Points    15120 non-null float64
Soil_Type                             15120 non-null int8
Wilderness_Area                       15120 non-null int8
dtypes: float64(10), int8(2)
memory usage: 1.3 MB
cats
['Soil_Type', 'Wilderness_Area']
conts
['Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Aspect',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Hydrology',
 'Cover_Type',
 'Elevation',
 'Hillshade_Noon',
 'Slope']
val_idxs = get_cv_idxs(len(x),val_pct=.2)

As a regression problem (is_reg=True) this works as expected.

data = ColumnarModelData.from_data_frame(PATH, val_idxs, x, y.astype(np.float64), cat_flds=cats, 
                                         bs=128, test_df=None, is_reg=True)
learn = data.get_learner(emb_szs, len(x.columns)-len(cats),
                   0.04, 1, [1000,500], [0.001,0.01])
learn.lr_find()
HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))


 56%|█████▌    | 53/95 [00:01<00:00, 51.84it/s, loss=5.47]

As a classification problem (is_reg=False) this fails.

data = ColumnarModelData.from_data_frame(PATH, val_idxs, x, y.astype('int'), cat_flds=cats, 
                                         bs=128, test_df=None, is_reg=False)
learn = data.get_learner(emb_szs, len(x.columns)-len(cats),
                   0.04, 1, [1000,500], [0.001,0.01])
learn.lr_find()
HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))


  0%|          | 0/95 [00:00<?, ?it/s]



---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

<ipython-input-24-d81c6bd29d71> in <module>()
----> 1 learn.lr_find()


~/projects/forest-cover-type/fastai/learner.py in lr_find(self, start_lr, end_lr, wds, linear, **kwargs)
    328         layer_opt = self.get_layer_opt(start_lr, wds)
    329         self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), end_lr, linear=linear)
--> 330         self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs)
    331         self.load('tmp')
    332 


~/projects/forest-cover-type/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
    232             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
    233             swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234             swa_eval_freq=swa_eval_freq, **kwargs)
    235 
    236     def get_layer_groups(self): return self.models.get_layer_groups()


~/projects/forest-cover-type/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
    138             batch_num += 1
    139             for cb in callbacks: cb.on_batch_begin()
--> 140             loss = model_stepper.step(V(x),V(y), epoch)
    141             avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
    142             debias_loss = avg_loss / (1 - avg_mom**batch_num)


~/projects/forest-cover-type/fastai/model.py in step(self, xs, y, epoch)
     55         if self.loss_scale != 1: assert(self.fp16); loss = loss*self.loss_scale
     56         if self.reg_fn: loss = self.reg_fn(output, xtra, raw_loss)
---> 57         loss.backward()
     58         if self.fp16: update_fp32_grads(self.fp32_params, self.m)
     59         if self.loss_scale != 1:


~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
    165                 Variable.
    166         """
--> 167         torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
    168 
    169     def register_hook(self, hook):


~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
     97 
     98     Variable._execution_engine.run_backward(
---> 99         variables, grad_variables, retain_graph)
    100 
    101 


~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/function.py in apply(self, *args)
     89 
     90     def apply(self, *args):
---> 91         return self._forward_cls.backward(self, *args)
     92 
     93 


~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/_functions/dropout.py in backward(ctx, grad_output)
     46     def backward(ctx, grad_output):
     47         if ctx.p > 0 and ctx.train:
---> 48             return grad_output.mul(Variable(ctx.noise)), None, None, None
     49         else:
     50             return grad_output, None, None, None


RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generated/../generic/THCTensorMathPointwise.cu:367
2 Likes