IndexError with tabular test set predictions

jvanelteren · June 7, 2020, 2:54pm

I want to get predictions from a tabular learner on the Kaggle housing dataset. However, when I run
dl = learn.dls.test_dl(test)
res= learn.get_preds(dl=dl)

I get an error

IndexError Traceback (most recent call last)
in
1 dl = learn.dls.test_dl(test)
----> 2 res= learn.get_preds(dl=dl)

C:\ProgramData\Miniconda3\lib\site-packages\fastai2\learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, inner, **kwargs)
217 for mgr in ctx_mgrs: stack.enter_context(mgr)
218 self(event.begin_epoch if inner else _before_epoch)
–> 219 self._do_epoch_validate(dl=dl)
220 self(event.after_epoch if inner else _after_epoch)
221 if act is None: act = getattr(self.loss_func, ‘activation’, noop)

C:\ProgramData\Miniconda3\lib\site-packages\fastai2\learner.py in _do_epoch_validate(self, ds_idx, dl)
173 dl,old,has = change_attrs(dl, names, [False,False])
174 self.dl = dl; self(‘begin_validate’)
–> 175 with torch.no_grad(): self.all_batches()
176 except CancelValidException: self(‘after_cancel_validate’)
177 finally:

C:\ProgramData\Miniconda3\lib\site-packages\fastai2\learner.py in all_batches(self)
141 def all_batches(self):
142 self.n_iter = len(self.dl)
–> 143 for o in enumerate(self.dl): self.one_batch(*o)
144
145 def one_batch(self, i, b):

C:\ProgramData\Miniconda3\lib\site-packages\fastai2\learner.py in one_batch(self, i, b)
147 try:
148 self._split(b); self(‘begin_batch’)
–> 149 self.pred = self.model(*self.xb); self(‘after_pred’)
150 if len(self.yb) == 0: return
151 self.loss = self.loss_func(self.pred, *self.yb); self(‘after_loss’)

C:\ProgramData\Miniconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
–> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)

C:\ProgramData\Miniconda3\lib\site-packages\fastai2\tabular\model.py in forward(self, x_cat, x_cont)
46 def forward(self, x_cat, x_cont=None):
47 if self.n_emb != 0:
—> 48 x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
49 x = torch.cat(x, 1)
50 x = self.emb_drop(x)

C:\ProgramData\Miniconda3\lib\site-packages\fastai2\tabular\model.py in <listcomp>(.0)
46 def forward(self, x_cat, x_cont=None):
47 if self.n_emb != 0:
—> 48 x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
49 x = torch.cat(x, 1)
50 x = self.emb_drop(x)

C:\ProgramData\Miniconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
–> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)

C:\ProgramData\Miniconda3\lib\site-packages\torch\nn\modules\sparse.py in forward(self, input)
112 return F.embedding(
113 input, self.weight, self.padding_idx, self.max_norm,
–> 114 self.norm_type, self.scale_grad_by_freq, self.sparse)
115
116 def extra_repr(self):

C:\ProgramData\Miniconda3\lib\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1722 # remove once script supports set_grad_enabled
1723 no_grad_embedding_renorm(weight, input, max_norm, norm_type)
-> 1724 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1725
1726

IndexError: index out of range in self

The error is due to some categorical feature ‘MiscFeature’, which has the value ‘Shed’. This value also occurs in the training set.

df[‘MiscFeature’].value_counts(dropna=False)

NaN 1373
Shed 40
Othr 1
Name: MiscFeature, dtype: int64

Setting all the values of this feature to np.nan works:

test[‘MiscFeature’]=np.nan
dl = learn.dls.test_dl(test)
res= learn.get_preds(dl=dl)

But I don’t understand why the error was generated in the first place. The df has other categorical columns as well, and it doesn’t trip on them.

Tendo · June 7, 2020, 7:58pm

Hello @jvanelteren, did you drop the MiscFeature column in the training set? If you did, using it in the test_df will throw errors.

jvanelteren · June 7, 2020, 8:13pm

No I didn’t. To fix the above error I just set the values of MiscFeature to nan.

I did also experiment with dropping the feature on both train and test set, that worked as well.

I just don’t understand why the column is giving problems. Maybe the large number of NaN values in the train set has something to do with it.