Tabular Data Cannot Predict Test Set

I was trying out the Tabular Data module to solve a problem at my workplace. The idea of using embeddings for categorical variables is brilliant :slight_smile:.

I trained for one cycle and have got very good results on the validation set so far.

However, I am unable to get predictions on the test set. I ran the following command:

learner.predict(learner.data.test_ds)

I got the following KeyError:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

KeyError: 'AWBWeight'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-77-46ef6cd0da53> in <module>()
----> 1 learner.predict(learner.data.test_ds)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/basic_train.py in predict(self, item, **kwargs)
    357     def predict(self, item:ItemBase, **kwargs):
    358         "Return predicted class, label and probabilities for `item`."
--> 359         batch = self.data.one_item(item)
    360         res = self.pred_batch(batch=batch)
    361         pred,x = res[0],batch[0]

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/basic_data.py in one_item(self, item, detach, denorm, cpu)
    178         "Get `item` into a batch. Optionally `detach` and `denorm`."
    179         ds = self.single_ds
--> 180         with ds.set_item(item):
    181             return self.one_batch(ds_type=DatasetType.Single, detach=detach, denorm=denorm, cpu=cpu)
    182 

~/anaconda3/envs/pytorch_p36/lib/python3.6/contextlib.py in __enter__(self)
     79     def __enter__(self):
     80         try:
---> 81             return next(self.gen)
     82         except StopIteration:
     83             raise RuntimeError("generator didn't yield") from None

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/data_block.py in set_item(self, item)
    593     def set_item(self,item):
    594         "For inference, will briefly replace the dataset with one that only contains `item`."
--> 595         self.item = self.x.process_one(item)
    596         yield None
    597         self.item = None

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/data_block.py in process_one(self, item, processor)
     80         if processor is not None: self.processor = processor
     81         self.processor = listify(self.processor)
---> 82         for p in self.processor: item = p.process_one(item)
     83         return item
     84 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/tabular/data.py in process_one(self, item)
     44     def process_one(self, item):
     45         df = pd.DataFrame([item,item])
---> 46         for proc in self.procs: proc(df, test=True)
     47         if len(self.cat_names) != 0:
     48             codes = np.stack([c.cat.codes.values for n,c in df[self.cat_names].items()], 1).astype(np.int64) + 1

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/tabular/transform.py in __call__(self, df, test)
    122         "Apply the correct function to `df` depending on `test`."
    123         func = self.apply_test if test else self.apply_train
--> 124         func(df)
    125 
    126     def apply_train(self, df:DataFrame):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/tabular/transform.py in apply_test(self, df)
    175                     if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')
    176                 df.loc[:,name] = df.loc[:,name].fillna(self.na_dict[name])
--> 177             elif pd.isnull(df[name]).sum() != 0:
    178                 raise Exception(f"""There are nan values in field {name} but there were none in the training set. 
    179                 Please fix those manually.""")

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

KeyError: 'AWBWeight'

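This post was super useful, especially the second post, which illustrates how to use get_preds to predict on a whole dataset (predict, as the traceback shows, expects a single item rather than a dataset).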