Key Error When using TabularDataBunch.from_df() with a test set without the dependent variable

KevinB · October 20, 2018, 3:45am

The issue: I get a key error when I try to use the following command:

# Hold out 10% of the training rows as a validation split, with a fixed seed.
# (presumably sklearn's train_test_split -- the import is not shown; confirm)
X_trn, X_val = train_test_split(trn_df.copy(), test_size=0.1, random_state=42)

# Transforms handed to the data bunch via `tfms`.
tfms = [FillMissing, Categorify]

# Columns passed as categorical variables.
cat_names = ['Product_Info_2']

# Name of the dependent-variable (target) column.
dep_var = 'Response'

# This call raises KeyError: 'Response' because tst_df has no `dep_var` column.
data = TabularDataBunch.from_df(PATH, X_trn, X_val, dep_var, test_df=tst_df, tfms=tfms, cat_names=cat_names)

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Response'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-9-e4a38e29d73a> in <module>()
      7 dep_var = 'Response'
      8 
----> 9 data = TabularDataBunch.from_df(PATH, X_trn, X_val, dep_var, test_df=tst_df, tfms=tfms, cat_names=cat_names)

~/fastai_v1/fastai/kaggle/Competitions/fastai/tabular/data.py in from_df(cls, path, train_df, valid_df, dep_var, test_df, tfms, cat_names, cont_names, stats, log_output, **kwargs)
     83         if test_df is not None:
     84             datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms, train_ds.cat_names,
---> 85                                                       train_ds.cont_names, train_ds.stats, log_output))
     86         return cls.create(*datasets, path=path, **kwargs)
     87 

~/fastai_v1/fastai/kaggle/Competitions/fastai/tabular/data.py in from_dataframe(cls, df, dep_var, tfms, cat_names, cont_names, stats, log_output)
     64                 tfms[i] = tfm
     65                 cat_names, cont_names = tfm.cat_names, tfm.cont_names
---> 66         ds = cls(df, dep_var, cat_names, cont_names, stats, log_output)
     67         ds.tfms,ds.cat_names,ds.cont_names = tfms,cat_names,cont_names
     68         return ds

~/fastai_v1/fastai/kaggle/Competitions/fastai/tabular/data.py in __init__(self, df, dep_var, cat_names, cont_names, stats, log_output)
     22     def __init__(self, df:DataFrame, dep_var:str, cat_names:OptStrList=None, cont_names:OptStrList=None,
     23                  stats:OptStats=None, log_output:bool=False):
---> 24         if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes.astype(np.int64)
     25         self.y = np2model_tensor(df[dep_var].values)
     26         if log_output: self.y = torch.log(self.y.float())

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2487         res = cache.get(item)
   2488         if res is None:
-> 2489             values = self._data.get(item)
   2490             res = self._box_item_values(item, values)
   2491             cache[item] = res

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3078                 return self._engine.get_loc(key)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 
   3082         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Response'

I found out that the issue is that my test dataframe does not have the dependent variable column. The easiest workaround is to add that column with a placeholder value, so in my case I just added the following:

tst_df['Response'] = 1

This adds a Response column to the test dataframe and labels it as 1 for all of them.

Hope this helps somebody else that runs into this issue!

This could potentially be fixed on the library side, but I’m not sure whether that is desired. In most cases the test set won’t have actual target values (if it did, it would be a validation set), so I think it could be fixed by adding the following code to the TabularDataBunch.from_df class method:

@classmethod
def from_df(cls, path, train_df:DataFrame, valid_df:DataFrame, dep_var:str, test_df:OptDataFrame=None,
                    tfms:OptTabTfms=None, cat_names:OptStrList=None, cont_names:OptStrList=None,
                    stats:OptStats=None, log_output:bool=False, **kwargs)->DataBunch:
    """Create a `DataBunch` from train/valid/test dataframes.

    `cat_names` defaults to an empty list and `cont_names` defaults to every
    column of `train_df` that is neither categorical nor `dep_var`. The
    validation and test datasets reuse the transforms, column names and stats
    held by the training dataset.

    If `test_df` is given but has no `dep_var` column, a placeholder column
    filled with the most common training value of `dep_var` is added to a copy
    of it, so the caller's dataframe is left untouched.
    """
    cat_names = ifnone(cat_names, [])
    cont_names = ifnone(cont_names, list(set(train_df)-set(cat_names)-{dep_var}))
    train_ds = TabularDataset.from_dataframe(train_df, dep_var, tfms, cat_names, cont_names, stats, log_output)
    valid_ds = TabularDataset.from_dataframe(valid_df, dep_var, train_ds.tfms, train_ds.cat_names,
                                         train_ds.cont_names, train_ds.stats, log_output)
    datasets = [train_ds, valid_ds]
    if test_df is not None:
        # A test set normally has no labels; TabularDataset indexes df[dep_var],
        # so without this placeholder the construction raises KeyError.
        if dep_var not in test_df.columns:
            # Work on a copy so the caller's dataframe does not gain a column.
            test_df = test_df.copy()
            # `mode()` returns a Series (several entries on ties); take the first
            # entry as a scalar so it broadcasts to every row. The earlier
            # `.mode().values()` failed: `.values` is an array property, not a method.
            # NOTE(review): assumes a scalar placeholder is acceptable for a
            # non-numeric (categorical) dep_var too -- confirm against TabularDataset.
            test_df[dep_var] = train_df[dep_var].mode().iloc[0]
        datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms, train_ds.cat_names,
                                                  train_ds.cont_names, train_ds.stats, log_output))
    return cls.create(*datasets, path=path, **kwargs)

This is what I added (couldn’t figure out how to highlight it)

        # If the dependent variable column doesn't exist in the test dataframe,
        # add a placeholder column filled with the most common training value.
        if dep_var not in test_df.columns:
            # Copy first so the caller's dataframe is not modified.
            test_df = test_df.copy()
            # `.mode()` returns a Series; `.iloc[0]` picks one scalar that broadcasts
            # to every row (`.values` is a property, so `.values()` raised TypeError).
            test_df[dep_var] = train_df[dep_var].mode().iloc[0]

sgugger · October 20, 2018, 6:46pm

Seems most reasonable. I’ll add this fix today, thanks for pointing it out.