The issue: I get a key error when I try to use the following command:
# Hold out 10% of a copy of trn_df as the validation split (fixed seed for reproducibility).
X_trn, X_val = train_test_split(trn_df.copy(), test_size=0.1, random_state=42)
# Transforms handed to the data bunch.
tfms = [FillMissing, Categorify]
# The only column declared as categorical.
cat_names = ['Product_Info_2']
# Target column; tst_df does not contain it, which is what triggers the KeyError below.
dep_var = 'Response'
data = TabularDataBunch.from_df(PATH, X_trn, X_val, dep_var, test_df=tst_df, tfms=tfms, cat_names=cat_names)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3077 try:
-> 3078 return self._engine.get_loc(key)
3079 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Response'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-9-e4a38e29d73a> in <module>()
7 dep_var = 'Response'
8
----> 9 data = TabularDataBunch.from_df(PATH, X_trn, X_val, dep_var, test_df=tst_df, tfms=tfms, cat_names=cat_names)
~/fastai_v1/fastai/kaggle/Competitions/fastai/tabular/data.py in from_df(cls, path, train_df, valid_df, dep_var, test_df, tfms, cat_names, cont_names, stats, log_output, **kwargs)
83 if test_df is not None:
84 datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms, train_ds.cat_names,
---> 85 train_ds.cont_names, train_ds.stats, log_output))
86 return cls.create(*datasets, path=path, **kwargs)
87
~/fastai_v1/fastai/kaggle/Competitions/fastai/tabular/data.py in from_dataframe(cls, df, dep_var, tfms, cat_names, cont_names, stats, log_output)
64 tfms[i] = tfm
65 cat_names, cont_names = tfm.cat_names, tfm.cont_names
---> 66 ds = cls(df, dep_var, cat_names, cont_names, stats, log_output)
67 ds.tfms,ds.cat_names,ds.cont_names = tfms,cat_names,cont_names
68 return ds
~/fastai_v1/fastai/kaggle/Competitions/fastai/tabular/data.py in __init__(self, df, dep_var, cat_names, cont_names, stats, log_output)
22 def __init__(self, df:DataFrame, dep_var:str, cat_names:OptStrList=None, cont_names:OptStrList=None,
23 stats:OptStats=None, log_output:bool=False):
---> 24 if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes.astype(np.int64)
25 self.y = np2model_tensor(df[dep_var].values)
26 if log_output: self.y = torch.log(self.y.float())
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Response'
I found out that the issue is that my test dataframe does not have the dependent variable column. The easiest way to fix this is to add that column, so in my case I just added the following:
tst_df['Response'] = 1
This adds a Response column to the test dataframe and labels it as 1 for all of them.
Hope this helps somebody else that runs into this issue!
This could potentially be fixed on the dev side, but I’m not sure whether that is desired. In most cases your test set won’t have actual target values (if it did, it would be a validation set), so I think it could be fixed by changing the TabularDataBunch.from_df classmethod as follows:
@classmethod
def from_df(cls, path, train_df:DataFrame, valid_df:DataFrame, dep_var:str, test_df:OptDataFrame=None,
            tfms:OptTabTfms=None, cat_names:OptStrList=None, cont_names:OptStrList=None,
            stats:OptStats=None, log_output:bool=False, **kwargs)->DataBunch:
    """Create a `DataBunch` from train/valid/test dataframes.

    `cat_names` defaults to an empty list; `cont_names` defaults to every column of
    `train_df` that is neither in `cat_names` nor `dep_var`. The validation and test
    datasets are built with the `tfms`, `cat_names`, `cont_names` and `stats` taken
    from the training dataset.

    If `test_df` is given without a `dep_var` column, a placeholder column is added to
    it (in place) holding the most common value of `train_df[dep_var]`, so that building
    the test dataset does not fail with a `KeyError`.
    """
    cat_names = ifnone(cat_names, [])
    cont_names = ifnone(cont_names, list(set(train_df)-set(cat_names)-{dep_var}))
    train_ds = TabularDataset.from_dataframe(train_df, dep_var, tfms, cat_names, cont_names, stats, log_output)
    valid_ds = TabularDataset.from_dataframe(valid_df, dep_var, train_ds.tfms, train_ds.cat_names,
                                             train_ds.cont_names, train_ds.stats, log_output)
    datasets = [train_ds, valid_ds]
    if test_df is not None:
        # Test sets normally carry no labels. Fill in a dummy target so the dataset
        # constructor can index `dep_var`.
        if dep_var not in test_df.columns:
            # `Series.mode()` returns a Series (several entries on ties) and `.values` is
            # an attribute, not a method, so take the first modal value as a scalar and
            # let pandas broadcast it over every row.
            # NOTE(review): assumes train_df[dep_var] has at least one non-null value;
            # otherwise `mode()` is empty and `.iloc[0]` raises IndexError.
            test_df[dep_var] = train_df[dep_var].mode().iloc[0]
        datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms, train_ds.cat_names,
                                                      train_ds.cont_names, train_ds.stats, log_output))
    return cls.create(*datasets, path=path, **kwargs)
This is what I added (couldn’t figure out how to highlight it)
# If the dependent variable column doesn't exist in the test dataframe,
# add a column for it and fill it with whatever the most common value is.
if dep_var not in test_df.columns:
    # `mode()` returns a Series and `.values` is an attribute (not callable),
    # so pick the first modal value as a scalar to broadcast over all rows.
    test_df[dep_var] = train_df[dep_var].mode().iloc[0]