String category not treated as category


(Anders) #1

Edit: I made a stupid typo and used the wrong dataframe. The library is working as it should.

I have a column of strings and use train_cats(df) to convert it to categories. I don’t understand why I have to do this:

df.garageCode = df.garageCode.cat.codes

Otherwise the code fails to create the model data:

md = ColumnarModelData.from_data_frame(PATH, val_idx, df,  yl.astype(np.float32), cat_flds=cat_vars, bs=128)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-62-07ea0cb85ec0> in <module>()
----> 1 md = ColumnarModelData.from_data_frame(PATH, val_idx, df,  yl.astype(np.float32), cat_flds=cat_vars, bs=128)

~/fastai/courses/ml1/fastai/column_data.py in from_data_frame(cls, path, val_idxs, df, y, cat_flds, bs, is_reg, is_multi, test_df)
     71     def from_data_frame(cls, path, val_idxs, df, y, cat_flds, bs, is_reg=True, is_multi=False, test_df=None):
     72         ((val_df, trn_df), (val_y, trn_y)) = split_by_idx(val_idxs, df, y)
---> 73         return cls.from_data_frames(path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, is_multi, test_df=test_df)
     74 
     75     def get_learner(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,

~/fastai/courses/ml1/fastai/column_data.py in from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, is_multi, test_df)
     65     def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, is_multi, test_df=None):
     66         test_ds = ColumnarDataset.from_data_frame(test_df, cat_flds, None, is_reg, is_multi) if test_df is not None else None
---> 67         return cls(path, ColumnarDataset.from_data_frame(trn_df, cat_flds, trn_y, is_reg, is_multi),
     68                     ColumnarDataset.from_data_frame(val_df, cat_flds, val_y, is_reg, is_multi), bs, test_ds=test_ds)
     69 

~/fastai/courses/ml1/fastai/column_data.py in from_data_frame(cls, df, cat_flds, y, is_reg, is_multi)
     45     @classmethod
     46     def from_data_frame(cls, df, cat_flds, y=None, is_reg=True, is_multi=False):
---> 47         return cls.from_data_frames(df[cat_flds], df.drop(cat_flds, axis=1), y, is_reg, is_multi)
     48 
     49 

~/fastai/courses/ml1/fastai/column_data.py in from_data_frames(cls, df_cat, df_cont, y, is_reg, is_multi)
     41         cat_cols = [c.values for n,c in df_cat.items()]
     42         cont_cols = [c.values for n,c in df_cont.items()]
---> 43         return cls(cat_cols, cont_cols, y, is_reg, is_multi)
     44 
     45     @classmethod

~/fastai/courses/ml1/fastai/column_data.py in __init__(self, cats, conts, y, is_reg, is_multi)
     24     def __init__(self, cats, conts, y, is_reg, is_multi):
     25         n = len(cats[0]) if cats else len(conts[0])
---> 26         self.cats = np.stack(cats, 1).astype(np.int64) if cats else np.zeros((n,1))
     27         self.conts = np.stack(conts, 1).astype(np.float32) if conts else np.zeros((n,1))
     28         self.y = np.zeros((n,1)) if y is None else y

ValueError: invalid literal for int() with base 10: 'NORREPORT'

(Sam Lloyd) #2

Have you applied proc_df to it? You have to do that to convert the categories to integer indices.


(Anders) #3

I went through the code again and it is working correctly. I did apply proc_df. I think I made a typo and swapped the names of two different dataframes.

My code, in short:

#import data
train_cats(dfCats) #string variables to category
#do feature engineering
add_datepart(dfCats, 'date')
#set correct data types. v iterates over contin_vars, a list of column names of continuous variables
for v in contin_vars: df[v] = df[v].astype('float32')
#again, but v iterates over cat_vars, a list of column names of categorical variables
for v in cat_vars: df[v] = df[v].astype('category').cat.as_ordered()
dfX, y, nas, mapper = proc_df(df, 'vehicleCount', do_scale=True)