AttributeError: Can only use .cat accessor with a 'category' dtype

Does anyone know how to deal with the following error when using TabularList?

----------------------------------------------------------------------
AttributeError                       Traceback (most recent call last)
<ipython-input-8-ca8ea327f64f> in <module>
 19                            .split_by_idx(valid_indx)
 20 #                            .split_by_rand_pct(0.2)
---> 21                            .label_from_df(cols=dep_var)
 22                            .add_test(test,label=0)
 23                            .databunch())

~/anaconda3/envs/econda/lib/python3.7/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
475             self.valid = fv(*args, from_item_lists=True, **kwargs)
476             self.__class__ = LabelLists
--> 477             self.process()
478             return self
479         return _inner

~/anaconda3/envs/econda/lib/python3.7/site-packages/fastai/data_block.py in process(self)
529         "Process the inner datasets."
530         xp,yp = self.get_processors()
--> 531         for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
532         #progress_bar clear the outputs so in some case warnings issued during processing disappear.
533         for ds in self.lists:

~/anaconda3/envs/econda/lib/python3.7/site-packages/fastai/data_block.py in process(self, xp, yp, name)
709                     p.warns = []
710                 self.x,self.y = self.x[~filt],self.y[~filt]
--> 711         self.x.process(xp)
712         return self
713 

~/anaconda3/envs/econda/lib/python3.7/site-packages/fastai/data_block.py in process(self, processor)
 81         if processor is not None: self.processor = processor
 82         self.processor = listify(self.processor)
---> 83         for p in self.processor: p.process(self)
 84         return self
 85 

~/anaconda3/envs/econda/lib/python3.7/site-packages/fastai/tabular/data.py in process(self, ds)
 71         self.cat_names,self.cont_names = ds.cat_names,ds.cont_names
 72         if len(ds.cat_names) != 0:
---> 73             ds.codes = np.stack([c.cat.codes.values for n,c in ds.inner_df[ds.cat_names].items()], 1).astype(np.int64) + 1
 74             self.classes = ds.classes = OrderedDict({n:np.concatenate([['#na#'],c.cat.categories.values])
 75                                       for n,c in ds.inner_df[ds.cat_names].items()})

~/anaconda3/envs/econda/lib/python3.7/site-packages/fastai/tabular/data.py in <listcomp>(.0)
 71         self.cat_names,self.cont_names = ds.cat_names,ds.cont_names
 72         if len(ds.cat_names) != 0:
---> 73             ds.codes = np.stack([c.cat.codes.values for n,c in ds.inner_df[ds.cat_names].items()], 1).astype(np.int64) + 1
 74             self.classes = ds.classes = OrderedDict({n:np.concatenate([['#na#'],c.cat.categories.values])
 75                                       for n,c in ds.inner_df[ds.cat_names].items()})

~/anaconda3/envs/econda/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5174             or name in self._accessors
   5175         ):
-> 5176             return object.__getattribute__(self, name)
   5177         else:
   5178             if self._info_axis._can_hold_identifiers_and_holds_name(name):

~/anaconda3/envs/econda/lib/python3.7/site-packages/pandas/core/accessor.py in __get__(self, obj, cls)
173             # we're accessing the attribute of the class, i.e., Dataset.geo
174             return self._accessor
--> 175         accessor_obj = self._accessor(obj)
176         # Replace the property with the accessor object. Inspired by:
177         # http://www.pydanny.com/cached-property.html

~/anaconda3/envs/econda/lib/python3.7/site-packages/pandas/core/arrays/categorical.py in __init__(self, data)
   2591 
   2592     def __init__(self, data):
-> 2593         self._validate(data)
   2594         self._parent = data.values
   2595         self._index = data.index

~/anaconda3/envs/econda/lib/python3.7/site-packages/pandas/core/arrays/categorical.py in _validate(data)
   2601         if not is_categorical_dtype(data.dtype):
   2602             raise AttributeError(
-> 2603                 "Can only use .cat accessor with a " "'category' dtype"
   2604             )
   2605 

AttributeError: Can only use .cat accessor with a 'category' dtype

I tried running my code without procs, using my own custom function below:

def convert_types(df, cat_names, cont_names):
    """Cast the listed columns of ``df`` to the dtypes used for tabular models.

    Args:
        df: pandas DataFrame containing every column named in ``cat_names``
            and ``cont_names``.
        cat_names: iterable of column names to cast to ``'category'``.
        cont_names: iterable of column names to cast to ``'float32'``.

    Returns:
        The DataFrame returned by ``df.astype``; the caller's ``df`` is not
        modified in place.

    Note:
        A name that appears in both lists ends up as ``'float32'``, because
        the continuous entries are applied after the categorical ones.
    """
    dtypes = {cat: 'category' for cat in cat_names}
    # Applied second, so a continuous entry overwrites a categorical one.
    dtypes.update({cont: 'float32' for cont in cont_names})
    df = df.astype(dtypes)
    return df

But that didn’t seem to fix anything.

Here is some of my code:

def convert_types(df, cat_names, cont_names):
    """Return ``df`` with categorical and continuous columns cast for modelling.

    Columns named in ``cat_names`` become ``'category'`` and columns named in
    ``cont_names`` become ``'float32'``. The conversion goes through
    ``df.astype``, so the result is a new DataFrame and the argument itself is
    left as it was.
    """
    # The continuous mapping is merged last, so it wins for any column name
    # that was (accidentally) listed in both groups.
    target_dtypes = {
        **dict.fromkeys(cat_names, 'category'),
        **dict.fromkeys(cont_names, 'float32'),
    }
    return df.astype(target_dtypes)

# Cast the feature columns with convert_types. dep_var is appended to the
# continuous names, so the dependent variable is cast to float32 as well.
# NOTE(review): a column listed in both cat_names and cont_names ends up
# float32 (the continuous entry wins inside convert_types), which leaves a
# non-category column in cat_names.
# NOTE(review): df and test_df are converted independently, so their category
# sets are presumably inferred separately -- confirm this is intended.
df = convert_types(df, cat_names, cont_names+[dep_var])
test_df = convert_types(test_df, cat_names, cont_names+[dep_var])

# Fractions of the original dataframe reserved for the test and validation sets
test_pct = 0
valid_pct = 0.2

# Row cut points for separating the dataframe sets.
# With test_pct = 0, cut_test evaluates to 1, so the validation range below
# starts at row 1 rather than row 0.
cut_test = int(test_pct * len(df))+1
# Exclusive end of the validation range: valid_pct of the rows past cut_test.
cut_valid = int(valid_pct*len(df))+cut_test

# Test Dataframe: Use this if no separate test df provided
# test_df = df.iloc[cut_test:cut_valid].copy()

valid_indx = range(cut_test,cut_valid) # range of validation indices, used for fastai

# Initialize Test Data
test = TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names)

# Initialize All Data
# No procs are passed here; the columns were already cast by convert_types.
# Per the traceback, label_from_df is the step that triggers processing, and
# processing reads `.cat.codes` from every column in cat_names, so each of
# those columns must already have 'category' dtype.
data = (TabularList.from_df(df=df, path=path, cat_names=cat_names, cont_names=cont_names)
                           .split_by_idx(valid_indx)
#                            .split_by_rand_pct(0.2)
                           .label_from_df(cols=dep_var)
                           .add_test(test,label=0)
                           .databunch())
1 Like

It says your cat variables are not of category dtype yet. Have you tried checking the variables after running convert_types? For example, df['workclass'].cat.categories prints out the categories of the workclass column.

2 Likes

Thank you, I accidentally had one of my continuous variables also listed with my categorical variables. Checking df['workclass'].cat.categories helped me find the bug.

1 Like

What I did was change the dtype from object to category:

 # Cast the 'y' column to the pandas 'category' dtype and assign it back to the same column.
 train['y']=train['y'].astype('category')
2 Likes

In my case, the order of my procs was incorrect.

I had to change [Categorify, FillMissing, Normalize] to [FillMissing, Categorify, Normalize] to remove this error.

1 Like

For context on why this behavior exists: FillMissing adds a categorical column, and Categorify doesn’t pick that column up if Categorify runs before FillMissing. This is handled better in v2, where transforms have an inherent order.

1 Like

Use train_cats before using proc_df.
If your variables are not categorical, then train_cats will not do anything, and a later part of the code will raise an error.
Can you also share the code?