A walk with fastai2 - Tabular - Study Group and Online Lectures Megathread

Here are the links to what I discussed:

3 Likes

I have a regular pandas dataframe with the following columns and datatypes:

kundensegment      category
kalkDB2             float64
angebotsjahr       category
angebotsvolumen    category
vertriebsweg       category
angebotsstatus     category
produkt            category
region             category
ausschreibung      category
dtype: object

I have defined categorical and continuous variables like so:

cat_vars = ['kundensegment', 'angebotsjahr', 'angebotsvolumen', 'vertriebsweg', 'angebotsstatus', 'produkt', 'region', 'ausschreibung']
cont_vars = ['kalkDB2']

When trying to build a TabularPandas object with to = TabularPandas(df, cat, cat_vars) I get the following error.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-105-7179139bdcc4> in <module>
----> 1 to = TabularPandas(df, cat, cat_vars)

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastai2/tabular/core.py in __init__(self, df, procs, cat_names, cont_names, y_names, block_y, splits, do_setup, device)
    121         self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
    122         self.split = len(splits[0])
--> 123         if do_setup: self.setup()
    124 
    125     def new(self, df):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastai2/tabular/core.py in setup(self)
    132     def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    133     def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
--> 134     def setup(self): self.procs.setup(self)
    135     def process(self): self.procs(self)
    136     def loc(self): return self.items.loc

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/transform.py in setup(self, items, train_setup)
    177         tfms = self.fs[:]
    178         self.fs.clear()
--> 179         for t in tfms: self.add(t,items, train_setup)
    180 
    181     def add(self,t, items=None, train_setup=False):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/transform.py in add(self, t, items, train_setup)
    180 
    181     def add(self,t, items=None, train_setup=False):
--> 182         t.setup(items, train_setup)
    183         self.fs.append(t)
    184 

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastai2/tabular/core.py in setup(self, items, train_setup)
    178         super().setup(getattr(items,'train',items), train_setup=False)
    179         # Procs are called as soon as data is available
--> 180         return self(items.items if isinstance(items,Datasets) else items)
    181 
    182 # Cell

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/transform.py in __call__(self, x, **kwargs)
     70     @property
     71     def name(self): return getattr(self, '_name', _get_name(self))
---> 72     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     73     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     74     def __repr__(self): return f'{self.name}: {self.encodes} {self.decodes}'

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     94     "A `Transform` that modifies in-place and just returns whatever it's passed"
     95     def _call(self, fn, x, split_idx=None, **kwargs):
---> 96         super()._call(fn,x,split_idx,**kwargs)
     97         return x
     98 

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     81         if split_idx!=self.split_idx and self.split_idx is not None: return x
     82         f = getattr(self, fn)
---> 83         if not _is_tuple(x): return self._do_call(f, x, **kwargs)
     84         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     85         return retain_type(res, x)

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     86 
     87     def _do_call(self, f, x, **kwargs):
---> 88         return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
     89 
     90 add_docs(Transform, decode="Delegate to `decodes` to undo transform", setup="Delegate to `setups` to set up transform")

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastai2/tabular/core.py in encodes(self, to)
    194         self.classes = {n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names}
    195 
--> 196     def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
    197     def decodes(self, to): to.transform(to.cat_names, partial(_decode_cats, self.classes))
    198     def __getitem__(self,k): return self.classes[k]

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastai2/tabular/core.py in transform(self, cols, f, all_col)
    155     def transform(self, cols, f, all_col=True):
    156         if not all_col: cols = [c for c in cols if c in self.items.columns]
--> 157         if len(cols) > 0: self[cols] = self[cols].transform(f)
    158 
    159 # Cell

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastcore/foundation.py in __getitem__(self, k)
    268     def __init__(self, items): self.items = items
    269     def __len__(self): return len(self.items)
--> 270     def __getitem__(self, k): return self.items[k]
    271     def __setitem__(self, k, v): self.items[list(k) if isinstance(k,CollBase) else k] = v
    272     def __delitem__(self, i): del(self.items[i])

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2485         """Return the cached item, item represents a label indexer."""
   2486         cache = self._item_cache
-> 2487         res = cache.get(item)
   2488         if res is None:
   2489             values = self._data.get(item)

TypeError: unhashable type: 'L'

I can’t figure out where fastai fails to build the TabularPandas object… any hints?

It looks like you haven’t specified a dependent variable. That might cause this error, though I’m not sure.

It’s the same when I specify a dependent variable:

cat_vars = ['kundensegment', 'angebotsjahr', 'angebotsvolumen', 'vertriebsweg', 'produkt', 'region', 'ausschreibung']
cont_vars = ['kalkDB2']
dep_var = 'angebotsstatus'
procs = Categorize()
to = TabularPandas(df, procs, cat_vars, cont_vars, y_names=dep_var)

The error message, although shorter, lands at the same point.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-230-3add9598d722> in <module>
----> 1 to = TabularPandas(df, procs, cat_vars, cont_vars, y_names=dep_var)

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/fastai2/tabular/core.py in __init__(self, df, procs, cat_names, cont_names, y_names, block_y, splits, do_setup, device)
    113         if block_y is None and self.y_names:
    114             # Make ys categorical if they're not numeric
--> 115             ys = df[self.y_names]
    116             if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): block_y = CategoryBlock()
    117             else: block_y = RegressionBlock()

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2485         """Return the cached item, item represents a label indexer."""
   2486         cache = self._item_cache
-> 2487         res = cache.get(item)
   2488         if res is None:
   2489             values = self._data.get(item)

TypeError: unhashable type: 'L'

Are you sure you have the last version of fastcore and fastai2? I can’t reproduce your bug.

1 Like

Very strange… I was able to reproduce the bug inside an Azure ML notebook with the following library versions:

fastai2                               0.0.11
fastcore                              0.1.14

However, running the same versions locally didn’t throw any error.

@faib what version of pandas?

Awesome, you got it @muellerzr :raised_hands:
Seems like currently the default pandas version in azure notebooks is 0.23!

1 Like

Great lesson @muellerzr! I love your teaching style. :heart_eyes:
For the ones wanting a bit more detail about loc vs iloc I recommend this video until 15:00 (as ix is deprecated).
For the differences between map, apply and applymap this other great video.
Still half-way through the tabular lesson, but these are my inputs so far!

3 Likes

on 01 - Adults.ipynb i’m trying to run the following :

 _, __, y = learn.get_preds(dl=df2)

and am getting this error:

TypeError: string indices must be integers

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/fastai2/torch_core.py in find_bs(b)
    471 def find_bs(b):
    472     "Recursively search the batch size of `b`."
--> 473     return item_find(b).shape[0]
    474 
    475 # Cell

AttributeError: 'str' object has no attribute 'shape'

i tried adding

 learn.bs = 1

entire notebook here:
https://colab.research.google.com/drive/1OHTj0E09WZhmuAg4ppb-_eR728pul6-0

I tried stepping through the debugger, and this looks suspicious

ipdb> u
> /usr/local/lib/python3.6/dist-packages/fastai2/learner.py(337)accumulate()
    335     def reset(self):           self.total,self.count = 0.,0
    336     def accumulate(self, learn):
--> 337         bs = find_bs(learn.yb)
    338         self.total += to_detach(learn.loss.mean())*bs
    339         self.count += bs

but do not know how to further troubleshoot. @muellerzr if you have any thoughts, it would be appreciated.

You should check the notebook again. I never pass the DataFrame into get_preds. I build a test DataLoader first. (So I’m not sure where you found this code?)

Edit @foobar8675 just noticed I never actually showed a get_preds example (this is coming next week). But you need to build a test DataLoader before, you can’t just pass in a Pandas dataframe :slight_smile:

whoops, i misread.

1 Like

i got it from way back when : 02_Deployment (Tabular section)

although I’m still not sure on how to get it to work. i would guess this would work

dl = learn.dls.test_dl(df2)
_, __, y = learn.get_preds(dl=dl2)

but still get an error

/usr/local/lib/python3.6/dist-packages/fastprogress/fastprogress.py:74: UserWarning: Your generator is empty.
  warn("Your generator is empty.")
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-96-f967a6a6129c> in <module>()
      1 dl = learn.dls.test_dl(df2)
----> 2 _, __, y = learn.get_preds(dl=dl2)

12 frames
/usr/local/lib/python3.6/dist-packages/fastai2/torch_core.py in to_concat(xs, dim)
    211 def to_concat(xs, dim=0):
    212     "Concat the element in `xs` (recursively if they are tuples/lists of tensors)"
--> 213     if is_listy(xs[0]): return type(xs[0])([to_concat([x[i] for x in xs], dim=dim) for i in range_of(xs[0])])
    214     if isinstance(xs[0],dict):  return {k: to_concat([x[k] for x in xs], dim=dim) for k in xs.keys()}
    215     #We may receives xs that are not concatenatable (inputs of a text classifier for instance),

IndexError: list index out of range

Try using dl instead of dl2? (Otherwise I’ll try myself later :slight_smile: )

if you can try when you can, it’d be appreciated.

dl = learn.dls.test_dl(df2)
_, __, y = learn.get_preds(dl=dl)

unfortunately is not working, although i’m glad you spotted my typo. :slight_smile:

Also, as my conference was cancelled I’ll be doing class tomorrow evening. I’ll post more tomorrow :slight_smile:

1 Like

@foobar8675 out of curiosity, try using the dev versions of fastcore and fastai2 and let me know if you still get the same thing, please :slight_smile:

gladly - but is there a webpage you can point me to to get the dev versions in a notebook?

The FAQ has this.

https://forums.fast.ai/t/fastai-v2-faq-and-links-read-this-before-posting-please/53517/4

1 Like

Well, I tried to at least. There’s a bit of madness on campus so I wasn’t able to get as far as I wanted. I’ll keep everyone updated if I decide to do one in the middle of the week (as everything is closed down) but for the time being presume the regular schedule for next week :slight_smile: