Fastai v2 tabular

It looks like IndexSplitter is returning a regular array instead of two Ls, which is causing the initial issue. But then it processes for quite a while when creating the TabularPandas object, seemingly during the transforms. Stack trace after waiting 10 minutes:

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-14-1ed41788e5c8> in <module>()
      1 to = TabularPandas(train_df, procs=procs, cat_names=cat_vars, cont_names=cont_vars,
----> 2                    y_names=dep_var, is_y_cat=False, splits=splits)

21 frames
/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in __init__(self, df, procs, cat_names, cont_names, y_names, is_y_cat, splits, do_setup)
     39         self.cont_y = None if     is_y_cat else y_names
     40         self.split = len(splits[0])
---> 41         if do_setup: self.procs.setup(self)
     42 
     43     def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in setup(self, items)
    192         tfms = self.fs[:]
    193         self.fs.clear()
--> 194         for t in tfms: self.add(t,items)
    195 
    196     def add(self,t, items=None):

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in add(self, t, items)
    195 
    196     def add(self,t, items=None):
--> 197         t.setup(items)
    198         self.fs.append(t)
    199 

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in setup(self, items)
     81         super().setup(getattr(items,'train',items))
     82         # Procs are called as soon as data is available
---> 83         return self(items.items if isinstance(items,DataSource) else items)
     84 
     85 #Cell

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in __call__(self, x, **kwargs)
     84     @property
     85     def use_as_item(self): return ifnone(self.as_item_force, self.as_item)
---> 86     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     87     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     88     def setup(self, items=None): return self.setups(items)

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _call(self, fn, x, split_idx, **kwargs)
    105     "A `Transform` that modifies in-place and just returns whatever it's passed"
    106     def _call(self, fn, x, split_idx=None, **kwargs):
--> 107         super()._call(fn,x,split_idx,**kwargs)
    108         return x
    109 

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _call(self, fn, x, split_idx, **kwargs)
     92         if split_idx!=self.split_idx and self.split_idx is not None: return x
     93         f = getattr(self, fn)
---> 94         if self.use_as_item or not is_listy(x): return self._do_call(f, x, **kwargs)
     95         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     96         return retain_type(res, x)

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _do_call(self, f, x, **kwargs)
     97 
     98     def _do_call(self, f, x, **kwargs):
---> 99         return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
    100 
    101 add_docs(Transform, decode="Delegate to `decodes` to undo transform", setup="Delegate to `setups` to set up transform")

/usr/local/lib/python3.6/dist-packages/fastai2/core/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = types.MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in encodes(self, to)
     92     def _decode_cats(self, c): return c.map(dict(enumerate(self[c.name].items)))
     93     def encodes(self, to):
---> 94         to.transform(to.cat_names, partial(self._apply_cats,1))
     95         to.transform(L(to.cat_y),  partial(self._apply_cats,0))
     96     def decodes(self, to): to.transform(to.all_cat_names, self._decode_cats)

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in transform(self, cols, f)
     58 #Cell
     59 class TabularPandas(Tabular):
---> 60     def transform(self, cols, f): self[cols] = self[cols].transform(f)
     61 
     62 #Cell

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in transform(self, func, axis, *args, **kwargs)
   6733         if axis == 1:
   6734             return self.T.transform(func, *args, **kwargs).T
-> 6735         return super().transform(func, *args, **kwargs)
   6736 
   6737     def apply(

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in transform(self, func, *args, **kwargs)
  10810     @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs))
  10811     def transform(self, func, *args, **kwargs):
> 10812         result = self.agg(func, *args, **kwargs)
  10813         if is_scalar(result) or len(result) != len(self):
  10814             raise ValueError("transforms cannot produce " "aggregated results")

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
   6714             pass
   6715         if result is None:
-> 6716             return self.apply(func, axis=axis, args=args, **kwargs)
   6717         return result
   6718 

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6926             kwds=kwds,
   6927         )
-> 6928         return op.get_result()
   6929 
   6930     def applymap(self, func):

/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in get_result(self)
    184             return self.apply_raw()
    185 
--> 186         return self.apply_standard()
    187 
    188     def apply_empty_result(self):

/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in apply_standard(self)
    290 
    291         # compute the result using the series generator
--> 292         self.apply_series_generator()
    293 
    294         # wrap results

/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in apply_series_generator(self)
    319             try:
    320                 for i, v in enumerate(series_gen):
--> 321                     results[i] = self.f(v)
    322                     keys.append(v.name)
    323             except Exception as e:

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in _apply_cats(self, add, c)
     89     def setups(self, to):
     90         self.classes = {n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.all_cat_names}
---> 91     def _apply_cats (self, add, c): return c.cat.codes+add if is_categorical_dtype(c) else c.map(self[c.name].o2i)
     92     def _decode_cats(self, c): return c.map(dict(enumerate(self[c.name].items)))
     93     def encodes(self, to):

/usr/local/lib/python3.6/dist-packages/pandas/core/series.py in map(self, arg, na_action)
   3826         dtype: object
   3827         """
-> 3828         new_values = super()._map_values(arg, na_action=na_action)
   3829         return self._constructor(new_values, index=self.index).__finalize__(self)
   3830 

/usr/local/lib/python3.6/dist-packages/pandas/core/base.py in _map_values(self, mapper, na_action)
   1298 
   1299         # mapper is a function
-> 1300         new_values = map_f(values, mapper)
   1301 
   1302         return new_values

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

/usr/local/lib/python3.6/dist-packages/pandas/core/base.py in <lambda>(x)
   1253                 # convert mapper to a lookup function (GH #15999).
   1254                 dict_with_default = mapper
-> 1255                 mapper = lambda x: dict_with_default[x]
   1256             else:
   1257                 # Dictionary does not have a default. Thus it's safe to

KeyboardInterrupt: 

Does the issue disappear if you convert those arrays to Ls?

The initial issue, yes. But then it gets stuck doing the transforms (the stack trace I posted, where I interrupted after 10 minutes).
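For reference, a minimal sketch of the workaround I tried (the helper name is my own invention; fastai v2's L behaves like a plain list for this purpose):

```python
import numpy as np

# Hypothetical helper: coerce the raw numpy index arrays that a splitter
# returns into plain Python lists of ints before passing them to
# TabularPandas as splits.
def wrap_splits(train_idx, valid_idx):
    return list(map(int, train_idx)), list(map(int, valid_idx))

splits = wrap_splits(np.array([0, 1, 2]), np.array([3, 4]))
```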

Except we can’t see anything in a stack trace interrupted like this, so I don’t know what caused the issue.


Ah I understand now. Would it be better to do a %debug before running the cell? Or what would you recommend?

I’ll try that tonight and update you if I can pinpoint the origin.

@sgugger odd, after a reboot it started working again. I’ll let you know if I run into this bug again :frowning_face:
The issue seems to be with Categorify, as it takes quite a long time (if that’s a known issue, it’s okay. Not trying to complain! Just trying to work out how to go about this :slight_smile: )

Normally it’s supposed to be fast, since it uses pd.Series.unique to determine the unique categories. Not sure what’s holding you up…
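For context, a toy illustration of the kind of vocab-building step this relies on (the data is invented):

```python
import pandas as pd

# Categorify builds its class map from each column's unique values;
# pd.Series.unique is vectorized, so this step is normally fast even on
# large columns.
s = pd.Series(['Mar,Jun', 'Jan,Apr', 'Mar,Jun', None])
vocab = [v for v in s.unique() if v is not None]
```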


That was my thought too. I will most likely move on to other notebooks for now and come back to it later (if you haven’t fixed it by the time you redo course-v3). But let me see what I can do tonight.

Doesn’t seem to bottleneck in CategoryMap; investigating further.

The problem is with the Events and PromoInterval columns. The rest of the columns take <2 seconds each, but these two do not for some reason.

Aha! Found it: if a column that is not of category dtype is treated as a category (e.g. PromoInterval was not a category dtype), it will not work well. A potential solution: if a column is listed as a category (in cat_names), convert it to the category dtype before calling encodes, e.g. in setup.

#export
class Categorify(TabularProc):
    "Transform the categorical variables to that type."
    order = 1
    def setups(self, to):
        self.classes = {n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.all_cat_names}
    def _apply_cats(self, add, c):
        # Convert to category dtype first if the column isn't one already
        if not is_categorical_dtype(c): c = self._to_cat(c)
        return c.cat.codes+add
    def _to_cat(self, c): return c.astype('category')
    def _decode_cats(self, c): return c.map(dict(enumerate(self[c.name].items)))
    def encodes(self, to):
        to.transform(to.cat_names, partial(self._apply_cats,1))
        to.transform(L(to.cat_y),  partial(self._apply_cats,0))
    def decodes(self, to): to.transform(to.all_cat_names, self._decode_cats)
    def __getitem__(self,k): return self.classes[k]

Let me know your thoughts.

Time Comparison (without the two problem columns):

Original: 35.2s
Mine: 17.8s

Adults:
Original: 357ms
Mine: 244ms

Edit: @sgugger (sorry to @ you) should we just pre-process our categorical columns to the category dtype and call it a day? Or what are your thoughts :slight_smile:
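For what it’s worth, pre-converting is a one-liner in pandas (toy frame; the column names are just for illustration):

```python
import pandas as pd

# Convert each categorical column to pandas' category dtype up front,
# so downstream code can use the fast .cat.codes accessor.
df = pd.DataFrame({'PromoInterval': ['Jan,Apr', None, 'Feb,May'],
                   'StoreType': ['a', 'b', 'a']})
cat_names = ['PromoInterval', 'StoreType']
df[cat_names] = df[cat_names].astype('category')
```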

I would also like an option to do an external category-to-integer mapping and use that value rather than cat.codes. In the case where you will be training on the same data repeatedly, it would be much faster to convert once and be done with it. This also has the benefit of much smaller dataframes: my dailies are 17 GB in memory as object dtype and 3.5 GB after conversion.

Not to mention cases where account numbers or whatever are already integers.


Pandas can do that for you.
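To illustrate the point (sizes and values invented), a plain-pandas sketch of converting once and keeping a reusable integer mapping:

```python
import pandas as pd

# Converting an object column to category dtype shrinks it in memory and
# exposes a stable category-to-integer mapping via .cat.codes.
s = pd.Series(['A1', 'B2'] * 1000)
obj_mem = s.memory_usage(deep=True)
cat = s.astype('category')
cat_mem = cat.memory_usage(deep=True)
codes = cat.cat.codes                          # integers 0..n_categories-1
mapping = dict(enumerate(cat.cat.categories))  # code -> original value
```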


I am mentally mixing my v1 and v2 - gotta remember where I am posting…

v1 gives a ‘Can only use .cat accessor with a ‘category’ dtype’ error if you don’t categorify.

I did a version of v2 with an is_numerical_dtype check added to CategoryMap in 06_data_transforms, which I believe sorted things, but that was several weeks ago.

I am trying to use the TabularModel and started from a fresh checkout of fastai_dev. Currently TabularModel tries to use BnDropLin, but it seems this was renamed to LinBnDrop. After adapting that piece I can run 41_tabular_model.

Why is the 41_tabular_model not exported yet? The cells are not marked for export. I guess this is intentional?

When I run the last cell with “notebook2script(all_fs=True)”, somehow all files under dev/local end up with changed relative imports, e.g. a line
from .basics import *
is changed to
from ......basics import *

I can fix that manually, but I guess I did something wrong with the set-up? I installed the package after cloning via “pip install -e .”. Did somebody else run into a similar problem before?


I found one more issue with ReadTabBatch. It already says: TODO: use float for cont targ.

I changed the current line:
def encodes(self, to): return tensor(to.cats).long(),tensor(to.conts).float(), tensor(to.targ).long()
to
def encodes(self, to): return tensor(to.cats).long(),tensor(to.conts).float(), tensor(to.targ).long() if to.cat_y else tensor(to.targ).float()

I am too new to fastai to tell whether this is the correct way to fix this issue or not.

Not only was it renamed, but also the layers were reordered (as the name suggests). It’s possible that the tabular model needs some tweaking as a result.


I’m looking at the new tabular API, are you looking to do something like a FloatBlock for regression tasks? And if so, any pointers for trying to implement such a task? :slight_smile:

The regular TransformBlock should be fine for that.


Thanks for the answer :slight_smile: Do you mean:

to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=dep_var,
                   splits=splits, block_y=TransformBlock)

Doing so gives:
RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #2 'target' in call to _thnn_mse_loss_forward

I also tried setting type_tfms to Float: TransformBlock(type_tfms=Float)

It looks like you need to convert your targets to floats. Float is a type, not a transform, so replace it with a function: lambda x: float(x).

Edit: Even better, use MSELossFlat() which should convert your target to float automatically.
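Casting the target column up front in pandas also works (a hedged sketch; the column name and data are made up):

```python
import pandas as pd

# For regression, make sure the dependent variable is a float dtype so
# the loss function receives float targets rather than longs.
df = pd.DataFrame({'Sales': [100, 250, 75]})
dep_var = 'Sales'
df[dep_var] = df[dep_var].astype('float32')
```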
