Fastai v2 tabular

@sgugger finally getting back to look at this. I tried MSELossFlat() for my loss function, but it did not convert oddly enough. MSELossFlat did wind up working.

Here is what I am currently trying:

tab = TabularPandas(train_df, procs=procs, cat_names=cat_vars, cont_names=cont_vars, y_names=dep_var, splits=splits, block_y=TransformBlock(type_tfms=lambda x: float(x)))

model = TabularModel(get_emb_sz(tab), len(tab.cont_names), 1, [1000,500], y_range=y_range)
opt_func = partial(Adam, wd=0.01, eps=1e-5)
learn = Learner(tab.databunch(), model, MSELossFlat(), opt_func=opt_func, metrics=rmse)

It won't really "train" and epoch time is 37 minutes on a GPU! By won't really "train", I mean the initial loss is 58017752.000000. I may wait a bit and move to NLP for my guides if you're planning on getting to Rossmann (eventually), as it's causing quite the headache for me :sweat_smile:

1 Like

I'm trying to do learn.show_results() and I get ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 2 has 1 dimension(s)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-c3b657dcc9ae> in <module>()
----> 1 learn.show_results()

11 frames
/usr/local/lib/python3.6/dist-packages/fastai2/learner.py in show_results(self, ds_idx, dl, max_n, **kwargs)
    322         b = dl.one_batch()
    323         _,_,preds = self.get_preds(dl=[b], with_decoded=True)
--> 324         self.dbunch.show_results(b, preds, max_n=max_n, **kwargs)
    325 
    326     def show_training_loop(self):

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in show_results(self, b, out, max_n, ctxs, show, **kwargs)
     83         x,y,its = self.show_batch(b, max_n=max_n, show=False)
     84         b_out = b[:self.n_inp] + (tuple(out) if is_listy(out) else (out,))
---> 85         x1,y1,outs = self.show_batch(b_out, max_n=max_n, show=False)
     86         res = (x,x1,None,None) if its is None else (x, y, its, outs.itemgot(slice(self.n_inp,None)))
     87         if not show: return res

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in show_batch(self, b, max_n, ctxs, show, **kwargs)
     77     def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
     78         if b is None: b = self.one_batch()
---> 79         if not show: return self._pre_show_batch(b, max_n=max_n)
     80         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
     81 

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in _pre_show_batch(self, b, max_n)
     69     def _pre_show_batch(self, b, max_n=9):
     70         "Decode `b` to be ready for `show_batch`"
---> 71         b = self.decode(b)
     72         if hasattr(b, 'show'): return b,None,None
     73         its = self._decode_batch(b, max_n, full=False)

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in decode(self, b)
     59             if isinstance(f,Pipeline): f.split_idx=split_idx
     60 
---> 61     def decode(self, b): return self.before_batch.decode(self.after_batch.decode(self._retain_dl(b)))
     62     def decode_batch(self, b, max_n=9, full=True): return self._decode_batch(self.decode(b), max_n, full)
     63 

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in decode(self, o, full)
    207 
    208     def decode  (self, o, full=True):
--> 209         if full: return compose_tfms(o, tfms=self.fs, is_enc=False, reverse=True, split_idx=self.split_idx)
    210         #Not full means we decode up to the point the item knows how to show itself.
    211         for f in reversed(self.fs):

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
    147     for f in tfms:
    148         if not is_enc: f = f.decode
--> 149         x = f(x, **kwargs)
    150     return x
    151 

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in decode(self, x, **kwargs)
     86     def use_as_item(self): return ifnone(self.as_item_force, self.as_item)
     87     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
---> 88     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     89     def setup(self, items=None): return self.setups(items)
     90     def __repr__(self): return f'{self.__class__.__name__}: {self.use_as_item} {self.encodes} {self.decodes}'

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _call(self, fn, x, split_idx, **kwargs)
     93         if split_idx!=self.split_idx and self.split_idx is not None: return x
     94         f = getattr(self, fn)
---> 95         if self.use_as_item or not is_listy(x): return self._do_call(f, x, **kwargs)
     96         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     97         return retain_type(res, x)

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _do_call(self, f, x, **kwargs)
     98 
     99     def _do_call(self, f, x, **kwargs):
--> 100         return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
    101 
    102 add_docs(Transform, decode="Delegate to `decodes` to undo transform", setup="Delegate to `setups` to set up transform")

/usr/local/lib/python3.6/dist-packages/fastai2/core/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = types.MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in decodes(self, o)
    155     def decodes(self, o):
    156         cats,conts,targs = to_np(o)
--> 157         vals = np.concatenate([cats,conts,targs], axis=1)
    158         df = pd.DataFrame(vals, columns=self.to.all_col_names)
    159         to = self.to.new(df)

<__array_function__ internals> in concatenate(*args, **kwargs)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 2 has 1 dimension(s)

(This is on ADULTs)

Databunch was made like so:

trn_dl = TabDataLoader(to.train, bs=64, shuffle=True, drop_last=True)
val_dl = TabDataLoader(to.valid, bs=128)
dbunch = DataBunch(trn_dl, val_dl)

Not sure show_results works well on tabular yet. I'll get to it when I get to port the Rossmann lesson; for now we are working on extracting the develop-in-notebooks functionality into a separate package, so I don't expect I'll have time to work on this until the end of the week.

2 Likes

It was working earlier, but with your changes to the new tabular it may not be anymore. Sounds good :slight_smile: Glad to see we get those separate library building packages. Thanks!!!

Just a useful tip for people, you can shed a few seconds on creating your TabularPandas if you preprocess your category columns to a category (instead of having it convert in place) :slight_smile:

ie:

for var in cat_vars:
  train_df[var] = train_df[var].astype('category')

For example:
No preprocess Rossmann: 24.4s
Preprocess Rossmann: 19.7s (including the time to convert the categories)

Also @sgugger thank you for figuring out what was going on with Rossmann :slight_smile: Did you face that timing issue at all when you were exploring it? Or was it something on my end?

Edit: I see now, you used pd.Categorical() to convert it first :slight_smile:

1 Like

Yes the timing issue was on fastai2: we used map to convert the classes into category codes and that was super slow.

1 Like

Is this fixed now?

Yep :slight_smile:

1 Like

I have a pretty general question about tabular models:
Does columns order in data impact model performance?
I'm currently working on time-series forecasting and it seems that the model's valid_loss performance has decreased.

Numerical or categorical columns (or if both, mostly which)? That's a very interesting behavior.

I built a function to automatically handle my cat and cont variable names, and I used Python's sets to make use of intersection and subset operations (cat always comes before cont).
But sets never return the same order (which I thought wasn't important at the time :slight_smile:).
But if the cat variables are not ordered the same way, the embeddings won't match if you want to load backed-up states of the model.
So now I sort the cat and cont variables alphabetically after generating the 2 sets.
I re-ran the training with these newly ordered variables and lost something like 10% on log loss (1.28 best_val_loss) compared to the training run I did yesterday (1.12 best_val_loss).

(Of course this loss might be due to something else that passed under my radar; I may also have had some luck with the model initialization yesterday.)

@sgugger there seems to be a bug in tabular with the ConfusionMatrix. See the following trace:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-10-ab4f144f3c10> in <module>()
----> 1 interp.plot_confusion_matrix()

1 frames
/usr/local/lib/python3.6/dist-packages/fastai2/interpret.py in plot_confusion_matrix(self, normalize, title, cmap, norm_dec, plot_txt, **kwargs)
     63         "Plot the confusion matrix, with `title` and using `cmap`."
     64         # This function is mainly copied from the sklearn docs
---> 65         cm = self.confusion_matrix()
     66         if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
     67         fig = plt.figure(**kwargs)

/usr/local/lib/python3.6/dist-packages/fastai2/interpret.py in confusion_matrix(self)
     56         "Confusion matrix as an `np.ndarray`."
     57         x = torch.arange(0, len(self.vocab))
---> 58         cm = ((self.decoded==x[:,None]) & (self.targs==x[:,None,None])).sum(2)
     59         return to_np(cm)
     60 

RuntimeError: The size of tensor a (2) must match the size of tensor b (200) at non-singleton dimension 1

(This is ripped right from the ADULTs notebook in terms of how I set everything up. I just added in a ClassificationInterpretation instance plus plot_confusion_matrix)

The fix is the following inside the confusion_matrix call:

cm = ((self.decoded==x[:,None]) & (self.targs.squeeze()==x[:,None,None])).sum(2)

Though I'm unsure if that will affect vision etc. Probably a delegates for tabular specifically would be needed?

2 Likes

I guess the squeeze won't hurt. Will add that.

1 Like

@sgugger I think it's best to generally pass a dim parameter to squeeze, otherwise you might accidentally squeeze, for example, a batch of size 1 into a scalar.

1 Like

True. I'll fix this tomorrow.

@sgugger what is the proper way to add a test set with an exported learner here? I've tried:

dl = test_dl(exp_learn.dbunch, df)

But when I do show_batch I get
AttributeError: 'DataFrame' object has no attribute 'with_cont'

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-39-c4380ecf3ea4> in <module>()
----> 1 dl.show_batch()

11 frames
/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in show_batch(self, b, max_n, ctxs, show, **kwargs)
     75 
     76     def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
---> 77         if b is None: b = self.one_batch()
     78         if not show: return self._pre_show_batch(b, max_n=max_n)
     79         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)

/usr/local/lib/python3.6/dist-packages/fastai2/data/load.py in one_batch(self)
    128     def one_batch(self):
    129         if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 130         with self.fake_l.no_multiproc(): res = first(self)
    131         if hasattr(self, 'it'): delattr(self, 'it')
    132         return res

/usr/local/lib/python3.6/dist-packages/fastcore/utils.py in first(x)
    172 def first(x):
    173     "First element of `x`, or None if missing"
--> 174     try: return next(iter(x))
    175     except StopIteration: return None
    176 

/usr/local/lib/python3.6/dist-packages/fastai2/data/load.py in __iter__(self)
     97         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
     98             if self.device is not None: b = to_device(b, self.device)
---> 99             yield self.after_batch(b)
    100         self.after_iter()
    101         if hasattr(self, 'it'): delattr(self, 'it')

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in __call__(self, o)
    177         self.fs.append(t)
    178 
--> 179     def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
    180     def __repr__(self): return f"Pipeline: {self.fs}"
    181     def __getitem__(self,i): return self.fs[i]

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
    125     for f in tfms:
    126         if not is_enc: f = f.decode
--> 127         x = f(x, **kwargs)
    128     return x
    129 

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in __call__(self, x, **kwargs)
     60     @property
     61     def use_as_item(self): return ifnone(self.as_item_force, self.as_item)
---> 62     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     63     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     64     def __repr__(self): return f'{self.__class__.__name__}: {self.use_as_item} {self.encodes} {self.decodes}'

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     71         if split_idx!=self.split_idx and self.split_idx is not None: return x
     72         f = getattr(self, fn)
---> 73         if self.use_as_item or not is_listy(x): return self._do_call(f, x, **kwargs)
     74         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     75         return retain_type(res, x)

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     76 
     77     def _do_call(self, f, x, **kwargs):
---> 78         return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
     79 
     80 add_docs(Transform, decode="Delegate to `decodes` to undo transform", setup="Delegate to `setups` to set up transform")

/usr/local/lib/python3.6/dist-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in encodes(self, to)
    255     # TODO: use float for cont targ
    256     def encodes(self, to):
--> 257         if not to.with_cont: return tensor(to.cats).long(), tensor(to.targ)
    258         return tensor(to.cats).long(),tensor(to.conts).float(), tensor(to.targ)
    259 

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'with_cont'

You need to use to.new() to create a new tabular object, then convert it to a DataBunch.

When I do something like:

dl = exp_learn.dbunch.new(df)

It creates a TabDataLoader right away; however, I still get the with_cont error. Is there a different way of doing this that I'm missing?

Thanks :slight_smile:

Edit:

I got it working, thanks @sgugger :slight_smile:

The process was:

to = exp_learn.dbunch.train_ds.new(df)
to.process()
dl = TabDataLoader(to)
learn.get_preds(dl=dl)

Would it be possible to make that a bit easier to work with? I'd be happy to try to figure that out. (Something like where exp_learn.dbunch.new() would do the same thing for tabular, or test_dl.)

Let me think of it. Either use type-dispatch to have custom behavior on test_dl or make that a method of DataBunch so we can change the behavior for some subclasses.

1 Like