Fastai v2 tabular

muellerzr · November 11, 2019, 12:21am

@sgugger finally getting back to look at this. ~~I tried MSELossFlat() for my loss function, but it did not convert oddly enough.~~ MSELossFlat did wind up working.

Here is what I am currently trying:

tab = TabularPandas(train_df, procs=procs, cat_names=cat_vars, cont_names=cont_vars, y_names=dep_var, splits=splits, block_y=TransformBlock(type_tfms=lambda x: float(x)))

model = TabularModel(get_emb_sz(tab), len(tab.cont_names), 1, [1000,500], y_range=y_range)
opt_func = partial(Adam, wd=0.01, eps=1e-5)
learn = Learner(tab.databunch(), model, MSELossFlat(), opt_func=opt_func, metrics=rmse)

It won’t really “train” and epoch time is 37 minutes on a GPU! By won’t really “train”, I mean the initial loss is 58017752.000000. I may wait a bit and move to NLP for my guides if you’re planning on getting to Rossmann (eventually), as it’s causing quite the headache for me.

muellerzr · November 19, 2019, 9:14pm

I’m trying to do learn.show_results() and I get ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 2 has 1 dimension(s)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-c3b657dcc9ae> in <module>()
----> 1 learn.show_results()

11 frames
/usr/local/lib/python3.6/dist-packages/fastai2/learner.py in show_results(self, ds_idx, dl, max_n, **kwargs)
    322         b = dl.one_batch()
    323         _,_,preds = self.get_preds(dl=[b], with_decoded=True)
--> 324         self.dbunch.show_results(b, preds, max_n=max_n, **kwargs)
    325 
    326     def show_training_loop(self):

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in show_results(self, b, out, max_n, ctxs, show, **kwargs)
     83         x,y,its = self.show_batch(b, max_n=max_n, show=False)
     84         b_out = b[:self.n_inp] + (tuple(out) if is_listy(out) else (out,))
---> 85         x1,y1,outs = self.show_batch(b_out, max_n=max_n, show=False)
     86         res = (x,x1,None,None) if its is None else (x, y, its, outs.itemgot(slice(self.n_inp,None)))
     87         if not show: return res

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in show_batch(self, b, max_n, ctxs, show, **kwargs)
     77     def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
     78         if b is None: b = self.one_batch()
---> 79         if not show: return self._pre_show_batch(b, max_n=max_n)
     80         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
     81 

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in _pre_show_batch(self, b, max_n)
     69     def _pre_show_batch(self, b, max_n=9):
     70         "Decode `b` to be ready for `show_batch`"
---> 71         b = self.decode(b)
     72         if hasattr(b, 'show'): return b,None,None
     73         its = self._decode_batch(b, max_n, full=False)

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in decode(self, b)
     59             if isinstance(f,Pipeline): f.split_idx=split_idx
     60 
---> 61     def decode(self, b): return self.before_batch.decode(self.after_batch.decode(self._retain_dl(b)))
     62     def decode_batch(self, b, max_n=9, full=True): return self._decode_batch(self.decode(b), max_n, full)
     63 

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in decode(self, o, full)
    207 
    208     def decode  (self, o, full=True):
--> 209         if full: return compose_tfms(o, tfms=self.fs, is_enc=False, reverse=True, split_idx=self.split_idx)
    210         #Not full means we decode up to the point the item knows how to show itself.
    211         for f in reversed(self.fs):

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
    147     for f in tfms:
    148         if not is_enc: f = f.decode
--> 149         x = f(x, **kwargs)
    150     return x
    151 

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in decode(self, x, **kwargs)
     86     def use_as_item(self): return ifnone(self.as_item_force, self.as_item)
     87     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
---> 88     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     89     def setup(self, items=None): return self.setups(items)
     90     def __repr__(self): return f'{self.__class__.__name__}: {self.use_as_item} {self.encodes} {self.decodes}'

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _call(self, fn, x, split_idx, **kwargs)
     93         if split_idx!=self.split_idx and self.split_idx is not None: return x
     94         f = getattr(self, fn)
---> 95         if self.use_as_item or not is_listy(x): return self._do_call(f, x, **kwargs)
     96         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     97         return retain_type(res, x)

/usr/local/lib/python3.6/dist-packages/fastai2/core/transform.py in _do_call(self, f, x, **kwargs)
     98 
     99     def _do_call(self, f, x, **kwargs):
--> 100         return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
    101 
    102 add_docs(Transform, decode="Delegate to `decodes` to undo transform", setup="Delegate to `setups` to set up transform")

/usr/local/lib/python3.6/dist-packages/fastai2/core/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = types.MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in decodes(self, o)
    155     def decodes(self, o):
    156         cats,conts,targs = to_np(o)
--> 157         vals = np.concatenate([cats,conts,targs], axis=1)
    158         df = pd.DataFrame(vals, columns=self.to.all_col_names)
    159         to = self.to.new(df)

<__array_function__ internals> in concatenate(*args, **kwargs)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 2 has 1 dimension(s)

(This is on ADULTs)

Databunch was made like so:

trn_dl = TabDataLoader(to.train, bs=64, shuffle=True, drop_last=True)
val_dl = TabDataLoader(to.valid, bs=128)
dbunch = DataBunch(trn_dl, val_dl)

sgugger · November 19, 2019, 10:40pm

Not sure show_results works well on tabular yet. I’ll get to it when I get to port the Rossmann lesson; for now we are working on extracting the development-in-notebook functionality into a separate package, so I don’t expect I’ll have time to work on this until the end of the week.

muellerzr · November 19, 2019, 10:41pm

It was working earlier, but with your changes to the new tabular it may not be anymore. Sounds good. Glad to see we get those separate library building packages. Thanks!!!

muellerzr · December 5, 2019, 7:16pm

Just a useful tip for people: you can shed a few seconds on creating your TabularPandas if you preprocess your categorical columns to the pandas category dtype beforehand (instead of having it convert in place).

ie:

for var in cat_vars:
  train_df[var] = train_df[var].astype('category')

For example:
No preprocess Rossmann: 24.4s
Preprocess Rossmann: 19.7s (including the time to convert the categories)

Also @sgugger thank you for figuring out what was going on with Rossmann. Did you face that timing issue at all when you were exploring it? Or was it something on my end?

Edit: I see now, you used pd.Categorical() to convert it first

sgugger · December 5, 2019, 8:23pm

Yes the timing issue was on fastai2: we used map to convert the classes into category codes and that was super slow.

jeremy · December 6, 2019, 1:39am

Is this fixed now?

muellerzr · December 6, 2019, 1:57am

Yep

Brainkite · January 2, 2020, 3:59pm

I have a pretty general question about tabular models:
Does columns order in data impact model performance?
I’m currently working on timeseries forecasting and it seems that the model’s valid_loss performances have decreased.

muellerzr · January 2, 2020, 4:01pm

Numerical or categorical columns (or if both mostly which)? That’s a very interesting behavior

Brainkite · January 2, 2020, 4:29pm

I built a function to automatically handle my cat and cont variable names, and I used Python’s sets to make use of intersection and subset operations (cat always comes before cont).
But sets never return the same order (which I thought wasn’t important at the time).
But if the cat variables are not ordered the same way, embeddings won’t match if you want to load backed-up states of the model.
So now I sort cat and cont variables alphabetically after generating the 2 sets.
I re-ran the training with these newly ordered variables and lost something like 10% on log loss (1.28 best_val_loss) compared to the training run I did yesterday (1.12 best_val_loss).

(Of course this loss might be due to something else that passed under my radar; also, I may have had some luck with the model initialization yesterday.)

muellerzr · January 13, 2020, 3:05pm

@sgugger there seems to be a bug in tabular with the ConfusionMatrix. See the following trace:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-10-ab4f144f3c10> in <module>()
----> 1 interp.plot_confusion_matrix()

1 frames
/usr/local/lib/python3.6/dist-packages/fastai2/interpret.py in plot_confusion_matrix(self, normalize, title, cmap, norm_dec, plot_txt, **kwargs)
     63         "Plot the confusion matrix, with `title` and using `cmap`."
     64         # This function is mainly copied from the sklearn docs
---> 65         cm = self.confusion_matrix()
     66         if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
     67         fig = plt.figure(**kwargs)

/usr/local/lib/python3.6/dist-packages/fastai2/interpret.py in confusion_matrix(self)
     56         "Confusion matrix as an `np.ndarray`."
     57         x = torch.arange(0, len(self.vocab))
---> 58         cm = ((self.decoded==x[:,None]) & (self.targs==x[:,None,None])).sum(2)
     59         return to_np(cm)
     60 

RuntimeError: The size of tensor a (2) must match the size of tensor b (200) at non-singleton dimension 1

(This is ripped right from the ADULTs notebook in terms of how I set everything up. I just added in a ClassificationInterpretation instance plus plot_confusion_matrix)

muellerzr · January 13, 2020, 3:13pm

The fix is the following inside the confusion_matrix call:

cm = ((self.decoded==x[:,None]) & (self.targs.squeeze()==x[:,None,None])).sum(2)

Though I’m unsure if that will affect vision etc. probably a delegates for tabular specifically would be needed?

sgugger · January 13, 2020, 4:28pm

I guess the squeeze won’t hurt. Will add that.

jeremy · January 13, 2020, 11:47pm

@sgugger I think it’s best to generally pass a dim parameter to squeeze, otherwise you might accidentally squeeze, for example, a batch of size 1 into a scalar.

sgugger · January 14, 2020, 12:11am

True. I’ll fix this tomorrow.

muellerzr · January 22, 2020, 6:59pm

@sgugger what is the proper way to add a test set with an exported learner here? I’ve tried:

dl = test_dl(exp_learn.dbunch, df)

But when I do show_batch I get
AttributeError: 'DataFrame' object has no attribute 'with_cont'

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-39-c4380ecf3ea4> in <module>()
----> 1 dl.show_batch()

11 frames
/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in show_batch(self, b, max_n, ctxs, show, **kwargs)
     75 
     76     def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
---> 77         if b is None: b = self.one_batch()
     78         if not show: return self._pre_show_batch(b, max_n=max_n)
     79         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)

/usr/local/lib/python3.6/dist-packages/fastai2/data/load.py in one_batch(self)
    128     def one_batch(self):
    129         if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 130         with self.fake_l.no_multiproc(): res = first(self)
    131         if hasattr(self, 'it'): delattr(self, 'it')
    132         return res

/usr/local/lib/python3.6/dist-packages/fastcore/utils.py in first(x)
    172 def first(x):
    173     "First element of `x`, or None if missing"
--> 174     try: return next(iter(x))
    175     except StopIteration: return None
    176 

/usr/local/lib/python3.6/dist-packages/fastai2/data/load.py in __iter__(self)
     97         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
     98             if self.device is not None: b = to_device(b, self.device)
---> 99             yield self.after_batch(b)
    100         self.after_iter()
    101         if hasattr(self, 'it'): delattr(self, 'it')

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in __call__(self, o)
    177         self.fs.append(t)
    178 
--> 179     def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
    180     def __repr__(self): return f"Pipeline: {self.fs}"
    181     def __getitem__(self,i): return self.fs[i]

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
    125     for f in tfms:
    126         if not is_enc: f = f.decode
--> 127         x = f(x, **kwargs)
    128     return x
    129 

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in __call__(self, x, **kwargs)
     60     @property
     61     def use_as_item(self): return ifnone(self.as_item_force, self.as_item)
---> 62     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     63     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     64     def __repr__(self): return f'{self.__class__.__name__}: {self.use_as_item} {self.encodes} {self.decodes}'

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     71         if split_idx!=self.split_idx and self.split_idx is not None: return x
     72         f = getattr(self, fn)
---> 73         if self.use_as_item or not is_listy(x): return self._do_call(f, x, **kwargs)
     74         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     75         return retain_type(res, x)

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     76 
     77     def _do_call(self, f, x, **kwargs):
---> 78         return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
     79 
     80 add_docs(Transform, decode="Delegate to `decodes` to undo transform", setup="Delegate to `setups` to set up transform")

/usr/local/lib/python3.6/dist-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/usr/local/lib/python3.6/dist-packages/fastai2/tabular/core.py in encodes(self, to)
    255     # TODO: use float for cont targ
    256     def encodes(self, to):
--> 257         if not to.with_cont: return tensor(to.cats).long(), tensor(to.targ)
    258         return tensor(to.cats).long(),tensor(to.conts).float(), tensor(to.targ)
    259 

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'with_cont'

sgugger · January 22, 2020, 7:02pm

You need to use to.new() to create a new tabular object, then convert it to a DataBunch.

muellerzr · January 22, 2020, 7:06pm

When I do something like:

dl = exp_learn.dbunch.new(df)

It creates a TabDataLoader right away, however I still get the with_cont error. Is there a different way of doing this I’m missing?

Thanks

Edit:

I got it working, thanks @sgugger

The process was:

to = exp_learn.dbunch.train_ds.new(df)
to.process()
dl = TabDataLoader(to)
learn.get_preds(dl=dl)

Would it be possible to make that a bit easier to work with? I’d be happy to try to figure that out. (Something like where exp_learn.dbunch.new() would do the same thing for tabular, or test_dl.)

sgugger · January 22, 2020, 7:15pm

Let me think of it. Either use type-dispatch to have custom behavior on test_dl or make that a method of DataBunch so we can change the behavior for some subclasses.