Fastai v2 text

Tokenizer.setup ends up modifying the text cols from the dataframe, is this expected?

I have a Transform in my pipeline that operates on the raw text (regex lookups), but after setup the text col gets transformed to a list of tokens. Is this done on purpose for caching purposes?

Yes, tokenizer does the tokenization only once and changes the dataframe, to avoid redoing it all the time.

1 Like

If I change res_col_name, does caching still happen? I’m guessing not, because ColReader will still read from the non-tokenized column. Example on IMDB_SAMPLE:

splits = ColReader('is_valid')(df)
dset = Datasets(df, tfms=[[ColReader('text'), Tokenizer.from_df('text', res_col_name='tokens')]])

This is currently throwing an error

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-5-444f1088f8d5> in <module>
----> 1 dset = Datasets(df, tfms=[[ColReader('text'), Tokenizer.from_df('text', res_col_name='tokens')]])

~/git/fastai2/fastai2/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
    282     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    283         super().__init__(dl_type=dl_type)
--> 284         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    285         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    286 

~/git/fastai2/fastai2/data/core.py in <listcomp>(.0)
    282     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    283         super().__init__(dl_type=dl_type)
--> 284         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    285         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    286 

~/git/fastcore/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     45             return x
     46 
---> 47         res = super().__call__(*((x,) + args), **kwargs)
     48         res._newchk = 0
     49         return res

~/git/fastai2/fastai2/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose)
    220         if do_setup:
    221             pv(f"Setting up {self.tfms}", verbose)
--> 222             self.setup(train_setup=train_setup)
    223 
    224     def _new(self, items, split_idx=None, **kwargs):

~/git/fastai2/fastai2/data/core.py in setup(self, train_setup)
    242             for f in self.tfms.fs:
    243                 self.types.append(getattr(f, 'input_types', type(x)))
--> 244                 x = f(x)
    245             self.types.append(type(x))
    246         types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()

~/git/fastcore/fastcore/transform.py in __call__(self, x, **kwargs)
     70     @property
     71     def name(self): return getattr(self, '_name', _get_name(self))
---> 72     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     73     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     74     def __repr__(self): return f'{self.name}: {self.encodes} {self.decodes}'

~/git/fastcore/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     80     def _call(self, fn, x, split_idx=None, **kwargs):
     81         if split_idx!=self.split_idx and self.split_idx is not None: return x
---> 82         return self._do_call(getattr(self, fn), x, **kwargs)
     83 
     84     def _do_call(self, f, x, **kwargs):

~/git/fastcore/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     84     def _do_call(self, f, x, **kwargs):
     85         if not _is_tuple(x):
---> 86             return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
     87         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     88         return retain_type(res, x)

~/git/fastcore/fastcore/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

~/git/fastai2/fastai2/data/transforms.py in __call__(self, o, **kwargs)
    197 
    198     def __call__(self, o, **kwargs):
--> 199         if len(self.cols) == 1: return self._do_one(o, self.cols[0])
    200         return L(self._do_one(o, c) for c in self.cols)
    201 

~/git/fastai2/fastai2/data/transforms.py in _do_one(self, r, c)
    191 
    192     def _do_one(self, r, c):
--> 193         o = r[c] if isinstance(c, int) else getattr(r, c)
    194         if len(self.pref)==0 and len(self.suff)==0 and self.label_delim is None: return o
    195         if self.label_delim is None: return f'{self.pref}{o}{self.suff}'

~/anaconda3/envs/dl/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'text'

The problem happens inside tokenize_df:

    other_cols = df.columns[~df.columns.isin(text_cols)]
    res = df[other_cols].copy()
    res[res_col_name] = outputs

I think we don’t need to remove text_cols, just override it if needed, can try something like:

    other_cols = text_cols.filter(lambda o: o != res_col_name)
    other_cols = df.columns[~df.columns.isin(other_cols)]
    res = df[other_cols].copy()
    res[res_col_name] = outputs

This would fix the bug, but caching will still not happen when res_col_name is changed

Or you could duplicate your text column needed beforehand, with another name, so it stays there :wink:

2 Likes

Comparing interpret.py from fastai1:

To interpret.py from fastai2:

It appears TextClassificationInterpretation, and show_intrinsic_attention haven’t been implemented yet. I’d like to see which text weighed more heavily towards the prediction, as seen here in the fastai1 doc:

https://docs.fast.ai/text.interpret.html

Is anything like this possible in fastai2 yet?

2 Likes

Let me see what I can do :slight_smile:

@chess got it working :slight_smile: (Don’t let it scare you: call the exact same function with your Learner first, then your text, i.e. show_intrinsic_attention(learn, text))

image

@sgugger would you rather have this as a separate module or in the library itself?

Edit: may not be as simple as I hoped because ClassificationInterpretation doesn’t have access to the Learner anymore, so I’d be very interested in your thoughts

See here for my code:

2 Likes

I think an extension module is best suited (will work on that example tomorrow). The ClassificationInterpretation class is mostly for plot_top_losses or confusion_matrix. I think it’s fine to have this one have its own class.

2 Likes

Sounds good, thanks!

Thanks for the quick turnaround!!

In this code:

def intrinsic_attention(learn, text, class_id=None):
  """Calculate the intrinsic attention of the input w.r.t. an output `class_id`, or the classification given by the model if `None`.

  Returns a `(tok, attn)` pair: `tok` is the first element of the first decoded
  item from `learn.dls.decode_batch`, and `attn` is the per-position sum of
  absolute embedding gradients, divided by its maximum.
  """
  # Train mode is set first and `eval()` is only called after the encoder pass.
  # `_eval_dropouts` is defined outside this snippet; presumably it puts the
  # dropout layers back in eval mode - TODO confirm against the helper.
  learn.model.train()
  _eval_dropouts(learn.model)
  learn.model.zero_grad()
  learn.model.reset()
  # NOTE(review): `dls` is not defined inside this function, so this line raises
  # NameError as posted; the thread resolves it with `learn.dls.test_dl([text])`.
  dl = dls.test_dl([text])
  # NOTE(review): the result is bound to `ids`, but every later line reads
  # `batch`, which is never assigned here; the thread renames this to `batch`.
  ids = dl.one_batch()[0]
  # Detach the encoder output and re-enable gradients on it so that `emb.grad`
  # is populated by the `backward()` call below.
  emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)
  lstm = learn.model[0].module(emb, True)
  learn.model.eval()
  # The second tuple element is an all-False boolean tensor shaped like `batch`
  # (presumably a padding mask - verify against the classifier head).
  cl = learn.model[1]((lstm, torch.zeros_like(batch).bool(),))[0].softmax(dim=-1)
  # Default to the class with the highest softmax score.
  if class_id is None: class_id = cl.argmax()
  cl[0][class_id].backward()
  # Sum absolute gradients over the last dimension, then scale by the maximum.
  # NOTE(review): if every gradient is zero this divides 0 by 0; possibly
  # related to the `nan` outputs reported further down the thread - unconfirmed.
  attn = emb.grad.squeeze().abs().sum(dim=-1)
  attn /= attn.max()
  tok, _ = learn.dls.decode_batch((*tuplify(batch), *tuplify(cl)))[0]
  return tok, attn

I got this error:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-62-139f078376c4> in <module>
----> 1 show_intrinsic_attention(learn,"Superman is the best superhero! No one will ever defeat him!")

<ipython-input-61-ec8033bf013f> in show_intrinsic_attention(learn, text, class_id, **kwargs)
     55 
     56 def show_intrinsic_attention(learn, text:str, class_id:int=None, **kwargs)->None:
---> 57     text, attn = intrinsic_attention(learn, text, class_id)
     58     show_piece_attn(text.split(), to_np(attn), **kwargs)

<ipython-input-61-ec8033bf013f> in intrinsic_attention(learn, text, class_id)
     15   learn.model.zero_grad()
     16   learn.model.reset()
---> 17   dl = dls.test_dl([text])
     18   ids = dl.one_batch()[0]
     19   emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)

NameError: name 'dls' is not defined

Which I fixed by changing this line:

dl = dls.test_dl([text])

to:

dl = learn.dls.test_dl([text])

Then I got:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-64-139f078376c4> in <module>
----> 1 show_intrinsic_attention(learn,"Superman is the best superhero! No one will ever defeat him!")

<ipython-input-63-86ff1b978f06> in show_intrinsic_attention(learn, text, class_id, **kwargs)
     55 
     56 def show_intrinsic_attention(learn, text:str, class_id:int=None, **kwargs)->None:
---> 57     text, attn = intrinsic_attention(learn, text, class_id)
     58     show_piece_attn(text.split(), to_np(attn), **kwargs)

<ipython-input-63-86ff1b978f06> in intrinsic_attention(learn, text, class_id)
     17   dl = learn.dls.test_dl([text])
     18   ids = dl.one_batch()[0]
---> 19   emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)
     20   lstm = learn.model[0].module(emb, True)
     21   learn.model.eval()

NameError: name 'batch' is not defined

Which I fixed by changing line 18:

ids = dl.one_batch()[0]

to:

batch = dl.one_batch()[0]

Since I didn’t see that “ids” was used anywhere. Now the output has all the words highlighted the same, and it’s showing nan outputs:

Here’s how I created the learners for both (I only use one of the DataBlocks at a time):

#This is for a normal category prediction, where only one can be correct.

imdb_clas = DataBlock(blocks=(TextBlock.from_df(['names'], vocab=dbunch.vocab), CategoryBlock),
                      get_x=attrgetter('text'),
                      get_y=attrgetter('number'),
                      splitter=TrainTestSplitter(test_size = 0.2, stratify=df_numbers['number'], random_state = 12))

#This is a regression. Use this to predict a floating point number.

imdb_clas = DataBlock(blocks=(TextBlock.from_df(['names'], vocab=dbunch.vocab), RegressionBlock),
                      get_x=attrgetter('text'),
                      get_y=attrgetter('number'),
                      splitter=TrainTestSplitter(test_size = 0.1, stratify=df_scores['number'], df=df_numbers, random_state = 24)
                      )

#For regressions

callbacks = [SaveModelCallback(),EarlyStoppingCallback(patience=3)]

learn = text_classifier_learner(dbunch_class, AWD_LSTM, drop_mult=0.5, loss_func=MSELossFlat(), wd = 0.1, y_range=(-3,3), cbs=callbacks).to_fp16()
learn = learn.load_encoder('finetuned6_208.pkl')

Thanks for the help!

If y’all will be working on an official implementation of this tomorrow, we’d also love to see this brought over from https://github.com/fastai/fastai/blob/master/fastai/text/interpret.py :slight_smile:


    def show_top_losses(self, k:int, max_len:int=70)->None:
        """
        Create a tabulation showing the first `k` texts in top_losses along with their prediction, actual, loss, and probability of
        actual class. `max_len` is the maximum number of tokens displayed; pass `None` to show the full text.

        The table is rendered as HTML through `IPython.display`; nothing is returned.
        """
        # Imported locally, so IPython is only needed when this method is called.
        from IPython.display import display, HTML
        items = []
        tl_val,tl_idx = self.top_losses()
        for i,idx in enumerate(tl_idx):
            # `k` is used as a countdown: stop once `k` rows have been collected.
            if k <= 0: break
            k -= 1
            tx,cl = self.data.dl(self.ds_type).dataset[idx]
            cl = cl.data
            classes = self.data.classes
            # Truncation splits on single spaces and keeps the first `max_len` pieces.
            txt = ' '.join(tx.text.split(' ')[:max_len]) if max_len is not None else tx.text
            # Row layout matches `names` below: text, predicted class, actual class,
            # loss, and the predicted probability of the actual class.
            tmp = [txt, f'{classes[self.pred_class[idx]]}', f'{classes[cl]}', f'{self.losses[idx]:.2f}',
                   f'{self.preds[idx][cl]:.2f}']
            items.append(tmp)
        items = np.array(items)
        names = ['Text', 'Prediction', 'Actual', 'Loss', 'Probability']
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)
        # `pd_max_colwidth` is defined outside this snippet; its value is applied
        # as pandas' `display.max_colwidth` only while the table is rendered.
        with pd.option_context('display.max_colwidth', pd_max_colwidth()):
            display(HTML(df.to_html(index=False)))

Originally mentioned here:

It would be really useful in fastai2.

If not, no problem. We all appreciate your work @sgugger and @muellerzr !

It already works in fastai v2, across applications.

1 Like

I’ll debug this in a bit.

1 Like

FYI, just fixed a critical bug in WeightDropout (it basically was not working), so if you get unexpected changes in AWD LSTMs, it probably comes from that.

2 Likes

I am trying to replicate the fastai2 text classification notebook in a Kaggle Kernel, but the TextDataLoaders generation tends to run on the CPU even when the GPU is enabled. Is CPU the default setup for the text API?

Additionally, the kernel dies because TextDataLoaders tries to use too much memory and too many processes. Is there a way to limit memory and core usage in fastai2?

Finished fixing all the mess due to this bug and made things cleaner (there is no longer two parameters, one being a duplicate of the other for instance). That also means models trained previously don’t have the same parameters, so loading might be harder across versions of fastai. I added a function that should automatically convert those weights but it may fail.

In any case, since WeightDropout wasn’t working properly, models should be retrained, ideally (also it was creating issues with predictions).

3 Likes

Just updated my editable installs of fastai2 and fastcore, currently getting this message when trying to import all from fastai2.text:

from fastai2.text.all import *
from fastai2.tabular.all import *
pd.set_option("display.max_columns", 50)
import seaborn as sns
sns.set(style='whitegrid')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-2-360028788016> in <module>
----> 1 from fastai2.text.all import *
      2 from fastai2.tabular.all import *
      3 pd.set_option("display.max_columns", 50)
      4 import seaborn as sns
      5 sns.set(style='whitegrid')

~/development/_training/fastai2/fastai2/text/all.py in <module>
----> 1 from ..basics import *
      2 from ..callback.all import *
      3 from .core import *
      4 from .data import *
      5 from .models import *

~/development/_training/fastai2/fastai2/basics.py in <module>
----> 1 from .data.all import *
      2 from .optimizer import *
      3 from .callback.core import *
      4 from .learner import *
      5 from .metrics import *

~/development/_training/fastai2/fastai2/data/all.py in <module>
      1 from ..torch_basics import *
----> 2 from .core import *
      3 from .load import *
      4 from .external import *
      5 from .transforms import *

~/development/_training/fastai2/fastai2/data/core.py in <module>
     33 
     34 # Cell
---> 35 @log_args(but_as=DataLoader.__init__)
     36 @delegates()
     37 class TfmdDL(DataLoader):

TypeError: log_args() got an unexpected keyword argument 'but_as'

Anyone else getting this? Going to revert to an earlier fastai2/fastcore version for now.

It looks like you don’t have the latest version of fastcore.

3 Likes

Ah yes, I forgot to do a git pull :man_facepalming:

1 Like

Could someone help me with this?

I am trying to load the following data into a TextDataLoader:

id comment_text toxic severe_toxic obscene threat insult identity_hate
0 0000997932d777bf Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren’t vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don’t remove the template from the talk page since I’m retired now.89.205.38.27 0 0 0 0 0 0
1 000103f0d9cfb60f D’aww! He matches this background colour I’m seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) 0 0 0 0 0 0
2 000113f07ec002fd Hey man, I’m really not trying to edit war. It’s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info. 0 0 0 0 0 0
3 0001b41b1c6bb37e "\nMore\nI can’t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ““types of accidents”” -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It’s listed in the relevant form eg Wikipedia:Good_ar… 0 0 0 0 0 0
4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember what page that’s on? 0 0 0 0 0 0

I use the following line of code:

dls = TextDataLoaders.from_csv(data_drive, csv_fname="train.csv",valid_pct=0.1,text_col="comment_text",label_col=["toxic","severe_toxic","obscene","threat","insult","identity_hate"])

I end up getting an error and I can’t find why:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-5-3c53bcdc9583> in <module>()
      1 default_device(None)
      2 
----> 3 dls = TextDataLoaders.from_csv(data_drive, csv_fname="train.csv",valid_pct=0.1,text_col="comment_text",label_col=["toxic","severe_toxic","obscene","threat","insult","identity_hate"])

16 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   5272             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5273                 return self[name]
-> 5274             return object.__getattribute__(self, name)
   5275 
   5276     def __setattr__(self, name: str, value) -> None:

AttributeError: 'Series' object has no attribute 'comment_text'

Check the examples here:

After the dataloader is done tokenizing the text, the text column is always labelled “text”

It’s asking for the tokenized text column, not the column in your original csv that has the text you want to analyze. That can be confusing, because in this example, the actual untokenized text was ALSO in a “text” column originally! I’ve never done this with .from_csv, when I import data from a csv, I first import it to a dataframe:

df = pd.read_csv('myamazing.csv', low_memory=False)

Then throw that into a datablock:

imdb_clas = DataBlock(blocks=(TextBlock.from_df(['description'], vocab=dbunch.vocab), RegressionBlock),
                          get_x=attrgetter('text'),
                          get_y=attrgetter('number')
                          )

And throw that into the dataloader:

dbunch_class = imdb_clas.dataloaders(df, bs=64, seq_len=80)

In this example, my csv has two columns:

description | number

Hope this helps!

1 Like