How to properly work with huggingface transformers in fastai

I have trained DistilBERT via fastai and huggingface for a sequence classification problem. I found a useful tutorial that gave a good example of how to do this with binary classification. The code is below:

# !pip install torch==1.9.0
# !pip install torchtext==0.10
# !pip install transformers==4.7
# !pip install fastai==2.4

from fastai.text.all import *
from sklearn.model_selection import train_test_split
import pandas as pd
import glob
from transformers import AutoTokenizer, AutoModelForSequenceClassification


hf_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

"""
train_df and valid_df look like this:

      label  text
4240  5      whoa interesting.
13    7      you could you could we just
4639  4      you set the goal,
28    1      because ive already agreed to that
66    8      oh hey freshman thats you gona need
"""

print(list(train_df.label.value_counts().index))
"""
[4, 1, 5, 6, 7, 0, 2, 3, 8]
"""

class HF_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, hf_tokenizer):
        self.df = df
        self.hf_tokenizer = hf_tokenizer
        
        # collapse the original 9 labels into 2 classes (the tutorial's binary setup)
        self.label_map = {
            0: 0, 1: 0, 2: 0, 3: 0,
            4: 1, 5: 1, 6: 1, 7: 1, 8: 1
        }
        
    def __len__(self):
        return len(self.df)

    def decode(self, token_ids):
        return ' '.join([self.hf_tokenizer.decode(x) for x in token_ids])
    
    def decode_to_original(self, token_ids):
        return self.hf_tokenizer.decode(token_ids.squeeze())

    def __getitem__(self, index):
        label, text = self.df.iloc[index]
        label = self.label_map[label]
        label = torch.tensor(label)

        tokenizer_output = self.hf_tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        
        tokenizer_output['input_ids'].squeeze_()
        tokenizer_output['attention_mask'].squeeze_()
        
        return tokenizer_output, label
        

train_dataset = HF_Dataset(train_df, hf_tokenizer)
valid_dataset = HF_Dataset(valid_df, hf_tokenizer)
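
# Quick peek at one item (my own sanity check): x is a dict of 512-long tensors,
# y is a scalar label tensor
x, y = train_dataset[0]
print(list(x.keys()), x['input_ids'].shape, y)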

train_dl = DataLoader(train_dataset, bs=16, shuffle=True)
valid_dl = DataLoader(valid_dataset, bs=16)
dls = DataLoaders(train_dl, valid_dl)
# quick sanity check that the raw HF model accepts one collated batch
batched_data, labels = next(iter(train_dl))
hf_model(**batched_data)


class HF_Model(nn.Module):
  
    def __init__(self, hf_model):
        super().__init__()
        
        self.hf_model = hf_model
        
    def forward(self, tokenizer_outputs):
        
        model_output = self.hf_model(**tokenizer_outputs)
        
        return model_output.logits
        
# Manually pop the model onto the gpu, since the data comes through as a dictionary
# (fastai doesn't automatically place model + data on the gpu otherwise)
model = HF_Model(hf_model).cuda()
learn = Learner(dls, model, loss_func=nn.CrossEntropyLoss(), metrics=[accuracy])
learn.fit_one_cycle(3, 1e-4)

This trains like a normal fastai pipeline. However, after training (fit_one_cycle), all further attempts to work with the learn object break.

When I run:

interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(title='Confusion matrix')

I get:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-51-93b05a3209d9> in <module>
      1 # predict the validation set with our model
----> 2 interp = ClassificationInterpretation.from_learner(learn)
      3 interp.plot_confusion_matrix(title='Confusion matrix')

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/interpret.py in from_learner(cls, learn, ds_idx, dl, act)
     27         "Construct interpretation object from a learner"
     28         if dl is None: dl = learn.dls[ds_idx].new(shuffled=False, drop_last=False)
---> 29         return cls(dl, *learn.get_preds(dl=dl, with_input=True, with_loss=True, with_decoded=True, act=None))
     30 
     31     def top_losses(self, k=None, largest=True):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, inner, reorder, cbs, **kwargs)
    258                 res[pred_i] = act(res[pred_i])
    259                 if with_decoded: res.insert(pred_i+2, getattr(self.loss_func, 'decodes', noop)(res[pred_i]))
--> 260             if reorder and hasattr(dl, 'get_idxs'): res = nested_reorder(res, tensor(idxs).argsort())
    261             return tuple(res)
    262         self._end_cleanup()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/torch_core.py in nested_reorder(t, idxs)
    710     "Reorder all tensors in `t` using `idxs`"
    711     if isinstance(t, (Tensor,L)): return t[idxs]
--> 712     elif is_listy(t): return type(t)(nested_reorder(t_, idxs) for t_ in t)
    713     if t is None: return t
    714     raise TypeError(f"Expected tensor, tuple, list or L but got {type(t)}")

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/torch_core.py in <genexpr>(.0)
    710     "Reorder all tensors in `t` using `idxs`"
    711     if isinstance(t, (Tensor,L)): return t[idxs]
--> 712     elif is_listy(t): return type(t)(nested_reorder(t_, idxs) for t_ in t)
    713     if t is None: return t
    714     raise TypeError(f"Expected tensor, tuple, list or L but got {type(t)}")

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/torch_core.py in nested_reorder(t, idxs)
    712     elif is_listy(t): return type(t)(nested_reorder(t_, idxs) for t_ in t)
    713     if t is None: return t
--> 714     raise TypeError(f"Expected tensor, tuple, list or L but got {type(t)}")
    715 
    716 # Cell

TypeError: Expected tensor, tuple, list or L but got <class 'dict'>

When I run:

learn.export('export')

I get:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-52-4de7bc9fac6f> in <module>
----> 1 learn.export('export')

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in export(self, fname, pickle_module, pickle_protocol)
    367     self._end_cleanup()
    368     old_dbunch = self.dls
--> 369     self.dls = self.dls.new_empty()
    370     state = self.opt.state_dict() if self.opt is not None else None
    371     self.opt = None

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/data/core.py in new_empty(self)
    144     def __getitem__(self, i): return self.loaders[i]
    145     def new_empty(self):
--> 146         loaders = [dl.new(dl.dataset.new_empty()) for dl in self.loaders]
    147         return type(self)(*loaders, path=self.path, device=self.device)
    148 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/data/core.py in <listcomp>(.0)
    144     def __getitem__(self, i): return self.loaders[i]
    145     def new_empty(self):
--> 146         loaders = [dl.new(dl.dataset.new_empty()) for dl in self.loaders]
    147         return type(self)(*loaders, path=self.path, device=self.device)
    148 

AttributeError: 'HF_Dataset' object has no attribute 'new_empty'

When I run:

sample_text = test_df.text.values[0]
# sample_text = 'hey how are you'
learn.predict(sample_text)

I get:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-53-7e8410893154> in <module>
----> 1 learn.predict(sample_text)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in predict(self, item, rm_type_tfms, with_input)
    264     def predict(self, item, rm_type_tfms=None, with_input=False):
    265         dl = self.dls.test_dl([item], rm_type_tfms=rm_type_tfms, num_workers=0)
--> 266         inp,preds,_,dec_preds = self.get_preds(dl=dl, with_input=True, with_decoded=True)
    267         i = getattr(self.dls, 'n_inp', -1)
    268         inp = (inp,) if i==1 else tuplify(inp)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, inner, reorder, cbs, **kwargs)
    251         if with_loss: ctx_mgrs.append(self.loss_not_reduced())
    252         with ContextManagers(ctx_mgrs):
--> 253             self._do_epoch_validate(dl=dl)
    254             if act is None: act = getattr(self.loss_func, 'activation', noop)
    255             res = cb.all_tensors()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_epoch_validate(self, ds_idx, dl)
    201         if dl is None: dl = self.dls[ds_idx]
    202         self.dl = dl
--> 203         with torch.no_grad(): self._with_events(self.all_batches, 'validate', CancelValidException)
    204 
    205     def _do_epoch(self):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in all_batches(self)
    167     def all_batches(self):
    168         self.n_iter = len(self.dl)
--> 169         for o in enumerate(self.dl): self.one_batch(*o)
    170 
    171     def _do_one_batch(self):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in one_batch(self, i, b)
    192         b = self._set_device(b)
    193         self._split(b)
--> 194         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    195 
    196     def _do_epoch_train(self):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_one_batch(self)
    170 
    171     def _do_one_batch(self):
--> 172         self.pred = self.model(*self.xb)
    173         self('after_pred')
    174         if len(self.yb):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-16-3f4e8a42b8ff> in forward(self, tokenizer_outputs)
      9     def forward(self, tokenizer_outputs):
     10 
---> 11         model_output = self.hf_model(**tokenizer_outputs)
     12 
     13         return model_output.logits

TypeError: DistilBertForSequenceClassification object argument after ** must be a mapping, not str

The fact that all of these normal fastai procedures are breaking indicates there must be some more general problem with how I have set things up. What do I need to do so that I can work with learn the way one normally does in a fastai pipeline?
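
In case it is relevant, here is what a single batch from my DataLoaders looks like. The inputs come through as a plain dict of tensors rather than a fastai tensor type, which I suspect is what the interpretation/export/predict machinery is tripping over (the snippet below is just my own sanity check, not from the tutorial):

xb, yb = next(iter(dls.train))
print(type(xb))               # a plain dict with 'input_ids' and 'attention_mask'
print(xb['input_ids'].shape)  # torch.Size([16, 512])
print(yb.shape)               # torch.Size([16])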

For reference:

import fastai, torch, transformers

print(fastai.__version__)
print(torch.__version__)
print(transformers.__version__)

2.4
1.9.0+cu102
4.7.0

Running on Ubuntu 18.04.