Multi-label classification with Bert + Fastai returns same prediction every time

I’ve tried to reproduce the BERT + Fastai article but have run into an issue. @abhikjha

My code is in this gist. It will run on Colab…

It doesn’t seem to work for cases with two labels (I’ve yet to try with five, as used in the original; but it doesn’t work for two).

I’m encountering two errors:

The model assigns the same probability to every text snippet (which is not high enough to classify any text as one of the two classes). The “accuracy” is just a measure of how many snippets are NOT labeled as one of the two classes. This is clear in how the accuracy is not changing over epochs.

I’ve made only a few modifications from the article.

  1. I removed three of the columns from the jigsaw dataset, so this is still a multi-label classification, but there are only two classes to choose from rather than five.
  2. As a result, when BERT is imported, I do: bert_model_class = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) (setting num_labels to two)
  3. I don’t split the layers for discriminative learning rates.
  4. In the original post, fit_one_cycle is parameterized for having more layers, i.e.: .fit_one_cycle(2, max_lr=slice(1e-5, 5e-4), moms=(0.8,0.7), pct_start=0.2, wd =(1e-7, 1e-5, 1e-4, 1e-3, 1e-2)). I chose simple defaults (picked from the learning rate finder), so I am not passing in tuples.

Does anyone know why this is happening? I will try with more classes but wanted to check here first…

In case it’s not obvious, the dataset looks like this.

1 Like

I was able to get this to work. For now, I am running with one classification column, but this should be adaptable.

class Config(dict):
    """Settings container whose entries are available both as keys and attributes.

    Every keyword passed to the constructor, and every pair passed to
    `set`, is stored as a dict item (`cfg["name"]`) and mirrored as an
    instance attribute (`cfg.name`).
    """

    def __init__(self, **kwargs):
        """Store each keyword argument as a dict item and as an attribute."""
        # Fill the underlying dict too; otherwise constructor values would be
        # reachable as attributes only, unlike values added through `set`.
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def set(self, key, val):
        """Add or overwrite one setting, keeping dict item and attribute in sync."""
        self[key] = val
        setattr(self, key, val)
# Experiment settings shared by the snippets below.
# NOTE(review): this call is cut off in the excerpt; the remaining keyword
# arguments and the closing parenthesis are not shown. Later code reads
# config.bert_model_name, config.epochs and config.max_lr, so presumably
# they are defined here; confirm against the full source.
config = Config(
    num_labels=2, # 0 or 1


class FastAITokenizer():
    """Build the tokenizer and vocab objects for the data pipeline from a pretrained BERT tokenizer.

    Loads a `BertTokenizer` by model name and exposes it wrapped in a
    `Tokenizer` (via `FastAIBertTokenizer`) and as a `Vocab`.
    """

    def __init__(self, model_name: str, max_seq_len: int=128, do_lower_case: bool=True, **kwargs):
        """Load the pretrained tokenizer.

        model_name: identifier passed to `BertTokenizer.from_pretrained`.
        max_seq_len: maximum sequence length handed to `FastAIBertTokenizer`.
        do_lower_case: forwarded to `BertTokenizer.from_pretrained`.
        **kwargs: accepted for call-site compatibility; not used.
        """
        # bert_tokenizer() reads this attribute, so it must be stored here.
        self.max_seq_len = max_seq_len
        self.bert_tok = BertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)

    def bert_tokenizer(self, pre_rules=None, post_rules=None):
        """Return a `Tokenizer` whose `tok_func` is the wrapped BERT tokenizer.

        pre_rules / post_rules: rule lists forwarded to `Tokenizer`. When
        omitted, empty lists are passed explicitly (not `None`), so no
        rules are requested.
        """
        # None sentinels avoid sharing one mutable default list across calls.
        pre_rules = [] if pre_rules is None else list(pre_rules)
        post_rules = [] if post_rules is None else list(post_rules)
        return Tokenizer(
            tok_func=FastAIBertTokenizer(self.bert_tok, max_seq_len=self.max_seq_len),
            pre_rules=pre_rules,
            post_rules=post_rules,
        )

    def fastai_bert_vocab(self):
        """Return a `Vocab` built from the BERT tokenizer's vocabulary keys, in their stored order."""
        return Vocab(list(self.bert_tok.vocab.keys()))

class FastAIBertTokenizer(BaseTokenizer):
    """Adapter exposing a pretrained `BertTokenizer` through the `BaseTokenizer` interface.

    Instances are meant to be passed as the `tok_func` of a `Tokenizer`.
    """

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int = 128, **kwargs):
        """Keep the wrapped tokenizer and the sequence-length limit; extra kwargs are ignored."""
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        """Return this instance, whatever the arguments.

        NOTE(review): presumably lets an already-configured instance stand in
        where a tokenizer class would be called to construct one; confirm
        against how `Tokenizer` uses `tok_func`.
        """
        return self

    def tokenizer(self, t: str) -> List[str]:
        """Tokenize `t`, truncate, and wrap the result in "[CLS]" ... "[SEP]".

        Two slots are reserved for the special tokens, so at most
        `max_seq_len - 2` word pieces are kept.
        """
        budget = self.max_seq_len - 2
        pieces = self._pretrained_tokenizer.tokenize(t)
        return ["[CLS]", *pieces[:budget], "[SEP]"]

class BertLearner(Learner):
    """`Learner` with helpers for freezing BERT layers and collecting predictions."""

    def unfreeze_all_layers(self)->None:
        """Mark every parameter of the model as trainable."""
        for name, param in self.model.named_parameters():
            param.requires_grad = True

    def freeze_embeddings(self)->None:
        """Freeze embedding and LayerNorm parameters; make all others trainable."""
        for name, param in self.model.named_parameters():
            # FIXME: check if any batchnorm layer present, set to False
            frozen = ('embeddings' in name) or ('LayerNorm' in name)
            param.requires_grad = not frozen

    def freeze_encoders_to(self, n=12)->None:
        """Freeze embeddings, LayerNorm and encoder layers with index below `n`.

        The layer index is the first purely numeric component of the dotted
        parameter name. All remaining parameters are made trainable.
        """
        for name, param in self.model.named_parameters():
            frozen = ('embeddings' in name) or ('LayerNorm' in name)
            if not frozen and 'encoder' in name:
                # Only encoder parameters carry a layer index; a name without
                # a numeric component is treated as not belonging to a layer.
                indices = [int(s) for s in name.split(".") if s.isdigit()]
                frozen = bool(indices) and indices[0] < n
            param.requires_grad = not frozen

    def freeze_all_layers(self):
        """Freeze every parameter under `self.model.bert`; other parameters are untouched."""
        for name, param in self.model.bert.named_parameters():
            param.requires_grad = False

    def print_trainable_layers(self):
        """Print the name of each parameter that currently requires gradients."""
        for name, param in self.model.named_parameters():
            if param.requires_grad: print(name)

    def get_ordered_preds(self, ds_type:DatasetType=DatasetType.Valid, with_loss:bool=False, n_batch:Optional[int]=None, pbar:Optional[PBar]=None,
              ordered:bool=True) -> List[Tensor]:
        "Return predictions and targets on the valid, train, or test set, depending on `ds_type`."
        #FIXME: check if this is required. reset is done in fastai. implement if require for BERT also
        # NOTE(review): this reseeds NumPy's global RNG, presumably so the
        # sampler yields the same order here and inside get_preds; confirm.
        if ordered: np.random.seed(42)
        preds = self.get_preds(ds_type=ds_type, with_loss=with_loss, n_batch=n_batch, pbar=pbar)
        if ordered and hasattr(self.dl(ds_type), 'sampler'):
            # argsort of the sampled indices is the permutation that puts the
            # outputs back into dataset order.
            sampler = list(self.dl(ds_type).sampler)
            reverse_sampler = np.argsort(sampler)
            preds = [p[reverse_sampler] for p in preds]
        return preds

    def get_predictions(self, ds_type:DatasetType=DatasetType.Valid, with_loss:bool=False, n_batch:Optional[int]=None, pbar:Optional[PBar]=None,
              ordered:bool=True):
        """Return `(preds, pred_values, true_labels)` for the chosen dataset.

        `pred_values` is the argmax of `preds` along axis 1. Arguments are
        forwarded unchanged to `get_ordered_preds`.
        """
        outputs = self.get_ordered_preds(ds_type=ds_type, with_loss=with_loss, n_batch=n_batch, pbar=pbar, ordered=ordered)
        # Index rather than unpack so an extra trailing element (e.g. when
        # with_loss=True) does not break the call.
        preds, true_labels = outputs[0], outputs[1]
        pred_values = np.argmax(preds, axis=1)
        return preds, pred_values, true_labels

    def print_metrics(self, preds, pred_values, true_labels):
        """Print accuracy (from `preds`) and F1 score (from `pred_values`) against `true_labels`."""
        acc = accuracy(preds, true_labels)
        f1s = f1_score(true_labels, pred_values)
        print(f"Accuracy={acc}, f1_score={f1s}")

    def load_best_model(self, model_name="bestmodel"):
        """Try to load the saved weights `model_name`; report the outcome instead of raising."""
        try:
            self.load(model_name, purge=False)
        except Exception as err:
            # Best-effort load: keep the current weights and say why it failed.
            print(f"Failed to load {model_name}: {err}")
        else:
            print(f"Loading {model_name}")

    def similar(self, text):
        """Return whether the category predicted for `text` has `obj == 1`."""
        category, _, _ = self.predict(text)
        return category.obj == 1


# Build the data bunch from the `train` / `val` DataFrames; batches are padded
# at the end (pad_first=False) with index 0.
# NOTE(review): this call is cut off in the excerpt; the remaining arguments
# and the closing parenthesis are not shown.
databunch = TextDataBunch.from_df(".", train_df=train,valid_df=val,
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),


# Pretrained BERT classifier with `config.num_labels` output labels.
bert_model = BertForSequenceClassification.from_pretrained(
    config.bert_model_name, num_labels=config.num_labels)

# NOTE(review): this call is also cut off in the excerpt; only the first two
# positional arguments are visible.
learner = BertLearner(databunch, bert_model,

# Train for `config.epochs` epochs with maximum learning rate `config.max_lr`.
learner.fit_one_cycle(config.epochs, max_lr=config.max_lr)

It mostly follows this tutorial.

Model is learning well before layer freezing.

1 Like