Do I have to use both learn.save and learn.export to save both model architecture and weights?

Hi,

Yesterday evening I saved my intermediate model using learn.save('modelname').
However, when I load it today and check the confusion matrix on my test set to see whether it loaded correctly, I suddenly get a very bad confusion matrix. Is it possible that the save only stores my model architecture but not the weights I had yesterday evening? How do I save both the model architecture and the weights? Do I have to do both learn.save and learn.export?

learn.save('25epochs_with_testdata')
df_test.to_pickle('df_test')
df_valid.to_pickle('df_valid')
df_train.to_pickle('df_train')

df_test = pd.read_pickle('df_test')
df_valid = pd.read_pickle('df_valid')
df_train = pd.read_pickle('df_train')
learn = learn.load('25epochs_with_testdata')
preds, pred_values = get_preds_as_nparray(DatasetType.Test)
predicted = pd.DataFrame(pred_values)
predicted.columns = ['predicted']
testcf = df_test.join(predicted)
pd.crosstab(testcf.ylabel, testcf.predicted, margins=True, margins_name='Total')

gives now:
[confusion matrix image]

while I had:
[confusion matrix image]

I’m using fastai v1.0.61 with the Roberta model for Dutch:

class FastAiRobertaTokenizer(BaseTokenizer):
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len
    def __call__(self, *args, **kwargs):
        return self
    def tokenizer(self, t:str) -> List[str]:
        # wrap the tokens in RoBERTa's <s> ... </s> start/end tokens
        return ["<s>"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["</s>"]

class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
        super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=fastai_roberta_vocab, **kwargs)

def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor(vocab=vocab)]
    #return [RobertaTokenizeProcessor(tokenizer=tokenizer), RobertaNumericalizeProcessor(vocab=vocab)]

# create a DataBunch for Roberta
class RobertaDataBunch(TextDataBunch):
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False,
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transforms the datasets into a DataBunch for classification. Passes **dl_kwargs on to DataLoader()"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

# create a Roberta-specific TextList class
class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

feat_cols = "tekst_cleaned"
label_cols = "result"
# initialize our fastai tokenizer
fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(dtokenizer, max_seq_len=512), pre_rules=[], post_rules=[])

def set_seed(x=42): # must run before creating the databunch, as it has an internal random.Random
    random.seed(x)
    np.random.seed(x)
    torch.manual_seed(x)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(x)

set_seed(125)

processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)
data = (RobertaTextList.from_df(df_train, ".", cols=feat_cols, processor=processor)
        .split_from_df(col='valid')
        .label_from_df(cols=label_cols, label_cls=CategoryList)
        .add_test(RobertaTextList.from_df(df_test, ".", cols=feat_cols, processor=processor))
        .databunch(bs=32, pad_first=False, pad_idx=0)) # pad up to the maximum sequence length and ignore pads

import torch
import torch.nn as nn
from transformers import RobertaModel
#from transformers import RobertaForSequenceClassification

class CustomRobertatModel(nn.Module):
    def __init__(self, num_labels=2):
        super(CustomRobertatModel, self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained("pdelobelle/robbert-v2-dutch-base")
        #self.roberta = RobertaForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base")
        self.dropout = nn.Dropout(.05)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # return_dict=False added, cf. https://stackoverflow.com/questions/66846030/typeerror-linear-argument-input-position-1-must-be-tensor-not-str
        _, pooled_output = self.roberta(input_ids, token_type_ids, attention_mask, return_dict=False)
        logits = self.classifier(pooled_output)
        return logits

roberta_model = CustomRobertatModel(num_labels=5)

learn = Learner(data, roberta_model, metrics=[accuracy])

learn.model.roberta.train() # set roberta into train mode
learn.fit_one_cycle(1, max_lr=1e-5)

… trained a couple more epochs and saved the model

learn.save('25epochs_with_testdata')

On the contrary, actually: Fastai saves solely the parameters, not the architecture, so rest assured that’s not the issue. I can’t examine your code right now, but note your model always predicts zero, which is worth pursuing.
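To make that concrete, here is a minimal sketch of the difference as I understand it in fastai v1 (file names taken from your code):

learn.save('25epochs_with_testdata')          # writes models/25epochs_with_testdata.pth: the weights (state dict), not the architecture
learn = learn.load('25epochs_with_testdata')  # therefore needs the Learner (data + model) rebuilt in code first
learn.export('export.pkl')                    # pickles the whole Learner: model, weights and processors, but with dummy data
learn = load_learner(path, 'export.pkl')      # recreates the Learner without rebuilding the model in code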

By the way, fastai has built-in methods for plotting a confusion matrix and the most incorrect predictions, so you don't need to do it yourself.
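Something along these lines, assuming the standard fastai v1 interpretation API:

interp = ClassificationInterpretation.from_learner(learn, ds_type=DatasetType.Valid)
interp.plot_confusion_matrix(figsize=(6, 6))   # confusion matrix on the validation set
interp.most_confused(min_val=2)                # the label pairs the model mixes up most often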

Best of luck!

I now tried an export, but then somehow I can't do batch scoring on the test part of the DataBunch anymore after reloading the exported model.

# export the learner
learn.export()

# reload
path = Path()
learn = load_learner(path=path, file='export.pkl')

# reload the databunch
data = load_data(path, 'train_and_valid_and_test_data')

# batch scoring of the test part of the databunch
preds, pred_values = get_preds_as_nparray(DatasetType.Test)
predicted = pd.DataFrame(pred_values)
predicted.columns = ['predicted']
testcf = df_test.join(predicted)
pd.crosstab(testcf.ylabel, testcf.predicted, margins=True, margins_name='Total')

gives an error:

I can get a prediction for a single line when loading my test data back in from a pickle file:
df_test = pd.read_pickle('df_test')
i = 0
predictie = learn.predict(df_test['tekst_cleaned'][i])
but then predictie[0] is a Category and I don't know how to add it to a numpy array so that I can use a for loop to make the predictions for each line of df_test.

The reason you’re receiving the error is that when you load an exported Learner, the data is just dummy data, and there’s therefore no test set. You need to manually do learn.data = data for batch inference using the Learner’s built-in methods.
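Roughly like this, assuming 'train_and_valid_and_test_data' is a DataBunch you saved earlier with data.save(...) (a sketch, not tested against your code):

learn = load_learner(path, 'export.pkl')                 # comes back with dummy data, no test set
data = load_data(path, 'train_and_valid_and_test_data')
learn.data = data                                        # reattach the real DataBunch, including its test set
preds, y = learn.get_preds(ds_type=DatasetType.Test)     # note: the test DataLoader uses a SortSampler, so the rows
                                                         # come back reordered; your get_preds_as_nparray helper
                                                         # presumably restores the original order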

Regarding your second problem, you can do int(category) to get the index of the label or str(category) for its name.
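So a plain loop over df_test could look like this (a sketch; 'tekst_cleaned' and 'predicted' follow the column names in your code):

df_test = pd.read_pickle('df_test')
predicted = []
for text in df_test['tekst_cleaned']:
    category, _, _ = learn.predict(text)   # learn.predict returns (Category, class index tensor, probabilities)
    predicted.append(int(category))        # or str(category) for the label name
df_test['predicted'] = predicted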

Please do let me know if you have any other questions!

Cheers!

I found out why it suddenly gave very bad predictions: an index mismatch … Solved :slight_smile: Thanks for your help!

I do however have a final question. I followed the methodology from Using RoBERTa with fast.ai for NLP | by Dev Sharma | Analytics Vidhya | Medium to use the Dutch RoBERTa model as a pretrained language model for my classification problem on Dutch conversations.
However, my data contains some sector-specific terms which the RoBERTa model doesn't know.
So, I was wondering how I could now also relearn the word embeddings of the RoBERTa model for these unknown words. I can't use unfreeze(-2) on my classification learner as was shown in the course, because according to it I only have one layer (the linear layer added on top of the pretrained model).

Is there a way I can also use fastai to refine the word embeddings of a pretrained RoBERTa model using the language_model_learner?

Happy to hear that!

On a high level, you would need to create embeddings and tokens for the unseen terms. Concretely, say RoBERTa's vocabulary consists of 1,000 words, but your text has three new ones. All you have to do is add three rows to RoBERTa's embedding layer and set their initial values to the average of the original 1,000 embeddings. For tokenization, the three new words get tokens 1,000, 1,001, and 1,002 (assuming the words RoBERTa already knows have tokens from 0 to 999 inclusive), and that's essentially it.
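On the Hugging Face side of your setup, a hedged sketch of that recipe could look like this (add_tokens and resize_token_embeddings are standard transformers methods; the word list and the mean initialisation are just illustrative):

import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")
model = RobertaModel.from_pretrained("pdelobelle/robbert-v2-dutch-base")

new_words = ["sectorterm1", "sectorterm2", "sectorterm3"]   # hypothetical domain-specific terms
num_added = tokenizer.add_tokens(new_words)                 # they receive the next free token ids
model.resize_token_embeddings(len(tokenizer))               # appends rows to the embedding matrix

if num_added > 0:
    with torch.no_grad():
        emb = model.get_input_embeddings().weight            # shape: (vocab size, hidden size)
        emb[-num_added:] = emb[:-num_added].mean(dim=0)      # initialise the new rows to the mean of the old ones

You would also need to extend the fastai vocab (your fastai_roberta_vocab) with the same words in the same order so the numericalizer stays in sync; the new rows then get fine-tuned along with the rest of the model.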

Maybe you could draw inspiration from fastai:

def convert_weights(wgts:Weights, stoi_wgts:Dict[str,int], itos_new:Collection[str]) -> Weights:
    "Convert the model `wgts` to go with a new vocabulary."
    dec_bias, enc_wgts = wgts.get('1.decoder.bias', None), wgts['0.encoder.weight']
    wgts_m = enc_wgts.mean(0)
    if dec_bias is not None: bias_m = dec_bias.mean(0)
    new_w = enc_wgts.new_zeros((len(itos_new),enc_wgts.size(1))).zero_()
    if dec_bias is not None: new_b = dec_bias.new_zeros((len(itos_new),)).zero_()
    for i,w in enumerate(itos_new):
        r = stoi_wgts[w] if w in stoi_wgts else -1
        new_w[i] = enc_wgts[r] if r>=0 else wgts_m
        if dec_bias is not None: new_b[i] = dec_bias[r] if r>=0 else bias_m
    wgts['0.encoder.weight'] = new_w
    if '0.encoder_dp.emb.weight' in wgts: wgts['0.encoder_dp.emb.weight'] = new_w.clone()
    wgts['1.decoder.weight'] = new_w.clone()
    if dec_bias is not None: wgts['1.decoder.bias'] = new_b
    return wgts
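For reference, fastai applies this when loading pretrained language-model weights, roughly like (a sketch of the call, names as in fastai v1):

    wgts = convert_weights(old_wgts, old_stoi, data.vocab.itos)

where old_stoi maps the pretrained vocabulary's words to their row indices and data.vocab.itos is the new vocabulary; rows for unseen words are filled with the mean embedding, just as described above.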

Have a great weekend!

Many thanks for your help!

One thing I still want to check: as long as I haven't retrained the word embeddings of RoBERTa to capture the unseen terms correctly, what happens with the unseen terms? Am I interpreting it correctly that they are absorbed into the word embedding of the 'unknown' token, or is there normally no word embedding for the 'unknown' token, so that nothing at all is done with them when doing transfer learning on the word embeddings of the RoBERTa model?