I am trying to create a custom model similar to BERT (not for NLP), and I have been working in PyTorch. I'm hoping to wrap the final model in fastai's `Learner` class for training so I can use `fit_one_cycle`, `freeze`, etc. However, there is one tricky part of the training related to how BERT masks tokens for prediction. The code below fires off model training much like `fit_one_cycle` would, and I need to hook the masking step (where I call the `mask_tokens` function) into the fastai code so it trains the way I want. I'm struggling to navigate the fastai source (my Python experience is mostly limited to hacking on existing code and making minor changes); the furthest I've gotten is the rough callback sketch after the code below. Can someone please offer some suggestions?
```python
import os

import torch
from tqdm import tqdm, trange

# arg, tok, optimizer, scheduler, output_dir, set_seed and mask_tokens are all
# defined at module level elsewhere in my script.

def train(train_dataloader, model, config):
    if arg.fp16:
        from apex import amp  # NVIDIA Apex provides the fp16 loss scaling used below
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(arg.num_train_epochs), desc="Epoch",
                            disable=arg.local_rank not in [-1, 0])
    set_seed(arg.seed_value, arg.n_gpu)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=arg.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = batch.float()  # to resolve the runtime error
            model.train()
            # BERT-style masking: this is the step I need to get into fastai
            inputs_, labels, amounts, masked_indices = mask_tokens(batch, tok, config)
            inputs = {"input_data": inputs_,
                      "labels": labels,
                      "amounts": amounts}
            outputs = model(**inputs)
            loss = outputs[0]  # the model computes the loss internally and returns it first
            if arg.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if arg.gradient_accumulation_steps > 1:
                loss = loss / arg.gradient_accumulation_steps
            if arg.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            print(loss)
            tr_loss += loss.item()
            if (step + 1) % arg.gradient_accumulation_steps == 0:
                if arg.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), arg.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), arg.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if (arg.local_rank in [-1, 0] and arg.save_steps > 0
                        and global_step % arg.save_steps == 0):
                    # Save model checkpoint
                    output_dir_ = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir_):
                        os.makedirs(output_dir_)
                    model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir_)
            if arg.max_steps > 0 and global_step > arg.max_steps:
                epoch_iterator.close()
                break
        if arg.max_steps > 0 and global_step > arg.max_steps:
            train_iterator.close()
            break
```
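
From what I can tell from the fastai v1 docs, a `Callback` overriding `on_batch_begin` (the same hook `MixUpCallback` uses to swap out the batch) might be the place to apply the masking. Below is a rough sketch of what I mean; it assumes my `DataBunch` yields the raw batch as `x`, that I refactor my model's `forward` to take `(input_data, amounts)` and return predictions, and that I move the masked loss computation into the `Learner`'s `loss_func`, since fastai computes the loss outside the model. Is this on the right track, or is there a better hook?

```python
from functools import partial

from fastai.basic_train import Learner, LearnerCallback

class MaskTokensCallback(LearnerCallback):
    "Apply BERT-style token masking to each batch before the forward pass."
    def __init__(self, learn, tok, config):
        super().__init__(learn)
        self.tok, self.config = tok, config

    def on_batch_begin(self, last_input, last_target, train, **kwargs):
        # Same mask_tokens as in my training loop above; the masked labels
        # become the new target that loss_func will receive.
        inputs_, labels, amounts, masked_indices = mask_tokens(last_input, self.tok, self.config)
        # Returning these dict keys is how a fastai v1 callback replaces the
        # batch; the tuple gets unpacked into model(inputs_, amounts).
        return {'last_input': (inputs_, amounts), 'last_target': labels}

# Roughly how I imagine wiring it up:
# learn = Learner(data, model, loss_func=my_masked_loss,
#                 callback_fns=[partial(MaskTokensCallback, tok=tok, config=config)])
# learn.fit_one_cycle(1)
```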