Error when training after loading a learner

I’m having this problem with the current fastai version (1.0.43.post1):

If I run this toy example, everything works:

from fastai.vision import *
path = untar_data(URLs.MNIST)
data = (ImageItemList.from_folder(path/'training')
                     .random_split_by_pct()
                     .label_from_folder()
                     .databunch())
learn = create_cnn(data, models.resnet18)
learn.fit_one_cycle(2, 1e-2)
learn.save('model')
learn.load('model')
learn.fit_one_cycle(2, 1e-3) # <--- this works

But if I load the model in a new Python session and try to continue training, I get an error:

from fastai.vision import *
path = untar_data(URLs.MNIST)
data = (ImageItemList.from_folder(path/'training')
                     .random_split_by_pct()
                     .label_from_folder()
                     .databunch())
learn = create_cnn(data, models.resnet18)  # the learner has to be rebuilt in the new session
learn.load('model')
learn.fit_one_cycle(2, 1e-3) # <--- this doesn't work
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-486c2b8b9d58> in <module>
----> 1 learn.fit_one_cycle(1, 1e-3)

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start, tot_epochs=tot_epochs, 
     21                                        start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     23 
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    176         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    177         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 178             callbacks=self.callbacks+callbacks)
    179 
    180     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/utils/mem.py in wrapper(*args, **kwargs)
    101 
    102         try:
--> 103             return func(*args, **kwargs)
    104         except Exception as e:
    105             if ("CUDA out of memory" in str(e) or

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     88             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     89                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 90                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     91                 if cb_handler.on_batch_end(loss): break
     92 

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     28         loss.backward()
     29         cb_handler.on_backward_end()
---> 30         opt.step()
     31         cb_handler.on_step_end()
     32         opt.zero_grad()

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/callback.py in step(self)
     45                     for p in pg2['params']: p.data.mul_(1 - wd*lr)
     46             self.set_val('weight_decay', listify(0, self._wd))
---> 47         self.opt.step()
     48 
     49     def zero_grad(self)->None:

~/anaconda3/envs/ml/lib/python3.7/site-packages/torch/optim/adam.py in step(self, closure)
     91 
     92                 # Decay the first and second moment running average coefficient
---> 93                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
     94                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
     95                 if amsgrad:

RuntimeError: The size of tensor a (128) must match the size of tensor b (64) at non-singleton dimension 0

Can anyone else replicate this? I will try to look into the issue.

update:
Apparently the problem has to do with the way the optimizer state is saved: if the model is stored with learn.save('model', with_opt=False), there is no problem.
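
For reference, a minimal sketch of that workaround, using the same toy example as above (the learner is rebuilt with create_cnn in the fresh session before loading; the comment about the fresh optimizer being created is my assumption about why it works):

learn.save('model', with_opt=False)   # save the weights only, no optimizer state

# in a new Python session: rebuild data and the learner exactly as before, then
learn = create_cnn(data, models.resnet18)
learn.load('model')                   # only the weights are restored
learn.fit_one_cycle(2, 1e-3)          # trains fine, presumably because a fresh optimizer is created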

update 2:
After loading the optimizer, the shapes of the exponential moving averages (exp_avg) no longer match the gradients they are applied to.
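
A rough way to see the mismatch from a notebook (just a sketch: it assumes learn.opt is fastai's optimizer wrapper and that the underlying torch.optim.Adam is exposed as learn.opt.opt, as the traceback above suggests):

# compare each trainable parameter's shape with the exp_avg buffer stored for it
opt = learn.opt.opt  # underlying torch optimizer (assumption, see above)
for group in opt.param_groups:
    for p in group['params']:
        state = opt.state.get(p, {})
        if 'exp_avg' in state and state['exp_avg'].shape != p.shape:
            print('param:', tuple(p.shape), 'exp_avg:', tuple(state['exp_avg'].shape))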


update 3:
Now it's solved in the new fast.ai update; with_opt=False is no longer necessary.
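
With the fixed version, resuming in a fresh session should just be the original flow (a sketch under that assumption, with the optimizer state saved and restored by default):

learn.save('model')           # optimizer state is included by default

# new Python session: rebuild data and the learner as before, then
learn = create_cnn(data, models.resnet18)
learn.load('model')
learn.fit_one_cycle(2, 1e-3)  # continues training without the shape error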