Error when training after loading learner

I’m having this problem with the current fastai version (1.0.43.post1):

If I run this toy example:

from fastai.vision import *
path = untar_data(URLs.MNIST)
data = (ImageItemList.from_folder(path/'training')
        .random_split_by_pct()
        .label_from_folder()
        .databunch())
learn = create_cnn(data, models.resnet18)
learn.fit_one_cycle(2, 1e-2)
learn.save('model')
learn.fit_one_cycle(2, 1e-3) # <--- this works

But if I load the model on a new python session and try to continue training:

from fastai.vision import *
path = untar_data(URLs.MNIST)
data = (ImageItemList.from_folder(path/'training')
        .random_split_by_pct()
        .label_from_folder()
        .databunch())
learn = create_cnn(data, models.resnet18)
learn.load('model')
learn.fit_one_cycle(2, 1e-3) # <--- this doesn't work
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-486c2b8b9d58> in <module>
----> 1 learn.fit_one_cycle(1, 1e-3)

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start, tot_epochs=tot_epochs, 
     21                                        start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    176         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    177         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 178             callbacks=self.callbacks+callbacks)
    180     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/utils/ipython.py in wrapper(*args, **kwargs)
    102         try:
--> 103             return func(*args, **kwargs)
    104         except Exception as e:
    105             if ("CUDA out of memory" in str(e) or

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     88             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     89                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 90                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     91                 if cb_handler.on_batch_end(loss): break

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     28         loss.backward()
     29         cb_handler.on_backward_end()
---> 30         opt.step()
     31         cb_handler.on_step_end()
     32         opt.zero_grad()

~/anaconda3/envs/ml/lib/python3.7/site-packages/fastai/callback.py in step(self)
     45                     for p in pg2['params']: p.data.mul_(1 - wd*lr)
     46             self.set_val('weight_decay', listify(0, self._wd))
---> 47         self.opt.step()
     49     def zero_grad(self)->None:

~/anaconda3/envs/ml/lib/python3.7/site-packages/torch/optim/adam.py in step(self, closure)
     92                 # Decay the first and second moment running average coefficient
---> 93                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
     94                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
     95                 if amsgrad:

RuntimeError: The size of tensor a (128) must match the size of tensor b (64) at non-singleton dimension 0

Can anyone else replicate this? I will try to look into the issue.

Apparently the problem has to do with the way the optimizer info is being saved; if the model is stored with learn.save('model', with_opt=False) there is no problem.

Update 2:
There is a mismatch in the shapes of the exponential moving averages (exp_avg) after loading the optimizer state:


Now it’s solved in the new update.
with_opt=False is no longer necessary.