Can anyone diagnose this bug?
I have made a lot of alterations to the model and built my own Dataset outside of fastai. Only the final layer is unfrozen. The model and inputs are all on CUDA.
learn.lr_find() #Runs ok
lr1 = 1e-3
for i in range(1):
learn.fit_one_cycle(1,lr_max=lr1) #Trains ok
learn.lr_find() #Runs ok
lr1 = 1e-5
for i in range(1):
learn.fit_one_cycle(1,lr_max=lr1) #Gives the error below.
If I do NOT run lr_find() between the two fit_one_cycle() calls, both run OK. So it seems that the second lr_find() is somehow causing the second fit_one_cycle() to fail.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-37-bbb75180bc8b> in <module>
1 lr1 = 1e-5
2 for i in range(1):
----> 3 learn.fit_one_cycle(1,lr_max=lr1)
4 # 0 0.000510 0.000368 03:42
~/fastaiActive/repos-fastai/fastai/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
110 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
111 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
113
114 # Cell
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
209 self.opt.set_hypers(lr=self.lr if lr is None else lr)
210 self.n_epoch = n_epoch
--> 211 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
212
213 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _do_fit(self)
200 for epoch in range(self.n_epoch):
201 self.epoch=epoch
--> 202 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
203
204 def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _do_epoch(self)
194
195 def _do_epoch(self):
--> 196 self._do_epoch_train()
197 self._do_epoch_validate()
198
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _do_epoch_train(self)
186 def _do_epoch_train(self):
187 self.dl = self.dls.train
--> 188 self._with_events(self.all_batches, 'train', CancelTrainException)
189
190 def _do_epoch_validate(self, ds_idx=1, dl=None):
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in all_batches(self)
164 def all_batches(self):
165 self.n_iter = len(self.dl)
--> 166 for o in enumerate(self.dl): self.one_batch(*o)
167
168 def _do_one_batch(self):
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in one_batch(self, i, b)
182 self.iter = i
183 self._split(b)
--> 184 self._with_events(self._do_one_batch, 'batch', CancelBatchException)
185
186 def _do_epoch_train(self):
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _do_one_batch(self)
176 self('before_backward')
177 self.loss_grad.backward()
--> 178 self._with_events(self.opt.step, 'step', CancelStepException)
179 self.opt.zero_grad()
180
~/fastaiActive/repos-fastai/fastai/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/fastaiActive/repos-fastai/fastai/fastai/optimizer.py in step(self)
80 def step(self):
81 for p,pg,state,hyper in self.all_params(with_grad=True):
---> 82 for cb in self.cbs: state = _update(state, cb(p, **{**state, **hyper}))
83 self.state[p] = state
84
~/fastaiActive/repos-fastai/fastai/fastai/optimizer.py in average_grad(p, mom, dampening, grad_avg, **kwargs)
120 if grad_avg is None: grad_avg = torch.zeros_like(p.grad.data)
121 damp = 1-mom if dampening else 1.
--> 122 grad_avg.mul_(mom).add_(p.grad.data, alpha=damp)
123 return {'grad_avg': grad_avg}
124
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Thanks!