Trying the new lesson 1 (Part 1 v2) on my Windows PC with an NVIDIA GTX 650 Ti, CUDA 9, and cuDNN 7. About a third of the way into learn.fit after unfreezing the model, it crashes with a cuDNN error.
Can anyone advise? I suspect I am asking too much of my GPU; I already had to reduce the batch size to 24 just to avoid an out-of-memory error.
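For reference, this is roughly how I set up the data with the reduced batch size (a minimal sketch of the usual lesson 1 setup for fastai 0.7; PATH, arch, and sz here are just the standard lesson variables, nothing specific to my machine):

```python
from fastai.conv_learner import *

PATH = 'data/dogscats/'  # standard lesson 1 data layout (assumed)
arch = resnet34
sz = 224

# bs=24 instead of the default 64, to fit within 3GB of GPU memory
data = ImageClassifierData.from_paths(PATH, tfms=tfms_from_model(arch, sz), bs=24)
learn = ConvLearner.pretrained(arch, data, precompute=True)
```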
Update: I have since restarted the notebook and tried running again without the cycle_save_name parameter, and I got through 1 epoch.
Additionally, after reading the posts at these 2 links, I am fairly sure I am simply running out of memory, which is expected given that I only have 3GB of GPU memory.
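In the meantime I have been watching GPU memory usage while training. A quick way to do that from inside the notebook (just shelling out to nvidia-smi, which avoids depending on any particular torch.cuda memory API):

```python
import subprocess

# Print current vs. total GPU memory; run this in another cell (or a
# terminal) while learn.fit is training to see how close to the 3GB
# limit it gets.
print(subprocess.check_output(
    ['nvidia-smi', '--query-gpu=memory.used,memory.total', '--format=csv']
).decode())
```

This is the original failing run: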
learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='lesson1win')
Epoch
0% 0/7 [00:00<?, ?it/s]
31%|█████████████████████ | 297/959 [48:55<1:49:04, 9.89s/it, loss=0.0808]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-153-9b95850059b8> in <module>()
----> 1 learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='lesson1win')
d:\FASTAI\fastai\courses\dl1\fastai\learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
207 self.sched = None
208 layer_opt = self.get_layer_opt(lrs, wds)
--> 209 return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
210
211 def warm_up(self, lr, wds=None):
d:\FASTAI\fastai\courses\dl1\fastai\learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, use_clr, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, **kwargs)
154 n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
155 return fit(model, data, n_epoch, layer_opt.opt, self.crit,
--> 156 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
157
158 def get_layer_groups(self): return self.models.get_layer_groups()
d:\FASTAI\fastai\courses\dl1\fastai\model.py in fit(model, data, epochs, opt, crit, metrics, callbacks, **kwargs)
94 batch_num += 1
95 for cb in callbacks: cb.on_batch_begin()
---> 96 loss = stepper.step(V(x),V(y))
97 avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
98 debias_loss = avg_loss / (1 - avg_mom**batch_num)
d:\FASTAI\fastai\courses\dl1\fastai\model.py in step(self, xs, y)
43 loss = raw_loss = self.crit(output, y)
44 if self.reg_fn: loss = self.reg_fn(output, xtra, raw_loss)
---> 45 loss.backward()
46 if self.clip: # Gradient clipping
47 nn.utils.clip_grad_norm(trainable_params_(self.m), self.clip)
D:\Anaconda3\envs\fastai\lib\site-packages\torch\autograd\variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
165 Variable.
166 """
--> 167 torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
168
169 def register_hook(self, hook):
D:\Anaconda3\envs\fastai\lib\site-packages\torch\autograd\__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
97
98 Variable._execution_engine.run_backward(
---> 99 variables, grad_variables, retain_graph)
100
101
RuntimeError: CUDNN_STATUS_EXECUTION_FAILED
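One diagnostic I still plan to try (my own assumption, not something from the fastai docs) is disabling cuDNN so the backward pass falls back to PyTorch's native kernels; if the error goes away, that points at a cuDNN/driver problem on this old GPU rather than a plain out-of-memory:

```python
import torch

# Run this before creating the learner. It forces PyTorch to use its
# own convolution kernels instead of cuDNN: slower, but it isolates
# whether CUDNN_STATUS_EXECUTION_FAILED comes from cuDNN itself.
torch.backends.cudnn.enabled = False
```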