Cudnn_status_execution_failed

Chris_Palmer · February 5, 2018, 12:46pm

Trying the new lesson 1 (Part 1 v2) on my Windows PC, with an NVIDIA GTX 650 Ti, CUDA 9, and cuDNN 7. A third of the way into learn.fit after unfreezing the model, it crashes out with a cuDNN error.

Can anyone advise me? I suspect I am asking too much of my GPU (I already had to reduce the batch size to 24 just to prevent a memory error).

Update: I have since restarted the notebook and tried running again without the cycle_save_name parameter, and I have got through 1 epoch.

Additionally, after reading posts at these 2 links I am pretty sure I am just running out of memory, which is kind of expected since I only have 3 GB of GPU memory.

learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='lesson1win')

Epoch
0% 0/7 [00:00<?, ?it/s]
 31%|████████████████████▏                                            | 297/959 [48:55<1:49:04,  9.89s/it, loss=0.0808]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-153-9b95850059b8> in <module>()
----> 1 learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='lesson1win')

d:\FASTAI\fastai\courses\dl1\fastai\learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
    207         self.sched = None
    208         layer_opt = self.get_layer_opt(lrs, wds)
--> 209         return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
    210 
    211     def warm_up(self, lr, wds=None):

d:\FASTAI\fastai\courses\dl1\fastai\learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, use_clr, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, **kwargs)
    154         n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
    155         return fit(model, data, n_epoch, layer_opt.opt, self.crit,
--> 156             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
    157 
    158     def get_layer_groups(self): return self.models.get_layer_groups()

d:\FASTAI\fastai\courses\dl1\fastai\model.py in fit(model, data, epochs, opt, crit, metrics, callbacks, **kwargs)
     94             batch_num += 1
     95             for cb in callbacks: cb.on_batch_begin()
---> 96             loss = stepper.step(V(x),V(y))
     97             avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
     98             debias_loss = avg_loss / (1 - avg_mom**batch_num)

d:\FASTAI\fastai\courses\dl1\fastai\model.py in step(self, xs, y)
     43         loss = raw_loss = self.crit(output, y)
     44         if self.reg_fn: loss = self.reg_fn(output, xtra, raw_loss)
---> 45         loss.backward()
     46         if self.clip:   # Gradient clipping
     47             nn.utils.clip_grad_norm(trainable_params_(self.m), self.clip)

D:\Anaconda3\envs\fastai\lib\site-packages\torch\autograd\variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
    165                 Variable.
    166         """
--> 167         torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
    168 
    169     def register_hook(self, hook):

D:\Anaconda3\envs\fastai\lib\site-packages\torch\autograd\__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
     97 
     98     Variable._execution_engine.run_backward(
---> 99         variables, grad_variables, retain_graph)
    100 
    101 

RuntimeError: CUDNN_STATUS_EXECUTION_FAILED