Hi there, my home setup for doing fast.ai was working okay with a GTX 970. However, when I installed an RTX 2080, things have not gone well. I updated the video card drivers and have tried running CUDA 10, which PyTorch 0.3 doesn't seem to support very well.
I managed to run lesson 1 okay by keeping the batch size around 160; otherwise I’d get CUDNN_STATUS_EXECUTION_FAILED.
I tried removing CUDA & cuDNN and re-running the setup bash script, so I installed cuda-9.2. But when going through the lesson4 imdb notebook, I hit an error when running learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2).
A Jupyter widget could not be displayed because the widget state could not be found. This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.
0%| | 0/5834 [00:00<?, ?it/s]
---------------------------------------------------------------------------
CuDNNError Traceback (most recent call last)
<ipython-input-9-357a8890c905> in <module>
----> 1 learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)
~/src/fastai/courses/dl1/AaronsWorkbook/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
300 self.sched = None
301 layer_opt = self.get_layer_opt(lrs, wds)
--> 302 return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
303
304 def warm_up(self, lr, wds=None):
~/src/fastai/courses/dl1/AaronsWorkbook/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
247 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
248 swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 249 swa_eval_freq=swa_eval_freq, **kwargs)
250
251 def get_layer_groups(self): return self.models.get_layer_groups()
~/src/fastai/courses/dl1/AaronsWorkbook/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, visualize, **kwargs)
139 batch_num += 1
140 for cb in callbacks: cb.on_batch_begin()
--> 141 loss = model_stepper.step(V(x),V(y), epoch)
142 avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
143 debias_loss = avg_loss / (1 - avg_mom**batch_num)
~/src/fastai/courses/dl1/AaronsWorkbook/fastai/model.py in step(self, xs, y, epoch)
48 def step(self, xs, y, epoch):
49 xtra = []
---> 50 output = self.m(*xs)
51 if isinstance(output,tuple): output,*xtra = output
52 if self.fp16: self.m.zero_grad()
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
355 result = self._slow_forward(*input, **kwargs)
356 else:
--> 357 result = self.forward(*input, **kwargs)
358 for hook in self._forward_hooks.values():
359 hook_result = hook(self, input, result)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
65 def forward(self, input):
66 for module in self._modules.values():
---> 67 input = module(input)
68 return input
69
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
355 result = self._slow_forward(*input, **kwargs)
356 else:
--> 357 result = self.forward(*input, **kwargs)
358 for hook in self._forward_hooks.values():
359 hook_result = hook(self, input, result)
~/src/fastai/courses/dl1/AaronsWorkbook/fastai/lm_rnn.py in forward(self, input)
104 with warnings.catch_warnings():
105 warnings.simplefilter("ignore")
--> 106 raw_output, new_h = rnn(raw_output, self.hidden[l])
107 new_hidden.append(new_h)
108 raw_outputs.append(raw_output)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
355 result = self._slow_forward(*input, **kwargs)
356 else:
--> 357 result = self.forward(*input, **kwargs)
358 for hook in self._forward_hooks.values():
359 hook_result = hook(self, input, result)
~/src/fastai/courses/dl1/AaronsWorkbook/fastai/rnn_reg.py in forward(self, *args)
122 """
123 self._setweights()
--> 124 return self.module.forward(*args)
125
126 class EmbeddingDropout(nn.Module):
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
202 flat_weight=flat_weight
203 )
--> 204 output, hidden = func(input, self.all_weights, hx)
205 if is_packed:
206 output = PackedSequence(output, batch_sizes)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward(input, *fargs, **fkwargs)
383 return hack_onnx_rnn((input,) + fargs, output, args, kwargs)
384 else:
--> 385 return func(input, *fargs, **fkwargs)
386
387 return forward
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/function.py in _do_forward(self, *input)
326 self._nested_input = input
327 flat_input = tuple(_iter_variables(input))
--> 328 flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
329 nested_output = self._nested_output
330 nested_variables = _unflatten(flat_output, self._nested_output)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/autograd/function.py in forward(self, *args)
348 def forward(self, *args):
349 nested_tensors = _map_variable_tensor(self._nested_input)
--> 350 result = self.forward_extended(*nested_tensors)
351 del self._nested_input
352 self._nested_output = result
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward_extended(self, input, weight, hx)
292 hy = tuple(h.new() for h in hx)
293
--> 294 cudnn.rnn.forward(self, input, hx, weight, output, hy)
295
296 self.save_for_backward(input, hx, weight, output)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in forward(fn, input, hx, weight, output, hy)
226
227 # init descriptors
--> 228 fn.rnn_desc = init_rnn_descriptor(fn, handle)
229 if is_input_packed:
230 fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in init_rnn_descriptor(fn, handle)
40 if (dropout_desc_name not in fn.dropout_state) or (fn.dropout_state[dropout_desc_name].get() is None):
41 fn.dropout_state[dropout_desc_name] = Unserializable(
---> 42 cudnn.DropoutDescriptor(handle, dropout_p, fn.dropout_seed)
43 )
44 dropout_desc = fn.dropout_state[dropout_desc_name].get()
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/backends/cudnn/__init__.py in __init__(self, handle, dropout, seed)
205 self.handle = handle
206
--> 207 self._set(dropout, seed)
208
209 def set_dropout(self, dropout, seed):
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/backends/cudnn/__init__.py in _set(self, dropout, seed)
230 ctypes.c_void_p(state_ptr),
231 ctypes.c_size_t(state_size),
--> 232 ctypes.c_ulonglong(seed),
233 ))
234
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/backends/cudnn/__init__.py in check_error(status)
281 def check_error(status):
282 if status is not 0:
--> 283 raise CuDNNError(status)
284
285
CuDNNError: 8: b'CUDNN_STATUS_EXECUTION_FAILED'
Here’s my system info
aaron@aaron-MS-7751:~/src/fastai/courses/dl1$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Tue_Jun_12_23:07:04_CDT_2018
Cuda compilation tools, release 9.2, V9.2.148
Output from pytorch’s collect_env.py
Collecting environment information...
PyTorch version: 0.3.1.post2
Is debug build: No
CUDA used to build PyTorch: 9.0.176
OS: Ubuntu 16.04.5 LTS
GCC version: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609
CMake version: Could not collect
Python version: 3.6
Is CUDA available: Yes
CUDA runtime version: 9.2.148
GPU models and configuration: GPU 0: GeForce RTX 2080
Nvidia driver version: 410.79
cuDNN version: Probably one of the following:
/usr/local/cuda-9.2/targets/x86_64-linux/lib/libcudnn.so
/usr/local/cuda-9.2/targets/x86_64-linux/lib/libcudnn.so.7
/usr/local/cuda-9.2/targets/x86_64-linux/lib/libcudnn.so.7.0.5
/usr/local/cuda-9.2/targets/x86_64-linux/lib/libcudnn_static.a
Versions of relevant libraries:
[pip] Could not collect
[conda] blas 1.0 mkl
[conda] cuda90 1.0 h6433d27_0 pytorch
[conda] mkl 2019.1 144
[conda] mkl_fft 1.0.6 py36hd81dba3_0
[conda] mkl_random 1.0.2 py36hd81dba3_0
[conda] pytorch 0.3.1 py36_cuda9.0.176_cudnn7.0.5_2 [cuda90] pytorch
[conda] torchtext 0.2.3 <pip>
[conda] torchvision 0.2.0 <pip>