cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:88
Does anybody know how to solve this error? I'm running the fast.ai Paperspace instance.
It happens when I run this line: torch.cuda.set_device(1)
Jeremy mentioned this during the lecture. GPU devices are numbered starting from 0, and since your machine (probably) has only one GPU, 0 is the ordinal you need to set.
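A quick way to check before picking an ordinal is to ask PyTorch how many devices it can actually see (a minimal sketch, not from the original notebook):

```python
import torch

# Valid ordinals run from 0 to device_count() - 1, so on a
# single-GPU Paperspace box only 0 exists.
print(torch.cuda.device_count())  # e.g. 1
torch.cuda.set_device(0)          # set_device(1) raises "invalid device ordinal"
```

Alternatively, just delete the set_device call entirely; PyTorch defaults to device 0.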
I get the same error when I try to call learn.fit in the cifar10-darknet notebook. It really looks like a device-mapping problem, but the traceback goes deep into the torch library, so I don't really understand what the trouble is.
I made sure no device mapping is done from the notebook code by commenting out the line torch.cuda.set_device(1)
Here is the error and traceback:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<timed eval> in <module>()
~/fastai/courses/dl2/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
285 self.sched = None
286 layer_opt = self.get_layer_opt(lrs, wds)
--> 287 return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
288
289 def warm_up(self, lr, wds=None):
~/fastai/courses/dl2/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
232 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
233 swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234 swa_eval_freq=swa_eval_freq, **kwargs)
235
236 def get_layer_groups(self): return self.models.get_layer_groups()
~/fastai/courses/dl2/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
138 batch_num += 1
139 for cb in callbacks: cb.on_batch_begin()
--> 140 loss = model_stepper.step(V(x),V(y), epoch)
141 avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
142 debias_loss = avg_loss / (1 - avg_mom**batch_num)
~/fastai/courses/dl2/fastai/model.py in step(self, xs, y, epoch)
48 def step(self, xs, y, epoch):
49 xtra = []
---> 50 output = self.m(*xs)
51 if isinstance(output,tuple): output,*xtra = output
52 if self.fp16: self.m.zero_grad()
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
355 result = self._slow_forward(*input, **kwargs)
356 else:
--> 357 result = self.forward(*input, **kwargs)
358 for hook in self._forward_hooks.values():
359 hook_result = hook(self, input, result)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
67 if not self.device_ids:
68 return self.module(*inputs, **kwargs)
---> 69 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
70 if len(self.device_ids) == 1:
71 return self.module(*inputs[0], **kwargs[0])
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in scatter(self, inputs, kwargs, device_ids)
78
79 def scatter(self, inputs, kwargs, device_ids):
---> 80 return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
81
82 def parallel_apply(self, replicas, inputs, kwargs):
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter_kwargs(inputs, kwargs, target_gpus, dim)
36 def scatter_kwargs(inputs, kwargs, target_gpus, dim=0):
37 r"""Scatter with support for kwargs dictionary"""
---> 38 inputs = scatter(inputs, target_gpus, dim) if inputs else []
39 kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
40 if len(inputs) < len(kwargs):
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter(inputs, target_gpus, dim)
29 # None, clearing the cell
30 try:
---> 31 return scatter_map(inputs)
32 finally:
33 scatter_map = None
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter_map(obj)
16 assert not torch.is_tensor(obj), "Tensors not supported in scatter."
17 if isinstance(obj, tuple) and len(obj) > 0:
---> 18 return list(zip(*map(scatter_map, obj)))
19 if isinstance(obj, list) and len(obj) > 0:
20 return list(map(list, zip(*map(scatter_map, obj))))
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter_map(obj)
13 def scatter_map(obj):
14 if isinstance(obj, Variable):
---> 15 return Scatter.apply(target_gpus, None, dim, obj)
16 assert not torch.is_tensor(obj), "Tensors not supported in scatter."
17 if isinstance(obj, tuple) and len(obj) > 0:
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/_functions.py in forward(ctx, target_gpus, chunk_sizes, dim, input)
72 # Perform CPU to GPU copies in a background stream
73 streams = [_get_stream(device) for device in ctx.target_gpus]
---> 74 outputs = comm.scatter(input, ctx.target_gpus, ctx.chunk_sizes, ctx.dim, streams)
75 # Synchronize with the copy stream
76 if streams is not None:
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/comm.py in scatter(tensor, devices, chunk_sizes, dim, streams)
186 outputs = []
187 for device, chunk, stream in zip(devices, chunks, streams):
--> 188 with torch.cuda.device(device), torch.cuda.stream(stream):
189 outputs.append(chunk.cuda(device, async=True))
190 return tuple(outputs)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in __enter__(self)
207 self.prev_idx = torch._C._cuda_getDevice()
208 if self.prev_idx != self.idx:
--> 209 torch._C._cuda_setDevice(self.idx)
210 _lazy_init()
211
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:88
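Note that the traceback passes through torch/nn/parallel/data_parallel.py before failing, which suggests the model is wrapped in nn.DataParallel with a device_ids list that names a GPU the machine doesn't have; commenting out torch.cuda.set_device(1) alone won't fix that, because scatter() still tries to copy a chunk of the batch to the missing device. Here is a minimal sketch of the failure and the fix (the Linear model is just a stand-in for the darknet model, and this assumes the notebook does wrap the model in DataParallel):

```python
import torch
import torch.nn as nn

model = nn.Linear(10, 2).cuda()

# Reproduces the error on a single-GPU machine: forward() scatters the
# batch across device_ids, and device 1 doesn't exist.
# broken = nn.DataParallel(model, device_ids=[0, 1])
# broken(torch.randn(4, 10).cuda())  # RuntimeError: invalid device ordinal

# Fix: only list ordinals that actually exist on this machine.
fixed = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
out = fixed(torch.randn(4, 10).cuda())
```

So it's worth grepping the notebook (and any fastai helper it calls) for a DataParallel wrapper and checking the device_ids it passes.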