Paperspace lesson 1 error when setting device


(Aashish Jain) #1

cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:88

Does anybody know how to solve this error? I’m running the fast.ai Paperspace instance.

It happens when I run this line: torch.cuda.set_device(1)


(Suvash) #2

Jeremy mentioned this during the lecture. GPU devices are numbered starting from 0, and since your machine (probably) has only one GPU, you need to set it to 0.
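
For example, a minimal check (just a sketch, assuming the standard single-GPU Paperspace machine) would be:

import torch

print(torch.cuda.device_count())  # 1 on a single-GPU machine
# CUDA devices are indexed from 0, so on a one-GPU box only device 0 exists.
# torch.cuda.set_device(1) therefore raises "invalid device ordinal".
torch.cuda.set_device(0)          # or simply leave the default device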


(Andreas Daiminger) #4

I get the same error when I call learn.fit in the cifar10-darknet notebook. It really looks like a device-mapping problem. The traceback goes deep into the PyTorch library, so I don’t really understand what the trouble is.
I made sure no device mapping is done from the notebook code by commenting out the line torch.cuda.set_device(1) (see the device-count sketch after the traceback below).

Here is the error and traceback:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<timed eval> in <module>()

~/fastai/courses/dl2/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
    285         self.sched = None
    286         layer_opt = self.get_layer_opt(lrs, wds)
--> 287         return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
    288 
    289     def warm_up(self, lr, wds=None):

~/fastai/courses/dl2/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
    232             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
    233             swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234             swa_eval_freq=swa_eval_freq, **kwargs)
    235 
    236     def get_layer_groups(self): return self.models.get_layer_groups()

~/fastai/courses/dl2/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
    138             batch_num += 1
    139             for cb in callbacks: cb.on_batch_begin()
--> 140             loss = model_stepper.step(V(x),V(y), epoch)
    141             avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
    142             debias_loss = avg_loss / (1 - avg_mom**batch_num)

~/fastai/courses/dl2/fastai/model.py in step(self, xs, y, epoch)
     48     def step(self, xs, y, epoch):
     49         xtra = []
---> 50         output = self.m(*xs)
     51         if isinstance(output,tuple): output,*xtra = output
     52         if self.fp16: self.m.zero_grad()

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    355             result = self._slow_forward(*input, **kwargs)
    356         else:
--> 357             result = self.forward(*input, **kwargs)
    358         for hook in self._forward_hooks.values():
    359             hook_result = hook(self, input, result)

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
     67         if not self.device_ids:
     68             return self.module(*inputs, **kwargs)
---> 69         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
     70         if len(self.device_ids) == 1:
     71             return self.module(*inputs[0], **kwargs[0])

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in scatter(self, inputs, kwargs, device_ids)
     78 
     79     def scatter(self, inputs, kwargs, device_ids):
---> 80         return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
     81 
     82     def parallel_apply(self, replicas, inputs, kwargs):

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter_kwargs(inputs, kwargs, target_gpus, dim)
     36 def scatter_kwargs(inputs, kwargs, target_gpus, dim=0):
     37     r"""Scatter with support for kwargs dictionary"""
---> 38     inputs = scatter(inputs, target_gpus, dim) if inputs else []
     39     kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
     40     if len(inputs) < len(kwargs):

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter(inputs, target_gpus, dim)
     29     # None, clearing the cell
     30     try:
---> 31         return scatter_map(inputs)
     32     finally:
     33         scatter_map = None

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter_map(obj)
     16         assert not torch.is_tensor(obj), "Tensors not supported in scatter."
     17         if isinstance(obj, tuple) and len(obj) > 0:
---> 18             return list(zip(*map(scatter_map, obj)))
     19         if isinstance(obj, list) and len(obj) > 0:
     20             return list(map(list, zip(*map(scatter_map, obj))))

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in scatter_map(obj)
     13     def scatter_map(obj):
     14         if isinstance(obj, Variable):
---> 15             return Scatter.apply(target_gpus, None, dim, obj)
     16         assert not torch.is_tensor(obj), "Tensors not supported in scatter."
     17         if isinstance(obj, tuple) and len(obj) > 0:

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/parallel/_functions.py in forward(ctx, target_gpus, chunk_sizes, dim, input)
     72             # Perform CPU to GPU copies in a background stream
     73             streams = [_get_stream(device) for device in ctx.target_gpus]
---> 74         outputs = comm.scatter(input, ctx.target_gpus, ctx.chunk_sizes, ctx.dim, streams)
     75         # Synchronize with the copy stream
     76         if streams is not None:

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/comm.py in scatter(tensor, devices, chunk_sizes, dim, streams)
    186     outputs = []
    187     for device, chunk, stream in zip(devices, chunks, streams):
--> 188         with torch.cuda.device(device), torch.cuda.stream(stream):
    189             outputs.append(chunk.cuda(device, async=True))
    190     return tuple(outputs)

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in __enter__(self)
    207         self.prev_idx = torch._C._cuda_getDevice()
    208         if self.prev_idx != self.idx:
--> 209             torch._C._cuda_setDevice(self.idx)
    210         _lazy_init()
    211 

RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:88
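
The traceback shows the failure happening inside DataParallel’s scatter, which copies the batch onto every device id the wrapper was constructed with. If the model was wrapped with ids that don’t exist on your machine (for example something like nn.DataParallel(m, [1, 2, 3]) on a single-GPU box), you get this same error even with torch.cuda.set_device(1) commented out. A minimal guard (a sketch only; the model below is a stand-in, not the darknet model from the notebook) would be:

import torch
import torch.nn as nn

n_gpus = torch.cuda.device_count()
model = nn.Sequential(nn.Linear(10, 10))  # stand-in for the real model

# Only wrap in DataParallel when several GPUs exist, and only pass ids
# that are actually present, so scatter never targets a missing device.
if n_gpus > 1:
    model = nn.DataParallel(model, device_ids=list(range(n_gpus)))
model = model.cuda()  # assumes a CUDA-capable machine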

#5

see this: