Image Segmentation: CUDA error

I started the Kaggle Carvana competition today and ran into a CUDA error that I don't understand (I'm a beginner).

Everything went fine until I ran learn.lr_find().

After that I got an error like this:


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    100                 xb, yb = cb_handler.on_batch_begin(xb, yb)
--> 101                 loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler)
    102                 if cb_handler.on_batch_end(loss): break

/opt/conda/lib/python3.6/site-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     32     if opt is not None:
---> 33         loss,skip_bwd = cb_handler.on_backward_begin(loss)
     34         if not skip_bwd:                     loss.backward()

/opt/conda/lib/python3.6/site-packages/fastai/callback.py in on_backward_begin(self, loss)
    289         "Handle gradient calculation on `loss`."
--> 290         self.smoothener.add_value(loss.float().detach().cpu())
    291         self.state_dict['last_loss'], self.state_dict['smooth_loss'] = loss, self.smoothener.smooth

RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-20-c7a9c29f9dd1> in <module>
----> 1 learn.lr_find()
      2 learn.recorder.plot()

/opt/conda/lib/python3.6/site-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
     39     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     40     epochs = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 41     learn.fit(epochs, start_lr, callbacks=[cb], wd=wd)
     42 
     43 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=True, clip:float=None,

/opt/conda/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    198         else: self.opt.lr,self.opt.wd = lr,wd
    199         callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
--> 200         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    201 
    202     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/opt/conda/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    110         exception = e
    111         raise
--> 112     finally: cb_handler.on_train_end(exception)
    113 
    114 loss_func_name2activ = {'cross_entropy_loss': F.softmax, 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,

/opt/conda/lib/python3.6/site-packages/fastai/callback.py in on_train_end(self, exception)
    321     def on_train_end(self, exception:Union[bool,Exception])->None:
    322         "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 323         self('train_end', exception=exception)
    324 
    325     @property

/opt/conda/lib/python3.6/site-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
    249         if call_mets:
    250             for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
--> 251         for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
    252 
    253     def set_dl(self, dl:DataLoader):

/opt/conda/lib/python3.6/site-packages/fastai/callback.py in _call_and_update(self, cb, cb_name, **kwargs)
    239     def _call_and_update(self, cb, cb_name, **kwargs)->None:
    240         "Call `cb_name` on `cb` and update the inner state."
--> 241         new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
    242         for k,v in new.items():
    243             if k not in self.state_dict:

/opt/conda/lib/python3.6/site-packages/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
     33     def on_train_end(self, **kwargs:Any)->None:
     34         "Cleanup learn model weights disturbed during LRFinder exploration."
---> 35         self.learn.load('tmp', purge=False)
     36         if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
     37         for cb in self.callbacks:

/opt/conda/lib/python3.6/site-packages/fastai/basic_train.py in load(self, file, device, strict, with_opt, purge, remove_module)
    267         source = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file
    268         distrib_barrier()
--> 269         state = torch.load(source, map_location=device)
    270         if set(state.keys()) == {'model', 'opt'}:
    271             model_state = state['model']

/opt/conda/lib/python3.6/site-packages/torch/serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
    527             with _open_zipfile_reader(f) as opened_zipfile:
    528                 return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
--> 529         return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
    530 
    531 

/opt/conda/lib/python3.6/site-packages/torch/serialization.py in _legacy_load(f, map_location, pickle_module, **pickle_load_args)
    700     unpickler = pickle_module.Unpickler(f, **pickle_load_args)
    701     unpickler.persistent_load = persistent_load
--> 702     result = unpickler.load()
    703 
    704     deserialized_storage_keys = pickle_module.load(f, **pickle_load_args)

/opt/conda/lib/python3.6/site-packages/torch/serialization.py in persistent_load(saved_id)
    663                 obj = data_type(size)
    664                 obj._torch_load_uninitialized = True
--> 665                 deserialized_objects[root_key] = restore_location(obj, location)
    666             storage = deserialized_objects[root_key]
    667             if view_metadata is not None:

/opt/conda/lib/python3.6/site-packages/torch/serialization.py in restore_location(storage, location)
    738     elif isinstance(map_location, torch.device):
    739         def restore_location(storage, location):
--> 740             return default_restore_location(storage, str(map_location))
    741     else:
    742         def restore_location(storage, location):

/opt/conda/lib/python3.6/site-packages/torch/serialization.py in default_restore_location(storage, location)
    154 def default_restore_location(storage, location):
    155     for _, _, fn in _package_registry:
--> 156         result = fn(storage, location)
    157         if result is not None:
    158             return result

/opt/conda/lib/python3.6/site-packages/torch/serialization.py in _cuda_deserialize(obj, location)
    134             storage_type = getattr(torch.cuda, type(obj).__name__)
    135             with torch.cuda.device(device):
--> 136                 return storage_type(obj.size())
    137         else:
    138             return obj.cuda(device)

/opt/conda/lib/python3.6/site-packages/torch/cuda/__init__.py in _lazy_new(cls, *args, **kwargs)
    478     # We may need to call lazy init again if we are a forked child
    479     # del _CudaBase.__new__
--> 480     return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
    481 
    482 

RuntimeError: CUDA error: device-side assert triggered


(I then tried running on the CPU instead, and it gave me a '255 index out of range' error. When I looked at a training image, its pixel values were within 0-1.)
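
For reference, this is roughly how the CPU run can be set up so the device-side assert surfaces as a readable Python error (a sketch assuming fastai v1; it has to run before the DataBunch and Learner are created):

```python
# Sketch, assuming fastai v1: force everything onto the CPU so the CUDA
# device-side assert shows up as an ordinary IndexError with a clear message.
import torch
from fastai.torch_core import defaults

defaults.device = torch.device('cpu')  # set before building the DataBunch/Learner

# ... build the data and learner as before, then:
# learn.lr_find()  # now raises an IndexError mentioning target/index 255
```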

Hi there, I have seen (and had) this error occur when the labels in your segmentation ground-truth masks contain values outside the class labels you specify when creating your SegmentationItemList. Try double-checking this first.
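
A minimal way to check this (a sketch assuming fastai v1 and the usual Carvana layout where masks are binary 0/255 GIFs; the `codes` list and paths below are placeholders, not your actual setup):

```python
# Sketch, assuming fastai v1: every value fastai reads out of a mask must be a
# valid index into the classes/codes list, i.e. strictly less than len(codes).
import numpy as np
from fastai.vision import get_image_files, open_mask

codes = ['background', 'car']                  # placeholder: whatever you pass as classes
mask_files = get_image_files('train_masks')    # placeholder mask folder

vals = set()
for fn in mask_files[:20]:                     # a small sample is enough to spot the problem
    vals |= set(np.unique(open_mask(fn).data.numpy()))
print(vals)                                    # seeing {0, 255} here explains "255 out of range"
```

If the masks do come out as 0/255, a common fix is to open them with `div=True` so they are rescaled to 0/1 before training:

```python
# Sketch: custom label list that divides mask values by 255 on load.
from fastai.vision import SegmentationItemList, SegmentationLabelList, open_mask

class BinarySegLabelList(SegmentationLabelList):
    def open(self, fn): return open_mask(fn, div=True)   # maps {0, 255} -> {0, 1}

class BinarySegItemList(SegmentationItemList):
    _label_cls = BinarySegLabelList
```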