Can't do segmentation. RuntimeError: cuda runtime error (710)

SOLVED (in reply)

Hi there,

I am trying to segment my images. I made masks for them with only two pixel classes, values 0 and 1 (saved in PNG format).
To get started, I decided to follow the camvid_tiny segmentation tutorial https://github.com/fastai/fastai/blob/master/nbs/23_tutorial.vision.ipynb
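
For reference, here is a quick sanity check (my addition, not from the tutorial; the file name is just an example) that a mask file on disk really contains only the values 0 and 1:

from PIL import Image
import numpy as np

# open one mask and list its distinct pixel values; on disk I expect [0 1]
mask = np.array(Image.open(path/'masks'/'example_P.png'))  # example file name
print(mask.dtype, np.unique(mask))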

I just repeat all the code from the tutorial (except for bs and Resize):

fnames = get_image_files(path/'images')
def label_func(x): return path/'masks'/f'{x.stem}_P{x.suffix}'
codes = np.loadtxt(path/'labels.txt', dtype=str)

dls = SegmentationDataLoaders.from_label_func(path, 
                                              fnames, 
                                              label_func, 
                                              codes=codes, 
                                              bs=2,
                                              item_tfms=[Resize(144)],
                                              num_workers=0)

x,y = dls.one_batch()
dls.show_batch()

Here I see my images with their masks; the batches seem OK.
The learner learn = unet_learner(dls, resnet34) is created without errors, and then I get the following error on learn.fine_tune(8):

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-27-f067e6d6d342> in <module>
      1 learn = unet_learner(dls, resnet34)
----> 2 learn.fine_tune(8)

~\Anaconda3\lib\site-packages\fastcore\utils.py in _f(*args, **kwargs)
    470         init_args.update(log)
    471         setattr(inst, 'init_args', init_args)
--> 472         return inst if to_return else f(*args, **kwargs)
    473     return _f
    474 

~\Anaconda3\lib\site-packages\fastai\callback\schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    159     "Fine tune with `freeze` for `freeze_epochs` then with `unfreeze` from `epochs` using discriminative LR"
    160     self.freeze()
--> 161     self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
    162     base_lr /= 2
    163     self.unfreeze()

~\Anaconda3\lib\site-packages\fastcore\utils.py in _f(*args, **kwargs)
    470         init_args.update(log)
    471         setattr(inst, 'init_args', init_args)
--> 472         return inst if to_return else f(*args, **kwargs)
    473     return _f
    474 

~\Anaconda3\lib\site-packages\fastai\callback\schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

~\Anaconda3\lib\site-packages\fastcore\utils.py in _f(*args, **kwargs)
    470         init_args.update(log)
    471         setattr(inst, 'init_args', init_args)
--> 472         return inst if to_return else f(*args, **kwargs)
    473     return _f
    474 

~\Anaconda3\lib\site-packages\fastai\learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    205             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    206             self.n_epoch,self.loss = n_epoch,tensor(0.)
--> 207             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    208 
    209     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~\Anaconda3\lib\site-packages\fastai\learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~\Anaconda3\lib\site-packages\fastai\learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     @log_args(but='cbs')

~\Anaconda3\lib\site-packages\fastai\learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~\Anaconda3\lib\site-packages\fastai\learner.py in _do_epoch(self)
    189 
    190     def _do_epoch(self):
--> 191         self._do_epoch_train()
    192         self._do_epoch_validate()
    193 

~\Anaconda3\lib\site-packages\fastai\learner.py in _do_epoch_train(self)
    181     def _do_epoch_train(self):
    182         self.dl = self.dls.train
--> 183         self._with_events(self.all_batches, 'train', CancelTrainException)
    184 
    185     def _do_epoch_validate(self, ds_idx=1, dl=None):

~\Anaconda3\lib\site-packages\fastai\learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~\Anaconda3\lib\site-packages\fastai\learner.py in all_batches(self)
    159     def all_batches(self):
    160         self.n_iter = len(self.dl)
--> 161         for o in enumerate(self.dl): self.one_batch(*o)
    162 
    163     def _do_one_batch(self):

~\Anaconda3\lib\site-packages\fastai\learner.py in one_batch(self, i, b)
    177         self.iter = i
    178         self._split(b)
--> 179         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    180 
    181     def _do_epoch_train(self):

~\Anaconda3\lib\site-packages\fastai\learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~\Anaconda3\lib\site-packages\fastai\learner.py in _do_one_batch(self)
    164         self.pred = self.model(*self.xb)
    165         self('after_pred')
--> 166         if len(self.yb): self.loss = self.loss_func(self.pred, *self.yb)
    167         self('after_loss')
    168         if not self.training or not len(self.yb): return

~\Anaconda3\lib\site-packages\fastai\layers.py in __call__(self, inp, targ, **kwargs)
    295         if targ.dtype in [torch.int8, torch.int16, torch.int32]: targ = targ.long()
    296         if self.flatten: inp = inp.view(-1,inp.shape[-1]) if self.is_2d else inp.view(-1)
--> 297         return self.func.__call__(inp, targ.view(-1) if self.flatten else targ, **kwargs)
    298 
    299 # Cell

~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~\Anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
    946     def forward(self, input: Tensor, target: Tensor) -> Tensor:
    947         return F.cross_entropy(input, target, weight=self.weight,
--> 948                                ignore_index=self.ignore_index, reduction=self.reduction)
    949 
    950 

~\Anaconda3\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
   2420     if size_average is not None or reduce is not None:
   2421         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2422     return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
   2423 
   2424 

~\Anaconda3\lib\site-packages\torch\nn\functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2216                          .format(input.size(0), target.size(0)))
   2217     if dim == 2:
-> 2218         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2219     elif dim == 4:
   2220         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

RuntimeError: cuda runtime error (710) : device-side assert triggered at C:/cb/pytorch_1000000000000/work/aten/src\THCUNN/generic/ClassNLLCriterion.cu:118
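
(A note for context, not part of the original output: a device-side assert from ClassNLLCriterion almost always means a target class index outside the range [0, n_classes). Running the same loss on the CPU fails with a readable message instead; here is a toy reproduction with made-up shapes:)

import torch
import torch.nn.functional as F

# two output classes, but the target is full of 255s: on the CPU this
# raises a readable out-of-range error instead of a device-side assert
pred = torch.randn(2, 2, 144, 144)                       # logits: batch, classes, H, W
targ = torch.full((2, 144, 144), 255, dtype=torch.long)  # mask values of 255, as observed
F.cross_entropy(pred, targ)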

I repeated the camvid tutorial on my computer and it works fine, with no errors; the error only appears with my data.
I noticed something after I create the DataLoaders and execute the following code:

x,y = dls.one_batch()
dls.show_batch()
y[0],y[0].min(),y[0].max()

The values in the masks are then 0 and 255 instead of 0 and 1. Maybe this is the problem? But I don’t understand why the values in the masks change after batch creation; before creating the batches they are 0 or 1.
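
With only two codes the model has two output channels, so a target value of 255 is out of range for the loss, and that is exactly what triggers the device-side assert above. A minimal guard one could run before training (my addition, using the dls and codes defined earlier):

x, y = dls.one_batch()
# every class index in the target must be < len(codes)
assert y.max() < len(codes), f'mask contains class {int(y.max())}, but there are only {len(codes)} codes'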

fastai version - 2.0.9

Update: I made a test mask with pixel values 0, 1, and 2 just in case. Before SegmentationDataLoaders it looks like this:

[screenshot of the raw test mask]

And after the DataLoaders it looks as if normalization happened…

In the camvid dataset, all the masks look OK, like this:

[camvid mask image]

At the same time, I compared the camvid mask files with the masks for my data and found no significant difference between them. Both are 8-bit grayscale.

The problem was how I saved the masks in the ImageJ program: it was necessary to set the brightness scale to 0-255. Apparently some metadata changes the mask values when the batches are created. After re-saving with the 0-255 scale, everything worked as it should.
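
In case it helps anyone, here is a sketch of how the masks could also be fixed programmatically (my addition, assuming PIL and that the masks live in a masks/ folder): re-saving each PNG with plain PIL drops the ImageJ metadata and forces the pixels back to class indices.

from pathlib import Path
import numpy as np
from PIL import Image

# re-save every mask with plain PIL, dropping ImageJ metadata and
# mapping any nonzero pixel back to class 1
for f in Path('masks').glob('*.png'):  # assumed folder layout
    m = np.array(Image.open(f))
    m = (m > 0).astype(np.uint8)
    Image.fromarray(m).save(f)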
