Cnn_learner.fit_one_cycle: RuntimeError: Cannot pickle CUDA storage; try pickling a CUDA tensor instead

dreamflasher · October 15, 2020, 5:23pm

from fastai.vision.data import ImageDataLoaders
from fastai.vision.learner import cnn_learner
from fastai.vision.augment import aug_transforms
import pandas as pd
from fastai import vision

# Minimal reproducer for the "Cannot pickle CUDA storage" RuntimeError.
# Label table: per the description further down in this post, the first
# column holds the filename and the second column the binary cat/no-cat label.
df = pd.read_csv("/data/cats/labels.csv")

# Build the DataLoaders straight from the DataFrame; label_col=1 selects the
# second column as the label.
# NOTE(review): only batch_tfms are passed here. A later reply in this thread
# reports that item_tfms=Resize(224) is needed as well.
data = ImageDataLoaders.from_df(df=df, path="/", label_col=1, bs=100, batch_tfms=[
    *aug_transforms(size=224)], valid_pct=0.2)
# The architecture is looked up by name on vision.models.
learn = cnn_learner(data, getattr(vision.models, "resnet18"))
# Train with n_epoch=10; this is the call that raises in the traceback below.
learn.fit_one_cycle(10)

Results in:

RuntimeError                              Traceback (most recent call last)
<ipython-input-106-427e45e6948b> in <module>
      8 data = ImageDataLoaders.from_df(df=df, path="/", label_col=1, bs=100, batch_tfms=[*aug_transforms(size=224)], valid_pct=0.2)
      9 learn = cnn_learner(data, getattr(vision.models, "resnet18"))
---> 10 learn.fit_one_cycle(10)

~/.local/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

~/.local/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    205             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    206             self.n_epoch = n_epoch
--> 207             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    208 
    209     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     @log_args(but='cbs')

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    189 
    190     def _do_epoch(self):
--> 191         self._do_epoch_train()
    192         self._do_epoch_validate()
    193 

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    181     def _do_epoch_train(self):
    182         self.dl = self.dls.train
--> 183         self._with_events(self.all_batches, 'train', CancelTrainException)
    184 
    185     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    159     def all_batches(self):
    160         self.n_iter = len(self.dl)
--> 161         for o in enumerate(self.dl): self.one_batch(*o)
    162 
    163     def _do_one_batch(self):

~/.local/lib/python3.8/site-packages/fastai/data/load.py in __iter__(self)
    100         self.before_iter()
    101         self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 102         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
    103             if self.device is not None: b = to_device(b, self.device)
    104             yield self.after_batch(b)

~/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __init__(self, loader)
    735             #     before it starts, and __del__ tries to join but will get:
    736             #     AssertionError: can only join a started process.
--> 737             w.start()
    738             self._index_queues.append(index_queue)
    739             self._workers.append(w)

/usr/lib/python3.8/multiprocessing/process.py in start(self)
    119                'daemonic processes are not allowed to have children'
    120         _cleanup()
--> 121         self._popen = self._Popen(self)
    122         self._sentinel = self._popen.sentinel
    123         # Avoid a refcycle if the target function holds an indirect

/usr/lib/python3.8/multiprocessing/context.py in _Popen(process_obj)
    222     @staticmethod
    223     def _Popen(process_obj):
--> 224         return _default_context.get_context().Process._Popen(process_obj)
    225 
    226 class DefaultContext(BaseContext):

/usr/lib/python3.8/multiprocessing/context.py in _Popen(process_obj)
    282         def _Popen(process_obj):
    283             from .popen_spawn_posix import Popen
--> 284             return Popen(process_obj)
    285 
    286     class ForkServerProcess(process.BaseProcess):

/usr/lib/python3.8/multiprocessing/popen_spawn_posix.py in __init__(self, process_obj)
     30     def __init__(self, process_obj):
     31         self._fds = []
---> 32         super().__init__(process_obj)
     33 
     34     def duplicate_for_child(self, fd):

/usr/lib/python3.8/multiprocessing/popen_fork.py in __init__(self, process_obj)
     17         self.returncode = None
     18         self.finalizer = None
---> 19         self._launch(process_obj)
     20 
     21     def duplicate_for_child(self, fd):

/usr/lib/python3.8/multiprocessing/popen_spawn_posix.py in _launch(self, process_obj)
     45         try:
     46             reduction.dump(prep_data, fp)
---> 47             reduction.dump(process_obj, fp)
     48         finally:
     49             set_spawning_popen(None)

/usr/lib/python3.8/multiprocessing/reduction.py in dump(obj, file, protocol)
     58 def dump(obj, file, protocol=None):
     59     '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60     ForkingPickler(file, protocol).dump(obj)
     61 
     62 #

~/.local/lib/python3.8/site-packages/torch/multiprocessing/reductions.py in reduce_storage(storage)
    308     from . import get_sharing_strategy
    309     if storage.is_cuda:
--> 310         raise RuntimeError("Cannot pickle CUDA storage; try pickling a CUDA tensor instead")
    311     elif get_sharing_strategy() == 'file_system':
    312         metadata = storage._share_filename_()

RuntimeError: Cannot pickle CUDA storage; try pickling a CUDA tensor instead

The dataset consists of images of cats/no-cats, with the filename in the first column and a binary label in the second column. I have been using that dataset with fastai v1.

dreamflasher · October 18, 2020, 10:10am

I can’t reproduce the error myself anymore. My code was missing some imports:
from fastai.vision.all import * (although I’d prefer to import only what’s needed and avoid flake8 warnings),
and ImageDataLoaders needs item_tfms=Resize(224); it looks like batch_tfms=[*aug_transforms(size=224)] alone is not sufficient.

dreamflasher · October 21, 2020, 5:30pm

Now I am getting the very same error again in lr_find().

dreamflasher · October 22, 2020, 2:59pm

I can reproduce it now; the problem is setting num_workers > 0: https://github.com/fastai/fastai/issues/2899

dreamflasher · October 22, 2020, 3:42pm

set_start_method('fork', force=True) gets rid of the exception, but now there is no training progress bar anymore. Training works now, but it is twice as slow.