Prediction fails on Windows

balnazzar · March 22, 2020, 8:38am

Hi. The following happens on Windows, but not on Linux.

After a successful training loop (as in the first lesson), I try to predict on either a custom PIL image or an item provided by the dls, and it fails (stack trace below). However, if I convert the image to a PyTorch tensor and predict with learn.model() directly, it succeeds.

Stack trace:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in _do_epoch_validate(self, ds_idx, dl)
    173             self.dl = dl;                                    self('begin_validate')
--> 174             with torch.no_grad(): self.all_batches()
    175         except CancelValidException:                         self('after_cancel_validate')

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in all_batches(self)
    141         self.n_iter = len(self.dl)
--> 142         for o in enumerate(self.dl): self.one_batch(*o)
    143 

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\data\load.py in __iter__(self)
     96         self.before_iter()
---> 97         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
     98             if self.device is not None: b = to_device(b, self.device)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
    718             #     AssertionError: can only join a started process.
--> 719             w.start()
    720             self._index_queues.append(index_queue)

c:\users\poko\anaconda3\envs\f2\lib\multiprocessing\process.py in start(self)
    111         _cleanup()
--> 112         self._popen = self._Popen(self)
    113         self._sentinel = self._popen.sentinel

c:\users\poko\anaconda3\envs\f2\lib\multiprocessing\context.py in _Popen(process_obj)
    222     def _Popen(process_obj):
--> 223         return _default_context.get_context().Process._Popen(process_obj)
    224 

c:\users\poko\anaconda3\envs\f2\lib\multiprocessing\context.py in _Popen(process_obj)
    321             from .popen_spawn_win32 import Popen
--> 322             return Popen(process_obj)
    323 

c:\users\poko\anaconda3\envs\f2\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
     88                 reduction.dump(prep_data, to_child)
---> 89                 reduction.dump(process_obj, to_child)
     90             finally:

c:\users\poko\anaconda3\envs\f2\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
     59     '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60     ForkingPickler(file, protocol).dump(obj)
     61 

c:\users\poko\anaconda3\envs\f2\lib\site-packages\torch\multiprocessing\reductions.py in reduce_tensor(tensor)
    241          event_handle,
--> 242          event_sync_required) = storage._share_cuda_()
    243         tensor_offset = tensor.storage_offset()

RuntimeError: cuda runtime error (801) : operation not supported at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\torch/csrc/generic/StorageSharing.cpp:245

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
<ipython-input-95-3b6ef5ec6f96> in <module>
----> 1 learn.predict(img)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in predict(self, item, rm_type_tfms, with_input)
    228     def predict(self, item, rm_type_tfms=None, with_input=False):
    229         dl = self.dls.test_dl([item], rm_type_tfms=rm_type_tfms)
--> 230         inp,preds,_,dec_preds = self.get_preds(dl=dl, with_input=True, with_decoded=True)
    231         dec = self.dls.decode_batch((*tuplify(inp),*tuplify(dec_preds)))[0]
    232         i = getattr(self.dls, 'n_inp', -1)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, inner, **kwargs)
    216             for mgr in ctx_mgrs: stack.enter_context(mgr)
    217             self(event.begin_epoch if inner else _before_epoch)
--> 218             self._do_epoch_validate(dl=dl)
    219             self(event.after_epoch if inner else _after_epoch)
    220             if act is None: act = getattr(self.loss_func, 'activation', noop)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in _do_epoch_validate(self, ds_idx, dl)
    175         except CancelValidException:                         self('after_cancel_validate')
    176         finally:
--> 177             dl,*_ = change_attrs(dl, names, old, has);       self('after_validate')
    178 
    179     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in __call__(self, event_name)
    121     def ordered_cbs(self, cb_func): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, cb_func)]
    122 
--> 123     def __call__(self, event_name): L(event_name).map(self._call_one)
    124     def _call_one(self, event_name):
    125         assert hasattr(event, event_name)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastcore\foundation.py in map(self, f, *args, **kwargs)
    360              else f.format if isinstance(f,str)
    361              else f.__getitem__)
--> 362         return self._new(map(g, self))
    363 
    364     def filter(self, f, negate=False, **kwargs):

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastcore\foundation.py in _new(self, items, *args, **kwargs)
    313     @property
    314     def _xtra(self): return None
--> 315     def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
    316     def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
    317     def copy(self): return self._new(self.items.copy())

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastcore\foundation.py in __call__(cls, x, *args, **kwargs)
     39             return x
     40 
---> 41         res = super().__call__(*((x,) + args), **kwargs)
     42         res._newchk = 0
     43         return res

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastcore\foundation.py in __init__(self, items, use_list, match, *rest)
    304         if items is None: items = []
    305         if (use_list is not None) or not _is_array(items):
--> 306             items = list(items) if use_list else _listify(items)
    307         if match is not None:
    308             if is_coll(match): match = len(match)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastcore\foundation.py in _listify(o)
    240     if isinstance(o, list): return o
    241     if isinstance(o, str) or _is_array(o): return [o]
--> 242     if is_iter(o): return list(o)
    243     return [o]
    244 

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastcore\foundation.py in __call__(self, *args, **kwargs)
    206             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    207         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 208         return self.fn(*fargs, **kwargs)
    209 
    210 # Cell

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in _call_one(self, event_name)
    124     def _call_one(self, event_name):
    125         assert hasattr(event, event_name)
--> 126         [cb(event_name) for cb in sort_by_run(self.cbs)]
    127 
    128     def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\learner.py in <listcomp>(.0)
    124     def _call_one(self, event_name):
    125         assert hasattr(event, event_name)
--> 126         [cb(event_name) for cb in sort_by_run(self.cbs)]
    127 
    128     def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\callback\core.py in __call__(self, event_name)
     21         _run = (event_name not in _inner_loop or (self.run_train and getattr(self, 'training', True)) or
     22                (self.run_valid and not getattr(self, 'training', False)))
---> 23         if self.run and _run: getattr(self, event_name, noop)()
     24         if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
     25 

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\callback\core.py in after_validate(self)
     92     def after_validate(self):
     93         "Concatenate all recorded tensors"
---> 94         if self.with_input:     self.inputs  = detuplify(to_concat(self.inputs, dim=self.concat_dim))
     95         if not self.save_preds: self.preds   = detuplify(to_concat(self.preds, dim=self.concat_dim))
     96         if not self.save_targs: self.targets = detuplify(to_concat(self.targets, dim=self.concat_dim))

c:\users\poko\anaconda3\envs\f2\lib\site-packages\fastai2\torch_core.py in to_concat(xs, dim)
    211 def to_concat(xs, dim=0):
    212     "Concat the element in `xs` (recursively if they are tuples/lists of tensors)"
--> 213     if is_listy(xs[0]): return type(xs[0])([to_concat([x[i] for x in xs], dim=dim) for i in range_of(xs[0])])
    214     if isinstance(xs[0],dict):  return {k: to_concat([x[k] for x in xs], dim=dim) for k in xs[0].keys()}
    215     #We may receives xs that are not concatenatable (inputs of a text classifier for instance),

IndexError: list index out of range

pdb shows that xs is an empty list at the moment the error is raised… It seems Windows is interfering with something, but I cannot figure out what…

sgugger · March 22, 2020, 1:55pm

This is a multiprocessing error; I suspect you just didn’t set your num_workers to 0.

s.s.o · March 22, 2020, 2:54pm

I think you are right… The StorageSharing error comes from the number of workers, which must be set to 0…

balnazzar · March 23, 2020, 8:56am

So the culprit is again MP… Thanks guys!