I was stuck on this for a couple of days (weeks even but I have abandoned this project in the meantime and am just now returning to it).
I am running a fairly complicated model and I want to use a fastai pre-trained model as part of it. The problem is that the model trains on GPU, so shuffling data back and forth between GPU and CPU makes the model almost impossible/impractical to run. This has not been working despite setting cpu=False when calling load_learner.
To replicate:
- load any model e.g. camvid:
learn = load_learner('stage1', cpu=False)
- create a test tensor:
test_v1 = torch.tensor(np.ndarray((512,512,3))).to('cuda')
- run prediction:
learn.predict(test_v1)
yields:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-2886587ca7d8> in <module>()
----> 1 learn.predict(test_v1)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in predict(self, item, rm_type_tfms)
331 def predict(self, item, rm_type_tfms=None):
332 dl = self.dls.test_dl([item], rm_type_tfms=rm_type_tfms)
--> 333 inp,preds,_,dec_preds = self.get_preds(dl=dl, with_input=True, with_decoded=True)
334 i = getattr(self.dls, 'n_inp', -1)
335 full_dec = self.dls.decode_batch((*tuplify(inp),*tuplify(dec_preds)))[0][i:]
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, **kwargs)
319 for mgr in ctx_mgrs: stack.enter_context(mgr)
320 self(_before_epoch)
--> 321 self._do_epoch_validate(dl=dl)
322 self(_after_epoch)
323 if act is None: act = getattr(self.loss_func, 'activation', noop)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in _do_epoch_validate(self, ds_idx, dl)
278 dl,old,has = change_attrs(dl, names, [False,False])
279 self.dl = dl; self('begin_validate')
--> 280 with torch.no_grad(): self.all_batches()
281 except CancelValidException: self('after_cancel_validate')
282 finally:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in all_batches(self)
246 def all_batches(self):
247 self.n_iter = len(self.dl)
--> 248 for o in enumerate(self.dl): self.one_batch(*o)
249
250 def one_batch(self, i, b):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/load.py in __iter__(self)
95 self.randomize()
96 self.before_iter()
---> 97 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
98 if self.device is not None: b = to_device(b, self.device)
99 yield self.after_batch(b)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
817 else:
818 del self.task_info[idx]
--> 819 return self._process_data(data)
820
821 next = __next__ # Python 2 compatibility
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _process_data(self, data)
844 self._try_put_index()
845 if isinstance(data, ExceptionWrapper):
--> 846 data.reraise()
847 return data
848
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/_utils.py in reraise(self)
367 # (https://bugs.python.org/issue2651), so we work around it.
368 msg = KeyErrorMessage(msg)
--> 369 raise self.exc_type(msg)
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
data = fetcher.fetch(index)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
data = next(self.dataset_iter)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/load.py", line 106, in create_batches
yield from map(self.do_batch, self.chunkify(res))
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/utils.py", line 270, in chunked
res = list(itertools.islice(it, cs))
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/load.py", line 119, in do_item
try: return self.after_item(self.create_item(s))
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/load.py", line 125, in create_item
def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/core.py", line 265, in __getitem__
res = tuple([tl[it] for tl in self.tls])
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/core.py", line 265, in <listcomp>
res = tuple([tl[it] for tl in self.tls])
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/core.py", line 242, in __getitem__
return self._after_item(res) if is_indexer(idx) else res.map(self._after_item)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/data/core.py", line 206, in _after_item
def _after_item(self, o): return self.tfms(o)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/transform.py", line 185, in __call__
def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/transform.py", line 136, in compose_tfms
x = f(x, **kwargs)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/transform.py", line 71, in __call__
def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/transform.py", line 82, in _call
if self.use_as_item or not is_listy(x): return self._do_call(f, x, **kwargs)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/transform.py", line 87, in _do_call
return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/dispatch.py", line 98, in __call__
return f(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/vision/core.py", line 87, in create
if isinstance(fn,Tensor): fn = fn.numpy()
TypeError: can't convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
Things I have tried:
- I have looked into fastai2 and it does not have the `defaults` or `config` attributes, unlike (it seems?) v1.
- `torch.device` does not seem to help.
- Setting `self.learner.model = self.learner.model.to('cuda')` did not help either.
- Searching through the forums to see if anyone has had a similar issue.
If there’s any questions or detail I could add to make this clearer, please let me know!
I really do not feel like I (yet) know enough about the fastai2 internals to solve this on my own. So any help would be deeply appreciated!