Part1 2020 : 02_Production - found at least two devices, cuda:0 and cpu

I’m trying to just ensure that the code actually runs. But when I get to this:

learn.fine_tune(4)

I get the error:

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

I’m on a dual GPU system (asymmetric GPUs). I have tried:

  1. when launching jupyter notebook, I first call set CUDA_VISIBLE_DEVICES=0 to ensure it only sees the first device
  2. I have tried this
    fastai.torch_core.defaults.device = 'cuda:0'
    and this
    fastai.torch_core.defaults.device = 'cuda'
    and this
    fastai.torch_core.defaults.device = 'cpu'

To no effect, same error.

Any other ideas? I’m running off of a git clone of latest everything.

This is bizarre. I scoured the internet and couldn’t find anybody with the same issue in FastAI, so I just posted mine, and now I saw yours.

I’m wondering if it’s a bug at this point.

Are you on the latest version of fastai? I’m getting this error and I built a new conda environment with the latest version of fastai from github along with the current pytorch on Cuda 11.

Yes. I did all of that today - I git cloned latest from fastcore/fastai/fastprogress etc, wrestled with my conda being out of date, got that updated, got everything “running” in a fresh conda env up until this point.

So maybe it is a bug?

do you have the full stack trace?


RuntimeError Traceback (most recent call last)
in
1 learn = cnn_learner(dls, resnet18, metrics=error_rate)
----> 2 learn.fine_tune(4)

C:\github\fastai_v4\fastai\fastai\callback\schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
155 “Fine tune with freeze for freeze_epochs then with unfreeze from epochs using discriminative LR”
156 self.freeze()
–> 157 self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
158 base_lr /= 2
159 self.unfreeze()

C:\github\fastai_v4\fastai\fastai\callback\schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
110 scheds = {‘lr’: combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
111 ‘mom’: combined_cos(pct_start, *(self.moms if moms is None else moms))}
–> 112 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
113
114 # Cell

C:\github\fastai_v4\fastai\fastai\learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
210 self.opt.set_hypers(lr=self.lr if lr is None else lr)
211 self.n_epoch = n_epoch
–> 212 self._with_events(self._do_fit, ‘fit’, CancelFitException, self._end_cleanup)
213
214 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

C:\github\fastai_v4\fastai\fastai\learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
–> 160 try: self(f’before_{event_type}’); f()
161 except ex: self(f’after_cancel_{event_type}’)
162 self(f’after_{event_type}’); final()

C:\github\fastai_v4\fastai\fastai\learner.py in _do_fit(self)
201 for epoch in range(self.n_epoch):
202 self.epoch=epoch
–> 203 self._with_events(self._do_epoch, ‘epoch’, CancelEpochException)
204
205 def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

C:\github\fastai_v4\fastai\fastai\learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
–> 160 try: self(f’before_{event_type}’); f()
161 except ex: self(f’after_cancel_{event_type}’)
162 self(f’after_{event_type}’); final()

C:\github\fastai_v4\fastai\fastai\learner.py in _do_epoch(self)
195
196 def _do_epoch(self):
–> 197 self._do_epoch_train()
198 self._do_epoch_validate()
199

C:\github\fastai_v4\fastai\fastai\learner.py in _do_epoch_train(self)
187 def _do_epoch_train(self):
188 self.dl = self.dls.train
–> 189 self._with_events(self.all_batches, ‘train’, CancelTrainException)
190
191 def _do_epoch_validate(self, ds_idx=1, dl=None):

C:\github\fastai_v4\fastai\fastai\learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
–> 160 try: self(f’before_{event_type}’); f()
161 except ex: self(f’after_cancel_{event_type}’)
162 self(f’after_{event_type}’); final()

C:\github\fastai_v4\fastai\fastai\learner.py in all_batches(self)
164 def all_batches(self):
165 self.n_iter = len(self.dl)
–> 166 for o in enumerate(self.dl): self.one_batch(*o)
167
168 def _do_one_batch(self):

C:\github\fastai_v4\fastai\fastai\data\load.py in iter(self)
111 if self.device is not None and multiprocessing.get_start_method().lower() == “fork”:
112 b = to_device(b, self.device)
–> 113 yield self.after_batch(b)
114 self.after_iter()
115 if hasattr(self, ‘it’): del(self.it)

C:\github\fastai_v4\fastcore\fastcore\transform.py in call(self, o)
196 self.fs.append(t)
197
–> 198 def call(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
199 def repr(self): return f"Pipeline: {’ -> '.join([f.name for f in self.fs if f.name != ‘noop’])}"
200 def getitem(self,i): return self.fs[i]

C:\github\fastai_v4\fastcore\fastcore\transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
148 for f in tfms:
149 if not is_enc: f = f.decode
–> 150 x = f(x, **kwargs)
151 return x
152

C:\github\fastai_v4\fastcore\fastcore\transform.py in call(self, x, **kwargs)
71 @property
72 def name(self): return getattr(self, ‘_name’, _get_name(self))
—> 73 def call(self, x, **kwargs): return self._call(‘encodes’, x, **kwargs)
74 def decode (self, x, **kwargs): return self._call(‘decodes’, x, **kwargs)
75 def repr(self): return f’{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}’

C:\github\fastai_v4\fastcore\fastcore\transform.py in _call(self, fn, x, split_idx, **kwargs)
81 def _call(self, fn, x, split_idx=None, **kwargs):
82 if split_idx!=self.split_idx and self.split_idx is not None: return x
—> 83 return self._do_call(getattr(self, fn), x, **kwargs)
84
85 def _do_call(self, f, x, **kwargs):

C:\github\fastai_v4\fastcore\fastcore\transform.py in do_call(self, f, x, **kwargs)
88 ret = f.returns(x) if hasattr(f,‘returns’) else None
89 return retain_type(f(x, **kwargs), x, ret)
—> 90 res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
91 return retain_type(res, x)
92

C:\github\fastai_v4\fastcore\fastcore\transform.py in (.0)
88 ret = f.returns(x) if hasattr(f,‘returns’) else None
89 return retain_type(f(x, **kwargs), x, ret)
—> 90 res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
91 return retain_type(res, x)
92

C:\github\fastai_v4\fastcore\fastcore\transform.py in do_call(self, f, x, **kwargs)
87 if f is None: return x
88 ret = f.returns(x) if hasattr(f,‘returns’) else None
—> 89 return retain_type(f(x, **kwargs), x, ret)
90 res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
91 return retain_type(res, x)

C:\github\fastai_v4\fastcore\fastcore\dispatch.py in call(self, *args, **kwargs)
116 elif self.inst is not None: f = MethodType(f, self.inst)
117 elif self.owner is not None: f = MethodType(f, self.owner)
–> 118 return f(*args, **kwargs)
119
120 def get(self, inst, owner):

C:\github\fastai_v4\fastai\fastai\data\transforms.py in encodes(self, x)
360 self.mean,self.std = x.mean(self.axes, keepdim=True),x.std(self.axes, keepdim=True)+1e-7
361
–> 362 def encodes(self, x:TensorImage): return (x-self.mean) / self.std
363 def decodes(self, x:TensorImage):
364 f = to_cpu if x.device.type==‘cpu’ else noop

C:\github\fastai_v4\fastai\fastai\torch_core.py in torch_function(self, func, types, args, kwargs)
327 convert=False
328 if _torch_handled(args, self._opt, func): convert,types = type(self),(torch.Tensor,)
–> 329 res = super().torch_function(func, types, args=args, kwargs=kwargs)
330 if convert: res = convert(res)
331 if isinstance(res, TensorBase): res.set_meta(self, as_copy=True)

C:\ProgramData\Anaconda3_2020\envs\fastAI2021\lib\site-packages\torch\tensor.py in torch_function(cls, func, types, args, kwargs)
960
961 with _C.DisableTorchFunction():
–> 962 ret = func(*args, **kwargs)
963 return _convert(ret, cls)
964

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Probably the same error as this one:

Okay, sure, but this is just the 02_production notebook from fastbook… I haven’t made any custom modifications that would cause it to break, other than pointing the path at the locations of the various repos that would normally be submodules but aren’t. The reason I’m doing that, as opposed to installing, is that fastbook doesn’t seem to like PyTorch 1.8.1: it complains about torchvision being too new (>0.9; current is 0.9.1) for it to function (which I sort of doubt…).

This is the “DL in 5 minutes” example where I’m supposed to just hook up my Azure key, pull some images and watch how DL works.

My guess is it is something about the Dataloader? I’ve tried setting the device on the dataloader when I get the bears dls, but to no avail.

Can I debug a jupyter notebook in VSCode or PyCharm? I feel like my debugging is drastically slowed by the jupyter notebook. If I could inspect a few variables in the call stack, I could identify where the problem is in minutes. In jupyter notebook…I’m not sure how I would do that, and I think I end up needing to write a line of code for every value I want to inspect. I know the lessons said there was a jupyter notebook with all the text ripped out, so maybe I can find that, and cobble together something that works in a .py file that I can debug.

So after wrestling with my Conda install all day, I finally distilled the 02 production training to this:

# To get a conda env setup:
# conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c conda-forge
# conda install -c conda-forge python-graphviz
# conda install -c anaconda graphviz
# pip install azure-cognitiveservices-search-imagesearch

import sys
import os

sys.path.append("..")
sys.path.append("../fastai")
sys.path.append("../fastcore")
sys.path.append("../fastprogress")
sys.path.append("../nbdev")
sys.path.append("../fastbook")

from fastbook.utils import *

from fastai.vision.widgets import *

def main():
    """Run the 02_production bear-classifier training as a plain script.

    When the ``bears`` directory does not exist yet, it is created and filled
    with Bing image-search downloads for each bear type. The images are then
    wrapped in a DataBlock/DataLoaders and a resnet18 learner is fine-tuned
    for 4 epochs.
    """
    # Azure key is read from the environment; 'XXX' is the placeholder fallback.
    key = os.environ.get('AZURE_SEARCH_KEY', 'XXX')

    bear_types = ('grizzly', 'black', 'teddy')
    path = Path('bears')

    # Only query the search API when there is no dataset folder at all.
    if not path.exists():
        path.mkdir()
        for bear_kind in bear_types:
            target_dir = path/bear_kind
            target_dir.mkdir(exist_ok=True)
            hits = search_images_bing(key, f'{bear_kind} bear')
            image_urls = hits.attrgot('contentUrl')
            download_images(target_dir, urls=image_urls)

    fns = get_image_files(path)
    # verify_images(fns) was taking forever or hanging, so the broken images
    # were removed by hand instead of unlinking the failures here.

    # 80/20 random train/validation split with a fixed seed.
    splitter = RandomSplitter(valid_pct=0.2, seed=42)
    resize = Resize(128)
    bears = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=get_image_files,
        splitter=splitter,
        get_y=parent_label,
        item_tfms=resize,
    )

    # num_workers=0 keeps batch loading in the main process.
    dls = bears.dataloaders(path, num_workers=0)

    learn = cnn_learner(dls, resnet18, metrics=error_rate)
    # Crashes here with "found on two devices, cuda:0 and cpu".
    learn.fine_tune(4)
    interp = ClassificationInterpretation.from_learner(learn)

# Entry-point guard: run the reproduction only when this file is executed
# as a script, not when it is imported.
if __name__ == '__main__':
    main()

So now I can see that the failure is in Normalize.encodes (fastai/data/transforms.py). The relevant methods are:
# Alternate constructor: builds the transform from explicit `mean`/`std` statistics.
# The stats go through `broadcast_vec(dim, ndim, mean, std, cuda=cuda)` and the
# results are unpacked as the positional arguments of `cls`.
# NOTE(review): with the default `cuda=True` the stats presumably end up on the
# GPU — confirm against `broadcast_vec`.
@classmethod
def from_stats(cls, mean, std, dim=1, ndim=4, cuda=True): return cls(*broadcast_vec(dim, ndim, mean, std, cuda=cuda))

def setups(self, dl:DataLoader):
    # Fill in missing statistics from the data; does nothing when both
    # `mean` and `std` are already set.
    if self.mean is None or self.std is None:
        # Keep only the first element of one batch from `dl`
        # (presumably the inputs — TODO confirm).
        x,*_ = dl.one_batch()
        # Reduce over `self.axes`, keeping dims. The 1e-7 added to std
        # presumably guards against division by zero — TODO confirm.
        self.mean,self.std = x.mean(self.axes, keepdim=True),x.std(self.axes, keepdim=True)+1e-7

# Normalize `x` as (x - mean) / std. This is the expression that raises the
# "Expected all tensors to be on the same device" RuntimeError when `x` and
# the stored statistics are on different devices.
def encodes(self, x:TensorImage): return (x-self.mean) / self.std

where x is on the CPU and self.mean is on the GPU.

But I don’t even understand how Normalize is getting put as a transform, or more importantly why my data isn’t being put on the GPU by default.

From what I can tell, this is the problem:
In data/load.py, in `__iter__`:

# fix issue 2899. If the process start method isn't fork, the data will be copied to cuda in learner one_batch.
if self.device is not None and multiprocessing.get_start_method().lower() == "fork":
    b = to_device(b, self.device)

The start method is `spawn` on Windows 10, so this branch is skipped and the batch stays on the CPU.
And `one_batch` does not get called before the error happens, I imagine because the `after_batch` transforms run on the batch first, and those transforms expect the data to already be on the GPU.

Changing it to this makes it work:

    if self.device is not None:
        b = to_device(b, self.device)