unet_learner with multi-GPU

Hi all,
I am trying to run unet_learner (the super-resolution one) on a multi-GPU machine, using fastai v2, but I get the following error.

My code:

from fastai.vision.all import *
from fastai.distributed import *
from fastai.vision.models.xresnet import *

path = Path('/home/ubuntu/.fastai/data/512/6/')
path_good = path/'input_image'
path_bad = path/'output_image'

base_loss = F.l1_loss
class FeatureLoss(Module):
    def __init__(self, m_feat, layer_ids, layer_wgts):
        self.m_feat = m_feat
        self.loss_features = [self.m_feat[i] for i in layer_ids]
        self.hooks = hook_outputs(self.loss_features, detach=False)
        self.wgts = layer_wgts
        self.metric_names = ['pixel',] + [f'feat_{i}' for i in range(len(layer_ids))
              ] + [f'gram_{i}' for i in range(len(layer_ids))]

    def make_features(self, x, clone=False):
        self.m_feat(x)
        return [(o.clone() if clone else o) for o in self.hooks.stored]

    def forward(self, input, target, reduction='mean'):
        out_feat = self.make_features(target, clone=True)
        in_feat = self.make_features(input)
        self.feat_losses = [base_loss(input,target,reduction=reduction)]
        self.feat_losses += [base_loss(f_in, f_out,reduction=reduction)*w
                             for f_in, f_out, w in zip(in_feat, out_feat, self.wgts)]
        self.feat_losses += [base_loss(gram_matrix(f_in), gram_matrix(f_out),reduction=reduction)*w**2 * 5e3
                             for f_in, f_out, w in zip(in_feat, out_feat, self.wgts)]
        if reduction=='none':
            self.feat_losses = [f.mean(dim=[1,2,3]) for f in self.feat_losses[:4]] + [f.mean(dim=[1,2]) for f in self.feat_losses[4:]]
        for n,l in zip(self.metric_names, self.feat_losses): setattr(self, n, l)
        return sum(self.feat_losses)

    def __del__(self): self.hooks.remove()

arch = resnet34
def gram_matrix(x):
    n,c,h,w = x.size()
    x = x.view(n, c, -1)
    return (x @ x.transpose(1,2))/(c*h*w)
def get_y(x):
    return path_good/x.name

bs,size=32,128
dblock = DataBlock(blocks=(ImageBlock, ImageBlock),
                   get_items=get_image_files,
                   get_y = get_y,
                   splitter=RandomSplitter(valid_pct=0.2, seed=42),
                   item_tfms=Resize(size),
                   batch_tfms=[Normalize.from_stats(*imagenet_stats)])
dls = dblock.dataloaders(path_bad, bs=bs, path=path, item_tfms=Resize(size))
dls.c = 3

#Feature loss
t = tensor(dls.valid_ds[0][1]).float().permute(2,0,1)/255.
t = torch.stack([t,t])

base_loss = F.l1_loss
vgg_m = vgg16_bn(True).features.cuda().eval()
vgg_m = vgg_m.requires_grad_(False)

blocks = [i-1 for i,o in enumerate(vgg_m.children()) if isinstance(o,nn.MaxPool2d)]
blocks, [vgg_m[i] for i in blocks]

feat_loss = FeatureLoss(vgg_m, blocks[2:5], [5,15,2])
learn = unet_learner(dls, arch, loss_func=feat_loss, metrics=LossMetrics(feat_loss.metric_names),
                      blur=True, norm_type=NormType.Weight)

with learn.distrib_ctx(sync_bn=False):
    learn.fine_tune(1)
learn.save("test_multi_gpu")

The error I get:

(base) ubuntu@ip-172-31-39-53:~/distributed_test$ accelerate launch distrib_unet.py -m fastai2.launch
epoch     train_loss  valid_loss  pixel     feat_0    feat_1    feat_2    gram_0    gram_1    gram_2    time
Traceback (most recent call last):----------------------------------------| 0.00% [0/37 00:00<00:00]
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 79, in <module>
    learn.fine_tune(1)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 168, in fine_tune
    self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 122, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 241, in fit
    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 230, in _do_fit
    self._with_events(self._do_epoch, 'epoch', CancelEpochException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 224, in _do_epoch
    self._do_epoch_train()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 216, in _do_epoch_train
    self._with_events(self.all_batches, 'train', CancelTrainException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 185, in all_batches
    for o in enumerate(self.dl): self.one_batch(*o)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 212, in one_batch
    self._with_events(self._do_one_batch, 'batch', CancelBatchException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 194, in _do_one_batch
    self.loss_grad = self.loss_func(self.pred, *self.yb)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 24, in forward
    out_feat = self.make_features(target, clone=True)
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 20, in make_features
    self.m_feat(x)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(input)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 447, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
Traceback (most recent call last):
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 79, in <module>
    learn.fine_tune(1)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 168, in fine_tune
    self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 122, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 241, in fit
    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 230, in _do_fit
    self._with_events(self._do_epoch, 'epoch', CancelEpochException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 224, in _do_epoch
    self._do_epoch_train()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 216, in _do_epoch_train
    self._with_events(self.all_batches, 'train', CancelTrainException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 185, in all_batches
    for o in enumerate(self.dl): self.one_batch(*o)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 212, in one_batch
    self._with_events(self._do_one_batch, 'batch', CancelBatchException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 194, in _do_one_batch
    self.loss_grad = self.loss_func(self.pred, *self.yb)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 24, in forward
    out_feat = self.make_features(target, clone=True)
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 20, in make_features
    self.m_feat(x)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(input)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 447, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:2 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
Traceback (most recent call last):
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 79, in <module>
    learn.fine_tune(1)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 168, in fine_tune
    self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 122, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 241, in fit
    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 230, in _do_fit
    self._with_events(self._do_epoch, 'epoch', CancelEpochException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 224, in _do_epoch
    self._do_epoch_train()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 216, in _do_epoch_train
    self._with_events(self.all_batches, 'train', CancelTrainException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 185, in all_batches
    for o in enumerate(self.dl): self.one_batch(*o)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 212, in one_batch
    self._with_events(self._do_one_batch, 'batch', CancelBatchException)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 194, in _do_one_batch
    self.loss_grad = self.loss_func(self.pred, *self.yb)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 24, in forward
    out_feat = self.make_features(target, clone=True)
  File "/home/ubuntu/distributed_test/distrib_unet.py", line 20, in make_features
    self.m_feat(x)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(input)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 447, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 126324 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 126325) of binary: /home/ubuntu/mambaforge/bin/python
Traceback (most recent call last):
  File "/home/ubuntu/mambaforge/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==1.11.0', 'console_scripts', 'torchrun')())
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
    return f(*args, **kwargs)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/run.py", line 724, in main
    run(args)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/run.py", line 715, in run
    elastic_launch(
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
distrib_unet.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2022-07-05_19:41:21
  host      : ip-172-31-39-53.ap-south-1.compute.internal
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 126326)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2022-07-05_19:41:21
  host      : ip-172-31-39-53.ap-south-1.compute.internal
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 126327)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2022-07-05_19:41:21
  host      : ip-172-31-39-53.ap-south-1.compute.internal
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 126325)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Traceback (most recent call last):
  File "/home/ubuntu/mambaforge/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 43, in main
    args.func(args)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/accelerate/commands/launch.py", line 562, in launch_command
    multi_gpu_launcher(args)
  File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/accelerate/commands/launch.py", line 306, in multi_gpu_launcher
    raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['torchrun', '--nproc_per_node', '4', 'distrib_unet.py', '-m', 'fastai2.launch']' returned non-zero exit status 1.

Any ideas on how to solve it?

Thanks,
Ankit

There should have been a much larger stack trace here; could you please post it?

Hi @muellerzr, I've updated the question with the entire stack trace.

Your issue here likely stems from this: in multi-GPU training each process runs on its own GPU, so any CUDA tensors you create yourself (such as your VGG feature model, which .cuda() puts on cuda:0) need to end up on that process's device. There are a few ways to do this, but the best would be to add the following at the start of your script:

from accelerate import Accelerator
accelerator = Accelerator()

From here, any time you call .cuda() or .to(...), replace it with:

.to(accelerator.device)
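
For example, the line in your script that builds the VGG feature extractor would become something like:

# before: vgg_m = vgg16_bn(True).features.cuda().eval()
vgg_m = vgg16_bn(True).features.to(accelerator.device).eval()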

Then call your Learner like normal. Note that this won't work well in a notebook, only as a script; it gets a bit more complicated from a notebook, but I can work on an answer to that tomorrow.


Thanks a lot @muellerzr, it works like a charm!
Eagerly waiting for the notebook solution as well.


Could you show your current solution? :smile:

Sure @muellerzr. It's just a one-line change, as you suggested; sharing the entire code again in case it helps someone :smiley:

from fastai.vision.all import *
from fastai.distributed import *
from fastai.vision.models.xresnet import *
from accelerate import Accelerator
accelerator = Accelerator()

path = Path('/home/ubuntu/.fastai/data/512/6/')
path_good = path/'input_image'
path_bad = path/'output_image'

base_loss = F.l1_loss
class FeatureLoss(Module):
    def __init__(self, m_feat, layer_ids, layer_wgts):
        self.m_feat = m_feat
        self.loss_features = [self.m_feat[i] for i in layer_ids]
        self.hooks = hook_outputs(self.loss_features, detach=False)
        self.wgts = layer_wgts
        self.metric_names = ['pixel',] + [f'feat_{i}' for i in range(len(layer_ids))
              ] + [f'gram_{i}' for i in range(len(layer_ids))]

    def make_features(self, x, clone=False):
        self.m_feat(x)
        return [(o.clone() if clone else o) for o in self.hooks.stored]

    def forward(self, input, target, reduction='mean'):
        out_feat = self.make_features(target, clone=True)
        in_feat = self.make_features(input)
        self.feat_losses = [base_loss(input,target,reduction=reduction)]
        self.feat_losses += [base_loss(f_in, f_out,reduction=reduction)*w
                             for f_in, f_out, w in zip(in_feat, out_feat, self.wgts)]
        self.feat_losses += [base_loss(gram_matrix(f_in), gram_matrix(f_out),reduction=reduction)*w**2 * 5e3
                             for f_in, f_out, w in zip(in_feat, out_feat, self.wgts)]
        if reduction=='none':
            self.feat_losses = [f.mean(dim=[1,2,3]) for f in self.feat_losses[:4]] + [f.mean(dim=[1,2]) for f in self.feat_losses[4:]]
        for n,l in zip(self.metric_names, self.feat_losses): setattr(self, n, l)
        return sum(self.feat_losses)

    def __del__(self): self.hooks.remove()


arch = resnet34
#arch = xresnet34
def gram_matrix(x):
    n,c,h,w = x.size()
    x = x.view(n, c, -1)
    return (x @ x.transpose(1,2))/(c*h*w)
def get_y(x):
    return path_good/x.name


bs,size=32,128
dblock = DataBlock(blocks=(ImageBlock, ImageBlock),
                   get_items=get_image_files,
                   #get_y=lambda x: path_good/x.name,
                   get_y = get_y,
                   splitter=RandomSplitter(valid_pct=0.2, seed=42),
                   item_tfms=Resize(size),
                   #batch_tfms=[*aug_transforms(max_zoom=2.), Normalize.from_stats(*imagenet_stats)])
                   batch_tfms=[Normalize.from_stats(*imagenet_stats)])
dls = dblock.dataloaders(path_bad, bs=bs, path=path, item_tfms=Resize(size))
dls.c = 3

#Feature loss
t = tensor(dls.valid_ds[0][1]).float().permute(2,0,1)/255.
t = torch.stack([t,t])

base_loss = F.l1_loss
#vgg_m = vgg16_bn(True).features.cuda().eval()
vgg_m = vgg16_bn(True).features.to(accelerator.device).eval()
vgg_m = vgg_m.requires_grad_(False)

blocks = [i-1 for i,o in enumerate(vgg_m.children()) if isinstance(o,nn.MaxPool2d)]
blocks, [vgg_m[i] for i in blocks]

feat_loss = FeatureLoss(vgg_m, blocks[2:5], [5,15,2])

learn = unet_learner(dls, arch, loss_func=feat_loss, metrics=LossMetrics(feat_loss.metric_names),
                      blur=True, norm_type=NormType.Weight)


with learn.distrib_ctx(sync_bn=False):
    learn.fine_tune(1)
learn.save("test_multi_gpu")

Any inputs for the notebook solution?