Hi all,
I am trying to run unet_learner(the super res one ) on multi gpu machine. Using fastai v2. But I get the following error.
My code:
from fastai.vision.all import *
from fastai.distributed import *
from fastai.vision.models.xresnet import *
path = Path('/home/ubuntu/.fastai/data/512/6/')
path_good = path/'input_image'
path_bad = path/'output_image'
base_loss = F.l1_loss
class FeatureLoss(Module):
def __init__(self, m_feat, layer_ids, layer_wgts):
self.m_feat = m_feat
self.loss_features = [self.m_feat[i] for i in layer_ids]
self.hooks = hook_outputs(self.loss_features, detach=False)
self.wgts = layer_wgts
self.metric_names = ['pixel',] + [f'feat_{i}' for i in range(len(layer_ids))
] + [f'gram_{i}' for i in range(len(layer_ids))]
def make_features(self, x, clone=False):
self.m_feat(x)
return [(o.clone() if clone else o) for o in self.hooks.stored]
def forward(self, input, target, reduction='mean'):
out_feat = self.make_features(target, clone=True)
in_feat = self.make_features(input)
self.feat_losses = [base_loss(input,target,reduction=reduction)]
self.feat_losses += [base_loss(f_in, f_out,reduction=reduction)*w
for f_in, f_out, w in zip(in_feat, out_feat, self.wgts)]
self.feat_losses += [base_loss(gram_matrix(f_in), gram_matrix(f_out),reduction=reduction)*w**2 * 5e3
for f_in, f_out, w in zip(in_feat, out_feat, self.wgts)]
if reduction=='none':
self.feat_losses = [f.mean(dim=[1,2,3]) for f in self.feat_losses[:4]] + [f.mean(dim=[1,2]) for f in self.feat_losses[4:]]
for n,l in zip(self.metric_names, self.feat_losses): setattr(self, n, l)
return sum(self.feat_losses)
def __del__(self): self.hooks.remove()
arch = resnet34
def gram_matrix(x):
n,c,h,w = x.size()
x = x.view(n, c, -1)
return (x @ x.transpose(1,2))/(c*h*w)
def get_y(x):
return path_good/x.name
bs,size=32,128
dblock = DataBlock(blocks=(ImageBlock, ImageBlock),
get_items=get_image_files,
get_y = get_y,
splitter=RandomSplitter(valid_pct=0.2, seed=42),
item_tfms=Resize(size),
batch_tfms=[ Normalize.from_stats(*imagenet_stats)])
dls = dblock.dataloaders(path_bad, bs=bs, path=path, item_tfms=Resize(size))
dls.c = 3
#Feature loss
t = tensor(dls.valid_ds[0][1]).float().permute(2,0,1)/255.
t = torch.stack([t,t])
base_loss = F.l1_loss
vgg_m = vgg16_bn(True).features.cuda().eval()
vgg_m = vgg_m.requires_grad_(False)
blocks = [i-1 for i,o in enumerate(vgg_m.children()) if isinstance(o,nn.MaxPool2d)]
blocks, [vgg_m[i] for i in blocks]
feat_loss = FeatureLoss(vgg_m, blocks[2:5], [5,15,2])
learn = unet_learner(dls, arch, loss_func=feat_loss, metrics=LossMetrics(feat_loss.metric_names),
blur=True, norm_type=NormType.Weight)
with learn.distrib_ctx(sync_bn=False):
learn.fine_tune(1)
learn.save("test_multi_gpu")
Error which I get:
(base) ubuntu@ip-172-31-39-53:~/distributed_test$ accelerate launch distrib_unet.py -m fastai2.launch
epoch train_loss valid_loss pixel feat_0 feat_1 feat_2 gram_0 gram_1 gram_2 time
Traceback (most recent call last):----------------------------------------| 0.00% [0/37 00:00<00:00]
File "/home/ubuntu/distributed_test/distrib_unet.py", line 79, in <module>
learn.fine_tune(1)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 168, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 122, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 241, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 230, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 224, in _do_epoch
self._do_epoch_train()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 216, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 185, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 212, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 194, in _do_one_batch
self.loss_grad = self.loss_func(self.pred, *self.yb)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/distributed_test/distrib_unet.py", line 24, in forward
out_feat = self.make_features(target, clone=True)
File "/home/ubuntu/distributed_test/distrib_unet.py", line 20, in make_features
self.m_feat(x)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 447, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
Traceback (most recent call last):
File "/home/ubuntu/distributed_test/distrib_unet.py", line 79, in <module>
learn.fine_tune(1)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 168, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 122, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 241, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 230, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 224, in _do_epoch
self._do_epoch_train()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 216, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 185, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 212, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 194, in _do_one_batch
self.loss_grad = self.loss_func(self.pred, *self.yb)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/distributed_test/distrib_unet.py", line 24, in forward
out_feat = self.make_features(target, clone=True)
File "/home/ubuntu/distributed_test/distrib_unet.py", line 20, in make_features
self.m_feat(x)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 447, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:2 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
Traceback (most recent call last):
File "/home/ubuntu/distributed_test/distrib_unet.py", line 79, in <module>
learn.fine_tune(1)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 168, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/callback/schedule.py", line 122, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 241, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 230, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 224, in _do_epoch
self._do_epoch_train()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 216, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 185, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 212, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 179, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/fastai/learner.py", line 194, in _do_one_batch
self.loss_grad = self.loss_func(self.pred, *self.yb)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/distributed_test/distrib_unet.py", line 24, in forward
out_feat = self.make_features(target, clone=True)
File "/home/ubuntu/distributed_test/distrib_unet.py", line 20, in make_features
self.m_feat(x)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 447, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 126324 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 126325) of binary: /home/ubuntu/mambaforge/bin/python
Traceback (most recent call last):
File "/home/ubuntu/mambaforge/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.11.0', 'console_scripts', 'torchrun')())
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/run.py", line 724, in main
run(args)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/run.py", line 715, in run
elastic_launch(
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
distrib_unet.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2022-07-05_19:41:21
host : ip-172-31-39-53.ap-south-1.compute.internal
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 126326)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2022-07-05_19:41:21
host : ip-172-31-39-53.ap-south-1.compute.internal
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 126327)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2022-07-05_19:41:21
host : ip-172-31-39-53.ap-south-1.compute.internal
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 126325)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Traceback (most recent call last):
File "/home/ubuntu/mambaforge/bin/accelerate", line 8, in <module>
sys.exit(main())
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 43, in main
args.func(args)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/accelerate/commands/launch.py", line 562, in launch_command
multi_gpu_launcher(args)
File "/home/ubuntu/mambaforge/lib/python3.9/site-packages/accelerate/commands/launch.py", line 306, in multi_gpu_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['torchrun', '--nproc_per_node', '4', 'distrib_unet.py', '-m', 'fastai2.launch']' returned non-zero exit status 1.
(base) ubuntu@ip-172-31-39-53:~/di
Any ideas on how to solve it?
Thanks,
Ankit