I’m trying to run the simple Cats vs Dogs fine_tune tutorial code using distributed training (DDP - with multiple GPUs on a single machine). I followed the DDP tutorial and sample code.
The code below works when you run it without DDP:
python3 tutorial_ddp.py
but it throws the error RuntimeError: No grad accumulator for a saved leaf!
when running via DDP:
python3 -m fastai.launch tutorial_ddp.py
Code
import os
from fastai.vision.all import *
from fastai.distributed import *
path = rank0_first(untar_data, URLs.PETS)
files = get_image_files(path/"images")
def label_func(f):
return f[0].isupper()
dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(224))
learn = cnn_learner(dls, resnet34, metrics=error_rate)
with learn.distrib_ctx():
learn.fine_tune(1)
# only save on the first instance or if not running in DDP mode
if 'RANK' not in os.environ or int(os.environ['RANK']) == 0:
learn.export(f'{os.environ["HOME"]}/tutorial_ddp.pk')
Stacktrace
Traceback (most recent call last):
File "tutorial_ddp.py", line 16, in <module>
learn.fine_tune(1)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/callback/schedule.py", line 157, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 184, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/learner.py", line 177, in _do_one_batch
self.loss_grad.backward()
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/tensor.py", line 220, in backward
create_graph=create_graph)
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/overrides.py", line 1060, in handle_torch_function
result = overloaded_arg.__torch_function__(public_api, types, args, kwargs)
File "/home/jupyter/.local/lib/python3.7/site-packages/fastai/torch_core.py", line 325, in __torch_function__
res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/tensor.py", line 995, in __torch_function__
ret = func(*args, **kwargs)
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/autograd/function.py", line 89, in apply
return self._forward_cls.backward(self, *args) # type: ignore
File "/home/jupyter/.local/lib/python3.7/site-packages/torch/nn/modules/_functions.py", line 58, in backward
saved_input, weight, mean, invstd, count_tensor = self.saved_tensors
RuntimeError: No grad accumulator for a saved leaf!
Any idea what is causing this issue?