I’m having some trouble using multiple GPUs to train a language model on fastai v2. Following the documentation and the sample, I came up with this:
from fastai import *
from fastai.text import *
from fastai.text.all import *
from fastai.distributed import *

path = Path('/mnt/harddrive/text_files')
imdb = DataBlock(blocks=(TextBlock.from_folder(path, is_lm=True), CategoryBlock),
                 get_items=get_text_files,
                 get_y=parent_label,
                 splitter=RandomSplitter(valid_pct=0.2, seed=1))
dbunch = imdb.dataloaders(path, bs=64, seq_len=80, num_workers=0)
learn_lm = language_model_learner(dbunch, AWD_LSTM).to_fp16()
learn_lm.freeze()

with learn_lm.distrib_ctx():
    learn_lm.fit_one_cycle(1, 0.05, moms=(0.8,0.7,0.8))
When training with a smaller dataset, it trains properly most of the time; with a larger dataset, it always fails. The error is almost identical in both cases; the only part that ever changes is the “and [77] at entry 32” part:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [77] at entry 32
The difference between it working and not working can be as subtle as changing the seed on the RandomSplitter. I’ve added some debug lines and included the output below.
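For reference, these are roughly the debug lines I added to dist_simple.py (paraphrased; the exact prints may differ slightly, but the labels match the output below):

import torch
from fastai.torch_core import rank_distrib, num_distrib

# Each launched process reports the distributed context it sees
print(f"rank_distrib(): {rank_distrib()}")
print(f"num_distrib(): {num_distrib()}")
print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")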
First attempt, with seed: 1
python3 -m fastai.launch dist_simple.py
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 1
bs: 64
seq_len: 80
Learning
dbunch loaded
_
rank_distrib(): 1
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
epoch train_loss valid_loss time
0 4.363470 3.997428 00:00
Second attempt, with seed: 3
python3 -m fastai.launch dist_simple.py
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 3
bs: 64
seq_len: 80
Learning
dbunch loaded
_
rank_distrib(): 1
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
epoch train_loss valid_loss time
Traceback (most recent call last):███████████████████████████------------------------------------------------| 50.00% [2/4 00:00<00:00 4.7127]
File "/home/chess/project/training/dist_simple.py", line 51, in <module>
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7,0.8))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 101, in __iter__
for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 475, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
data = next(self.dataset_iter)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 110, in create_batches
yield from map(self.do_batch, self.chunkify(res))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 133, in do_batch
def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 132, in create_batch
def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in fa_collate
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in <listcomp>
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 47, in fa_collate
return (default_collate(t) if isinstance(b, _collate_types)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/torch_core.py", line 325, in __torch_function__
res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/tensor.py", line 995, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [77] at entry 32
Third attempt, with seed: 0. Everything is the same except the last line:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [67] at entry 32
Fourth attempt, an example of a failure with the batch size changed from 64 to 32; note that the “entry” in the stack error at the end changed from 32 to 16:
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 1
bs: 32
seq_len: 80
Learning
dbunch loaded
_
rank_distrib(): 1
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
epoch train_loss valid_loss time
Traceback (most recent call last):███████████████████████████████████████████████████------------------------| 75.00% [6/8 00:00<00:00 4.2810]
File "/home/chess/project/training/dist_simple.py", line 51, in <module>
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7,0.8))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 101, in __iter__
for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 475, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
data = next(self.dataset_iter)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 110, in create_batches
yield from map(self.do_batch, self.chunkify(res))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 133, in do_batch
def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 132, in create_batch
def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in fa_collate
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in <listcomp>
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 47, in fa_collate
return (default_collate(t) if isinstance(b, _collate_types)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/torch_core.py", line 325, in __torch_function__
res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/tensor.py", line 995, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [14] at entry 16
When I run it with a single GPU, it works 100% of the time:
python3 -m fastai.launch --gpus=0 dist_simple.py
Things I’ve tried:
- Different versions of the dataset: one with many smaller files, one with a few larger files, etc.
- Using this code instead of the DataBlock:
dbunch = TextDataLoaders.from_folder(path, is_lm=True, valid_pct=0.1)
- Different combinations of bs, seq_len, valid_pct, and num_workers.
- GrandparentSplitter instead of RandomSplitter.
- Adding drop_last=True to the dataloader (see the sketch after this list).
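For clarity, this is how I passed drop_last (a minimal sketch; I’m assuming extra keyword arguments to .dataloaders() get forwarded to the underlying fastai DataLoader):

# Assumption: drop_last is forwarded to the fastai DataLoader; it did not change the error for me
dbunch = imdb.dataloaders(path, bs=64, seq_len=80, num_workers=0, drop_last=True)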
My analysis:
Based on this post by sgugger, if there happens to be a remainder in the last batch (not enough data to fill the batch completely), it’s dropped:
I think this issue affects language models specifically, since classifiers pad the data when it doesn’t line up. I believe the remainder sometimes isn’t being dropped, which produces the “stack expects each tensor to be equal size” error when using a distributed language-model learner with multiple GPUs. That would also explain why the “entry” number in the error always falls at the boundary between the GPUs’ shares of the batch: it can be calculated from the batch size and the world size (number of GPUs) as entry = bs / world_size.
In the examples above:
world_size: 2, bs: 64 produces:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [77] at entry 32
world_size: 2, bs: 32 produces:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [14] at entry 16
This matches the math I see in the class DistributedDL(TfmdDL): section of the documentation:
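To double-check the short-batch theory, this is the kind of diagnostic I have in mind (a hypothetical check of my own, not something from the docs): scan the plain, non-distributed training DataLoader for any batch whose sequence length is shorter than seq_len.

# Hypothetical sanity check: look for a "short" batch that would break torch.stack
for i, (xb, yb) in enumerate(dbunch.train):
    if xb.shape[1] != 80:  # the seq_len used above
        print(f"batch {i}: x shape {tuple(xb.shape)} (shorter than seq_len)")

If the plain loader collates cleanly, the problem would be specific to how DistributedDL re-slices the samples across ranks.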
Am I on the right track? If my analysis is correct and drop_last isn’t being applied properly, any ideas on how I can address that?
Thanks!
Update: I’ve gone into python3.8/site-packages/fastai/data/load.py and hard-coded drop_last to always be True, but the error is unchanged.
Update 2: While I had seen this post before, I didn’t think it was related. After a deeper dive, I see @pierreguillou had the same problem last July with no solution:
So I tried with DataParallel instead of DistributedDataParallel:
with learn_lm.parallel_ctx():
    learn_lm.fit_one_cycle(1, 0.05)
but now I’m getting this:
python3 -m fastai.launch dist_simple.py
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 3
bs: 64
seq_len: 32
dbunch loaded
_
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:30: UserWarning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
epoch train_loss valid_loss time
rank_distrib(): 1--------------------------------------------------------------------------------------------| 0.00% [0/9 00:00<00:00]
Learning
/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:30: UserWarning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
epoch train_loss valid_loss time
Traceback (most recent call last):---------------------------------------------------------------------------| 0.00% [0/10 00:00<00:00]
File "/home/chess/project/training/dist_simple.py", line 57, in <module>
learn_lm.fit_one_cycle(1, lr)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 184, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 169, in _do_one_batch
self.pred = self.model(*self.xb)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 153, in forward
raise RuntimeError("module must have its parameters and buffers "
RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cuda:1
Traceback (most recent call last):
File "/home/chess/project/training/dist_simple.py", line 57, in <module>
learn_lm.fit_one_cycle(1, lr)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 184, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 169, in _do_one_batch
self.pred = self.model(*self.xb)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 161, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/_utils.py", line 428, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/text/models/awdlstm.py", line 106, in forward
output, new_h = rnn(output, self.hidden[l])
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/text/models/awdlstm.py", line 53, in forward
return self.module(*args)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 581, in forward
result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:1 and hidden tensor at cuda:0
I don’t think there’s a reliable way to train a language model on v2 with multiple GPUs at the moment. Moving forward with single GPU training!
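If I ever revisit this, one thing I haven’t tried is running parallel_ctx as a single process instead of through fastai.launch. My understanding (an assumption I haven’t verified) is that nn.DataParallel is single-process multi-GPU, so launching two processes may be what ends up putting replicas and hidden state on mismatched devices. A minimal sketch of what I mean, run with plain python3 dist_simple.py:

# Hypothetical: single process, no fastai.launch; DataParallel drives both GPUs itself
learn_lm.model.cuda()          # keep the master copy of the model on cuda:0
with learn_lm.parallel_ctx():  # wraps the model in nn.DataParallel (my assumption)
    learn_lm.fit_one_cycle(1, 0.05)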