I’m having some trouble using multiple GPUs to train a language model on fastai v2. Following the documentation and the sample, I came up with this:
from fastai import *
from fastai.text import *
from fastai.text.all import *
from fastai.distributed import *

path = Path('/mnt/harddrive/text_files')
imdb = DataBlock(blocks=(TextBlock.from_folder(path, is_lm=True), CategoryBlock),
                 get_items=get_text_files,
                 get_y=parent_label,
                 splitter=RandomSplitter(valid_pct=0.2, seed=1))
dbunch = imdb.dataloaders(path, bs=64, seq_len=80, num_workers=0)
learn_lm = language_model_learner(dbunch, AWD_LSTM).to_fp16()
learn_lm.freeze()

with learn_lm.distrib_ctx():
    learn_lm.fit_one_cycle(1, 0.05, moms=(0.8,0.7,0.8))
When training with a smaller dataset, it trains properly most of the time; with a larger dataset, it always fails. The error is almost identical in both cases; the only part that ever changes is the “and [77] at entry 32” part:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [77] at entry 32
The difference between it working and not working can be as subtle as changing the seed on the RandomSplitter. I’ve added some debug lines and included the output below.
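For reference, these are roughly the debug lines I added to dist_simple.py (paraphrased; the exact prints may differ slightly, but the labels match the output below):

import torch
from fastai.torch_core import rank_distrib, num_distrib

# Each launched process reports the distributed context it sees
print(f"rank_distrib(): {rank_distrib()}")
print(f"num_distrib(): {num_distrib()}")
print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")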
First attempt, with seed: 1
python3 -m fastai.launch dist_simple.py
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 1
bs: 64
seq_len: 80
Learning
dbunch loaded
_
rank_distrib(): 1
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
epoch train_loss valid_loss time
0 4.363470 3.997428 00:00
Second attempt, with seed: 3
python3 -m fastai.launch dist_simple.py
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 3
bs: 64
seq_len: 80
Learning
dbunch loaded
_
rank_distrib(): 1
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
epoch train_loss valid_loss time
Traceback (most recent call last):███████████████████████████------------------------------------------------| 50.00% [2/4 00:00<00:00 4.7127]
File "/home/chess/project/training/dist_simple.py", line 51, in <module>
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7,0.8))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 101, in __iter__
for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 475, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
data = next(self.dataset_iter)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 110, in create_batches
yield from map(self.do_batch, self.chunkify(res))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 133, in do_batch
def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 132, in create_batch
def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in fa_collate
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in <listcomp>
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 47, in fa_collate
return (default_collate(t) if isinstance(b, _collate_types)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/torch_core.py", line 325, in __torch_function__
res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/tensor.py", line 995, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [77] at entry 32
Third attempt, with seed: 0. Everything is the same except the last line:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [67] at entry 32
Fourth attempt, an example of a failure with the batch size changed from 64 to 32; note that the “entry” in the stack error at the end changed from 32 to 16:
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 1
bs: 32
seq_len: 80
Learning
dbunch loaded
_
rank_distrib(): 1
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
epoch train_loss valid_loss time
Traceback (most recent call last):███████████████████████████████████████████████████------------------------| 75.00% [6/8 00:00<00:00 4.2810]
File "/home/chess/project/training/dist_simple.py", line 51, in <module>
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7,0.8))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 101, in __iter__
for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 475, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
data = next(self.dataset_iter)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 110, in create_batches
yield from map(self.do_batch, self.chunkify(res))
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 133, in do_batch
def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 132, in create_batch
def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in fa_collate
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 48, in <listcomp>
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/data/load.py", line 47, in fa_collate
return (default_collate(t) if isinstance(b, _collate_types)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/torch_core.py", line 325, in __torch_function__
res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/tensor.py", line 995, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [14] at entry 16
When I run it with a single GPU, it works 100% of the time:
python3 -m fastai.launch --gpus=0 dist_simple.py
Things I’ve tried:
- Different versions of the dataset: one with many smaller files, one with a few larger files, etc.
- Using this code instead of the DataBlock:
dbunch = TextDataLoaders.from_folder(path, is_lm=True, valid_pct=0.1)
- Different combinations of bs, seq_len, valid_pct, and num_workers.
- GrandparentSplitter instead of RandomSplitter.
- Adding drop_last=True to the dataloader (see the sketch after this list).
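For clarity, this is how I passed drop_last (a minimal sketch; I’m assuming extra keyword arguments to .dataloaders() get forwarded to the underlying fastai DataLoader):

# Assumption: drop_last is forwarded to the fastai DataLoader; it did not change the error for me
dbunch = imdb.dataloaders(path, bs=64, seq_len=80, num_workers=0, drop_last=True)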
My analysis:
Based on this post by sgugger, if there happens to be a remainder in the last batch (not enough data to fill the batch completely), it’s dropped:
I think this issue affects language models specifically, since classifiers pad the data when it doesn’t line up. I believe the remainder sometimes isn’t being dropped, which produces the “stack expects each tensor to be equal size” error when using a distributed language-model learner with multiple GPUs. That would also explain why the “entry” number in the error always falls at the boundary between the GPUs’ shares of the batch: it can be calculated from the batch size and the world size (number of GPUs) as entry = bs / world_size.
In the examples above:
world_size: 2, bs: 64 produces:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [77] at entry 32
world_size: 2, bs: 32 produces:
RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [14] at entry 16
This matches the math I see in the class DistributedDL(TfmdDL): section of the documentation:
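To double-check the short-batch theory, this is the kind of diagnostic I have in mind (a hypothetical check of my own, not something from the docs): scan the plain, non-distributed training DataLoader for any batch whose sequence length is shorter than seq_len.

# Hypothetical sanity check: look for a "short" batch that would break torch.stack
for i, (xb, yb) in enumerate(dbunch.train):
    if xb.shape[1] != 80:  # the seq_len used above
        print(f"batch {i}: x shape {tuple(xb.shape)} (shorter than seq_len)")

If the plain loader collates cleanly, the problem would be specific to how DistributedDL re-slices the samples across ranks.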
Am I on the right track? If my analysis is correct and drop_last isn’t being applied properly, any ideas on how I can address that?
Thanks!
Update: I’ve gone into python3.8/site-packages/fastai/data/load.py and hard-coded drop_last to always be True, but the error is unchanged.
Update 2: While I had seen this post before, I didn’t think it was related. After a deeper dive, I see @pierreguillou had the same problem last July with no solution:
So I tried with DataParallel instead of DistributedDataParallel:
with learn_lm.parallel_ctx():
    learn_lm.fit_one_cycle(1, 0.05)
but now I’m getting this:
python3 -m fastai.launch dist_simple.py
World Size: 2
_
Loading dbunch
_
valid_pct: 0.2
seed: 3
bs: 64
seq_len: 32
dbunch loaded
_
rank_distrib(): 0
num_distrib(): 2
torch.cuda.device_count(): 2
Learning
/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:30: UserWarning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
epoch train_loss valid_loss time
rank_distrib(): 1--------------------------------------------------------------------------------------------| 0.00% [0/9 00:00<00:00]
Learning
/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:30: UserWarning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
epoch train_loss valid_loss time
Traceback (most recent call last):---------------------------------------------------------------------------| 0.00% [0/10 00:00<00:00]
File "/home/chess/project/training/dist_simple.py", line 57, in <module>
learn_lm.fit_one_cycle(1, lr)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 184, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 169, in _do_one_batch
self.pred = self.model(*self.xb)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 153, in forward
raise RuntimeError("module must have its parameters and buffers "
RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cuda:1
Traceback (most recent call last):
File "/home/chess/project/training/dist_simple.py", line 57, in <module>
learn_lm.fit_one_cycle(1, lr)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/callback/schedule.py", line 112, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 211, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 202, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 196, in _do_epoch
self._do_epoch_train()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 188, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 166, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 184, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 160, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/learner.py", line 169, in _do_one_batch
self.pred = self.model(*self.xb)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 161, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/_utils.py", line 428, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/text/models/awdlstm.py", line 106, in forward
output, new_h = rnn(output, self.hidden[l])
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/fastai/text/models/awdlstm.py", line 53, in forward
return self.module(*args)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chess/project/environments2/fastai_latest_dist/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 581, in forward
result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:1 and hidden tensor at cuda:0
I don’t think there’s a reliable way to train a language model on v2 with multiple GPUs at the moment. Moving forward with single GPU training!
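If I ever revisit this, one thing I haven’t tried is running parallel_ctx as a single process instead of through fastai.launch. My understanding (an assumption I haven’t verified) is that nn.DataParallel is single-process multi-GPU, so launching two processes may be what ends up putting replicas and hidden state on mismatched devices. A minimal sketch of what I mean, run with plain python3 dist_simple.py:

# Hypothetical: single process, no fastai.launch; DataParallel drives both GPUs itself
learn_lm.model.cuda()          # keep the master copy of the model on cuda:0
with learn_lm.parallel_ctx():  # wraps the model in nn.DataParallel (my assumption)
    learn_lm.fit_one_cycle(1, 0.05)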