There are a lot of factors that play into the speedup you will see, e.g. dataset size, CPU speed, disk speed, dataloader complexity, etc. I’ve seen anything from 1.3x to 1.9x speedups across 2 GPUs. These were all using the fast.ai library, not pure PyTorch. In my experience you’ll see less of a speedup on very small datasets, likely because overhead makes up a larger portion of the processing time.
Here is an example from running 2 scripts on a private dataset of several thousand images using distributed training: `python -m fastai.launch --gpus 0,1 my_script.py`
1 GPU
2 GPU
Here is a sanitized sample of the script used in this distributed training benchmark.
#python -m fastai.launch --gpus 0,1 distributed_sample_code.py
from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from nbdev.showdoc import *
import json
import random
from fastai.distributed import *
#DATA
# Root folder that holds the dataset.
data_path = Path('./data/')

# Names of the label folders to include (placeholder values in this sanitized
# sample). The comprehension below reads `included_tileset_labels`, so the list
# has to be bound to that name; binding it to `labels` raised a NameError.
included_tileset_labels = [
    '...',
]
# Resolve every label name to its folder under <data_path>/images.
labels = [data_path/'images'/o for o in included_tileset_labels]

# Substrings that mark a file as validation data (placeholders here); they are
# matched against the full path string by `FileSplitter`.
validation_set = {'...', '...', '...'}

# Training image paths (placeholder) and the matching mask paths, derived by
# swapping the 'train_imgs' path component for 'label_imgs'.
fnames = L(['...'])
y_fnames = L([Path(str(o).replace('train_imgs', 'label_imgs')) for o in fnames])

# Class names for the segmentation masks; index 0 is 'nothing'.
codes = np.array(['nothing', 'class1', 'class2', 'class3'])
def FileSplitter():
    """Create a splitter that picks validation items from their file path.

    An item goes to the validation set when either
      * its file stem ends with the character '0', or
      * any entry of the module-level `validation_set` occurs as a substring
        of its full path.
    Everything else goes to the training set.

    Returns a callable `(items, **kwargs)` that applies `FuncSplitter` with the
    rule above; extra keyword arguments are accepted and ignored.
    """
    def _func(pth_in):
        # `endswith` also copes with an empty stem, where `stem[-1]` would raise.
        if pth_in.stem.endswith('0'):
            return True
        # Stringify the path once and stop at the first matching marker.
        pth_str = str(pth_in)
        return any(o in pth_str for o in validation_set)

    def _inner(o, **kwargs):
        return FuncSplitter(_func)(o)

    return _inner
def MyMaskBlock(codes=None):
    """Build a `TransformBlock` for segmentation masks, optionally with `codes`.

    Masks are opened with `PILMask.create`, tagged through `AddMaskCodes`, and
    passed through `IntToFloatTensor` at batch level.
    """
    mask_tfms = dict(
        type_tfms=PILMask.create,
        item_tfms=AddMaskCodes(codes=codes),
        batch_tfms=IntToFloatTensor,
    )
    return TransformBlock(**mask_tfms)
def get_my_files(x):
    """Return the module-level `fnames` list.

    The argument `x` is ignored: the file list is fixed ahead of time instead
    of being discovered from a source path.
    """
    return fnames
# Map each image path (as a string) to its position in `fnames`, so `get_y` can
# find the mask at the same position in `y_fnames` with one dict lookup.
img_fn_ix_dict = {str(fname): ix for ix, fname in enumerate(fnames)}
def get_y(img_pth):
    """Return the mask path paired with the image path `img_pth`.

    The image is located through `img_fn_ix_dict` (keyed by `str(path)`) and
    the entry at the same index of `y_fnames` is returned. Raises `KeyError`
    for a path that is not in the index.
    """
    ix = img_fn_ix_dict[str(img_pth)]
    return y_fnames[ix]
def _segmentation_dls(size, bs):
    """Build the segmentation `DataBlock` and its `DataLoaders` for one resolution.

    size: output size handed to `aug_transforms`.
    bs:   batch size handed to `DataBlock.dataloaders`.

    Returns the tuple `(datablock, dataloaders)`. Both resolutions share this
    pipeline; only `size` and `bs` differ between them.
    """
    batch_tfms = [
        *aug_transforms(mult=1.0, do_flip=True, flip_vert=True, max_rotate=0.0,
                        min_zoom=1.0, max_zoom=1.0, max_lighting=0.2, max_warp=0.0,
                        p_affine=0.75, p_lighting=0.5, xtra_tfms=None, size=size,
                        mode='bilinear', pad_mode='reflection', align_corners=True,
                        batch=False, min_scale=1.0),
        Normalize.from_stats(*imagenet_stats),
    ]
    db = DataBlock(blocks=(ImageBlock, MaskBlock(codes)),
                   get_items=get_my_files,
                   splitter=FileSplitter(),
                   get_y=get_y,
                   batch_tfms=batch_tfms)
    return db, db.dataloaders(fnames, bs=bs, path=Path('.'))


# 256px pipeline, batch size 64.
size_256 = 256
bs_256 = 64
sol_db_256, dls_256 = _segmentation_dls(size_256, bs_256)

# 128px pipeline, batch size 256.
bs_128, size_128 = 256, 128
sol_db_128, dls_128 = _segmentation_dls(size_128, bs_128)
#MODEL
# Label value the metrics below leave out of the accuracy computation.
void_code = 0


def _masked_accuracy(input, target, empty_value):
    """Accuracy over the pixels whose target differs from `void_code`.

    input:       model output; classes are read with `argmax(dim=1)`.
    target:      ground-truth labels; a size-1 axis at position 1 is squeezed.
    empty_value: result to return when no pixel is selected.

    Returns a 0-dim float tensor.
    """
    target = target.squeeze(1)
    mask = target != void_code
    ret = (input.argmax(dim=1)[mask] == target[mask]).float().mean()
    # The mean of an empty selection is NaN; swap it for the requested fallback.
    return ret.masked_fill(torch.isnan(ret), empty_value)


def acc_clss(input, target):
    """Non-void pixel accuracy; evaluates to 1.0 when the batch has no non-void pixel."""
    return _masked_accuracy(input, target, 1.)


def adj_clss(input, target):
    """Non-void pixel accuracy; evaluates to 0.0 when the batch has no non-void pixel."""
    return _masked_accuracy(input, target, 0.)
# NOTE(review): `acc_all` is not defined anywhere in this sample -- presumably
# dropped during sanitization; define or import it before running.
metrics = (acc_all, acc_clss)

# Weight decay used by every training stage.
wd = 1e-2

# Attach the class names to both sets of dataloaders.
dls_256.vocab = codes
dls_128.vocab = codes

# One loss weight per entry of `codes`, moved to the GPU.
class_weights = torch.FloatTensor([1., 256., 32., 2.]).cuda()
loss_fn = CrossEntropyLossFlat(axis=1, weight=class_weights)

# First learner: resnet34 U-Net with self-attention on the 128px data, in fp16.
learn = unet_learner(
    dls_128,
    resnet34,
    loss_func=loss_fn,
    metrics=metrics,
    self_attention=True,
).to_fp16()

# Base learning rate for the stages that follow.
lr = 4e-4
##### TRAINING
# Stage 1 @ 128px: 10 epochs, checkpointed as 'stage-1_128'.
cbs = [SaveModelCallback(fname='stage-1_128', at_end=False)]
with learn.distrib_ctx(sync_bn=False):
    learn.fit_one_cycle(10, slice(lr), pct_start=0.9, wd=wd, cbs=cbs)

# Drop the 128px learner and collect garbage before building the 256px one.
del learn
import gc
gc.collect()

# Stage 1 @ 256px: new learner on the larger images, starting from the 128px
# weights (optimizer state is not restored), trained at half the learning rate.
learn = unet_learner(
    dls_256,
    resnet34,
    loss_func=loss_fn,
    metrics=metrics,
    self_attention=True,
).to_fp16()
learn.load('stage-1_128', with_opt=False)
cbs = [SaveModelCallback(fname='stage-1_256', at_end=False)]
with learn.distrib_ctx(sync_bn=False):
    learn.fit_one_cycle(8, slice(lr/2), pct_start=0.9, wd=wd, cbs=cbs)

# Write the checkpoint explicitly, then reload the weights without optimizer state.
learn.save('stage-1_256')
learn.load('stage-1_256', with_opt=False)

# Stage 2 @ 256px: unfreeze and train with a learning-rate range.
learn.unfreeze()
lrs = slice(5e-6, 9e-5)
cbs = [SaveModelCallback(fname='stage-2_256', at_end=False)]
with learn.distrib_ctx(sync_bn=False):
    learn.fit_one_cycle(8, lrs, pct_start=0.8, wd=wd, cbs=cbs)