There are existing topics about a similar error, but they appear to involve other tools used alongside fast.ai, and the solutions proposed there haven't fixed the error I'm encountering. My model trains as long as I set num_workers=0, but as soon as I remove that setting and let the DataLoaders use multiple worker processes, I get the errors in the stack trace below: the AttributeError is repeated once for each worker process fast.ai spins up, followed by further errors from those workers crashing.
I've run some other, simpler code without any problems, so I suspect the issue is in how I'm defining my DataBlocks, but it isn't clear what needs to be added to make them multiprocessing-compatible.
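For reference, the "simpler code" that did run with the default number of workers was along the lines of the standard CamVid segmentation example with the built-in blocks. The sketch below is an approximation of that kind of pipeline rather than my exact test script:

from fastai.vision.all import *

# rough sketch, not my exact test code: built-in blocks, standard image files
camvid = untar_data(URLs.CAMVID_TINY)
codes = np.loadtxt(camvid/'codes.txt', dtype=str)

def camvid_label_func(o):
    # masks live in 'labels', named like the image with a '_P' suffix
    return camvid/'labels'/f'{o.stem}_P{o.suffix}'

camvid_block = DataBlock(blocks=(ImageBlock, MaskBlock(codes)),
                         get_items=get_image_files,
                         get_y=camvid_label_func,
                         splitter=RandomSplitter(seed=42))
camvid_dls = camvid_block.dataloaders(camvid/'images', bs=4)  # default num_workers
camvid_learn = unet_learner(camvid_dls, resnet34)
camvid_learn.fit(1)  # trains without the crash described below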
Package versions:
torch: 1.6.0
fast.ai: 2.0.13
python: 3.8.3 (have tried on 3.7 with the same result)
CUDA: 11.0.182
Code:
from fastai.vision.all import *


def _parent_idxs(items, name):
    """Gets the indexes for the items based on the parent folder name"""
    def _inner(items, name):
        return mask2idxs(Path(o).parent.name == name for o in items)
    return [i for n in L(name) for i in _inner(items, n)]


def ParentSplitter(train_name='train', valid_name='valid'):
    "Split `items` from the parent folder names (`train_name` and `valid_name`)."
    def _inner(o, **kwargs):
        return _parent_idxs(o, train_name), _parent_idxs(o, valid_name)
    return _inner
class TorchTensor(TensorBase):
    @classmethod
    def create(cls, fn: (Path, str)) -> None:
        # this reshaping is to make the data compatible with resnet34
        tens = torch.reshape(torch.load(fn)[1:4, :, :], (3, 512, 512)).cpu().clone()
        return cls(tens)


class TorchTensorMask(TensorMask):
    @classmethod
    def create(cls, fn: (Path, str)) -> None:
        tens = torch.load(fn).cpu().clone()
        return cls(tens)


def TensorBlock():
    return TransformBlock(type_tfms=TorchTensor.create, batch_tfms=IntToFloatTensor)


def TensorMaskBlock(codes=None):
    return TransformBlock(type_tfms=TorchTensorMask.create,
                          item_tfms=AddMaskCodes(codes=codes),
                          batch_tfms=IntToFloatTensor)
def get_msk(o):
    """Map an image path to its mask path by swapping the 'images' folder for 'masks'."""
    path_pieces = o.parts
    new_path = []
    for i in range(len(path_pieces)):
        if path_pieces[i] == 'images':
            new_path.append('masks')
            continue
        new_path.append(path_pieces[i])
    mask_path = os.sep.join(new_path)
    return Path(mask_path)
class custom_dice(Metric):
    "Dice coefficient metric (DSC) class for a binary target in segmentation"
    def __init__(self, axis=1, object_class=1):
        self.axis = axis
        self.object_class = object_class

    def reset(self):
        self.inter, self.union = 0, 0

    def accumulate(self, learn):
        preds, targs = learn.pred, learn.y
        preds = torch.softmax(preds, dim=1).float()
        pred = preds[:, self.object_class, ...]
        targ = (targs == self.object_class).float()
        self.inter += (pred * targ).sum()
        self.union += (pred + targ).sum()

    @property
    def value(self):
        return 2. * self.inter / self.union if self.union > 0 else None
class m_dice(custom_dice):
    def __init__(self):
        super().__init__(object_class=1)
if __name__ == '__main__':
    # avoiding an error
    torch.multiprocessing.set_start_method('spawn')

    # directories for data
    path = "path to the data"

    datablock = DataBlock(
        blocks=(TensorBlock, TensorMaskBlock(codes=['mask'])),
        get_items=FileGetter(extensions='.pt', folders=['images']),
        get_y=get_msk,
        splitter=ParentSplitter(train_name='train_data', valid_name='test_data'))
    dataloaders = datablock.dataloaders(Path(path), bs=4)  # , num_workers=0)

    dice_1 = m_dice()
    modelsavepath = "save directory for model"
    modelsavedir = "fastai/"
    learn = unet_learner(dls=dataloaders, arch=resnet34, metrics=[foreground_acc, dice_1],
                         n_out=2, loss_func=CrossEntropyLossFlat(axis=1),
                         path=modelsavepath, model_dir=modelsavedir)
    learn.fit(1)
Error Stack Trace:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/anaconda3/envs/fastai/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/home/anaconda3/envs/fastai/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
AttributeError: '_FakeLoader' object has no attribute 'noops'
0 nan 01:40
Traceback (most recent call last):
File "/home/anaconda3/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 779, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/anaconda3/envs/fastai/lib/python3.8/multiprocessing/queues.py", line 107, in get
if not self._poll(timeout):
File "/home/anaconda3/envs/fastai/lib/python3.8/multiprocessing/connection.py", line 257, in poll
return self._poll(timeout)
File "/home/anaconda3/envs/fastai/lib/python3.8/multiprocessing/connection.py", line 424, in _poll
r = wait([self], timeout)
File "/home/anaconda3/envs/fastai/lib/python3.8/multiprocessing/connection.py", line 931, in wait
ready = selector.select(timeout)
File "/home/anaconda3/envs/fastai/lib/python3.8/selectors.py", line 415, in select
fd_event_list = self._selector.poll(timeout)
File "/home/anaconda3/envs/fastai/lib/python3.8/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 1473) exited unexpectedly with exit code 1. Details are lost due to multiprocessing.
Rerunning with num_workers=0 may give better error trace.
Setting num_workers=2 explicitly results in the same errors as leaving num_workers at its default.
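Concretely, these are the variants of the dataloaders call above that I have tried (only the first one trains; the other two produce the trace above):

dataloaders = datablock.dataloaders(Path(path), bs=4, num_workers=0)  # trains fine
dataloaders = datablock.dataloaders(Path(path), bs=4, num_workers=2)  # workers crash
dataloaders = datablock.dataloaders(Path(path), bs=4)                 # default workers, same crash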
Has anyone got an idea of what might be going wrong?