ResNet training crashes with EOFError in the second epoch

Hello,

I’m playing around with fast.ai in the area of document recognition. My model is pretty simple:

documents = DataBlock(
    blocks = (ImageBlock, CategoryBlock),
    get_items = get_image_files,
    splitter = GrandparentSplitter(train_name = 'train', valid_name = 'val'),
    get_y = parent_label,
    item_tfms = Resize(size=480, method='Crop')
)
dls = documents.dataloaders(path)

learn = vision_learner(dls, resnet34, metrics=error_rate)

learn.fit_one_cycle(3, 1e-3)

I’m running my code on Paperspace and have tried different machines (up to an A100), but it always crashes during the second epoch with this error message:

---------------------------------------------------------------------------
EOFError                                  Traceback (most recent call last)
Input In [17], in <cell line: 1>()
----> 1 learn.fit_one_cycle(3, 1e-3)

File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
    116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
    117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    118           'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:264, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
    262 self.opt.set_hypers(lr=self.lr if lr is None else lr)
    263 self.n_epoch = n_epoch
--> 264 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner._with_events(self, f, event_type, ex, final)
    198 def _with_events(self, f, event_type, ex, final=noop):
--> 199     try: self(f'before_{event_type}');  f()
    200     except ex: self(f'after_cancel_{event_type}')
    201     self(f'after_{event_type}');  final()

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:253, in Learner._do_fit(self)
    251 for epoch in range(self.n_epoch):
    252     self.epoch=epoch
--> 253     self._with_events(self._do_epoch, 'epoch', CancelEpochException)

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner._with_events(self, f, event_type, ex, final)
    198 def _with_events(self, f, event_type, ex, final=noop):
--> 199     try: self(f'before_{event_type}');  f()
    200     except ex: self(f'after_cancel_{event_type}')
    201     self(f'after_{event_type}');  final()

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:247, in Learner._do_epoch(self)
    246 def _do_epoch(self):
--> 247     self._do_epoch_train()
    248     self._do_epoch_validate()

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:239, in Learner._do_epoch_train(self)
    237 def _do_epoch_train(self):
    238     self.dl = self.dls.train
--> 239     self._with_events(self.all_batches, 'train', CancelTrainException)

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner._with_events(self, f, event_type, ex, final)
    198 def _with_events(self, f, event_type, ex, final=noop):
--> 199     try: self(f'before_{event_type}');  f()
    200     except ex: self(f'after_cancel_{event_type}')
    201     self(f'after_{event_type}');  final()

File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:205, in Learner.all_batches(self)
    203 def all_batches(self):
    204     self.n_iter = len(self.dl)
--> 205     for o in enumerate(self.dl): self.one_batch(*o)

File /usr/local/lib/python3.9/dist-packages/fastai/data/load.py:127, in DataLoader.__iter__(self)
    125 self.before_iter()
    126 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 127 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
    128     # pin_memory causes tuples to be converted to lists, so convert them back to tuples
    129     if self.pin_memory and type(b) == list: b = tuple(b)
    130     if self.device is not None: b = to_device(b, self.device)

File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:652, in _BaseDataLoaderIter.__next__(self)
    649 if self._sampler_iter is None:
    650     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    651     self._reset()  # type: ignore[call-arg]
--> 652 data = self._next_data()
    653 self._num_yielded += 1
    654 if self._dataset_kind == _DatasetKind.Iterable and \
    655         self._IterableDataset_len_called is not None and \
    656         self._num_yielded > self._IterableDataset_len_called:

File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1330, in _MultiProcessingDataLoaderIter._next_data(self)
   1327     return self._process_data(data)
   1329 assert not self._shutdown and self._tasks_outstanding > 0
-> 1330 idx, data = self._get_data()
   1331 self._tasks_outstanding -= 1
   1332 if self._dataset_kind == _DatasetKind.Iterable:
   1333     # Check for _IterableDatasetStopIteration

File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1296, in _MultiProcessingDataLoaderIter._get_data(self)
   1292     # In this case, `self._data_queue` is a `queue.Queue`,. But we don't
   1293     # need to call `.task_done()` because we don't use `.join()`.
   1294 else:
   1295     while True:
-> 1296         success, data = self._try_get_data()
   1297         if success:
   1298             return data

File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1134, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout)
   1121 def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
   1122     # Tries to fetch data from `self._data_queue` once for a given timeout.
   1123     # This can also be used as inner loop of fetching without timeout, with
   (...)
   1131     # Returns a 2-tuple:
   1132     #   (bool: whether successfully get data, any: data if successful else None)
   1133     try:
-> 1134         data = self._data_queue.get(timeout=timeout)
   1135         return (True, data)
   1136     except Exception as e:
   1137         # At timeout and error, we manually check whether any worker has
   1138         # failed. Note that this is the only mechanism for Windows to detect
   1139         # worker failures.

File /usr/lib/python3.9/multiprocessing/queues.py:122, in Queue.get(self, block, timeout)
    120         self._rlock.release()
    121 # unserialize the data after having released the lock
--> 122 return _ForkingPickler.loads(res)

File /usr/local/lib/python3.9/dist-packages/torch/multiprocessing/reductions.py:297, in rebuild_storage_fd(cls, df, size)
    296 def rebuild_storage_fd(cls, df, size):
--> 297     fd = df.detach()
    298     try:
    299         storage = storage_from_cache(cls, fd_id(fd))

File /usr/lib/python3.9/multiprocessing/resource_sharer.py:58, in DupFd.detach(self)
     56 '''Get the fd.  This should only be called once.'''
     57 with _resource_sharer.get_connection(self._id) as conn:
---> 58     return reduction.recv_handle(conn)

File /usr/lib/python3.9/multiprocessing/reduction.py:189, in recv_handle(conn)
    187 '''Receive a handle over a local connection.'''
    188 with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
--> 189     return recvfds(s, 1)[0]

File /usr/lib/python3.9/multiprocessing/reduction.py:159, in recvfds(sock, size)
    157 msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_SPACE(bytes_size))
    158 if not msg and not ancdata:
--> 159     raise EOFError
    160 try:
    161     if ACKNOWLEDGE:

EOFError: 

I can’t make any sense of this message. Has anyone seen this kind of issue before?