Hello,
I’m playing around with fast.ai in the area of document recognition. My model is pretty simple:
# Build the DataBlock: images labelled by their parent folder name,
# split into train/valid by the grandparent folder (train/ vs val/).
documents = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    splitter=GrandparentSplitter(train_name='train', valid_name='val'),
    get_y=parent_label,
    # fastai's ResizeMethod strings are lowercase ('crop', 'pad', 'squish');
    # the original 'Crop' (capitalized) does not match ResizeMethod.Crop.
    item_tfms=Resize(size=480, method='crop'),
)

# BUG in the original: `dls` was passed to vision_learner but never created.
# Build the DataLoaders from the DataBlock first.
# NOTE(review): `path` must be the dataset root containing train/ and val/ —
# confirm against your directory layout.
# num_workers=0 avoids the EOFError in the pasted traceback: it is raised
# while a DataLoader *worker subprocess* hands a tensor back to the main
# process (multiprocessing recvfds), which typically means a worker died —
# commonly from exhausted shared memory (/dev/shm) on containerized hosts
# such as Paperspace. Alternatively keep workers and raise shm size.
dls = documents.dataloaders(path, num_workers=0)

learn = vision_learner(dls, resnet34, metrics=error_rate)
learn.fit_one_cycle(3, 1e-3)
I’m running my code on Paperspace and have tried different machines (up to an A100), but it always crashes during the second epoch with this error message:
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
Input In [17], in <cell line: 1>()
----> 1 learn.fit_one_cycle(3, 1e-3)
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
118 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:264, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
262 self.opt.set_hypers(lr=self.lr if lr is None else lr)
263 self.n_epoch = n_epoch
--> 264 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner._with_events(self, f, event_type, ex, final)
198 def _with_events(self, f, event_type, ex, final=noop):
--> 199 try: self(f'before_{event_type}'); f()
200 except ex: self(f'after_cancel_{event_type}')
201 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:253, in Learner._do_fit(self)
251 for epoch in range(self.n_epoch):
252 self.epoch=epoch
--> 253 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner._with_events(self, f, event_type, ex, final)
198 def _with_events(self, f, event_type, ex, final=noop):
--> 199 try: self(f'before_{event_type}'); f()
200 except ex: self(f'after_cancel_{event_type}')
201 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:247, in Learner._do_epoch(self)
246 def _do_epoch(self):
--> 247 self._do_epoch_train()
248 self._do_epoch_validate()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:239, in Learner._do_epoch_train(self)
237 def _do_epoch_train(self):
238 self.dl = self.dls.train
--> 239 self._with_events(self.all_batches, 'train', CancelTrainException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner._with_events(self, f, event_type, ex, final)
198 def _with_events(self, f, event_type, ex, final=noop):
--> 199 try: self(f'before_{event_type}'); f()
200 except ex: self(f'after_cancel_{event_type}')
201 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:205, in Learner.all_batches(self)
203 def all_batches(self):
204 self.n_iter = len(self.dl)
--> 205 for o in enumerate(self.dl): self.one_batch(*o)
File /usr/local/lib/python3.9/dist-packages/fastai/data/load.py:127, in DataLoader.__iter__(self)
125 self.before_iter()
126 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 127 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
128 # pin_memory causes tuples to be converted to lists, so convert them back to tuples
129 if self.pin_memory and type(b) == list: b = tuple(b)
130 if self.device is not None: b = to_device(b, self.device)
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:652, in _BaseDataLoaderIter.__next__(self)
649 if self._sampler_iter is None:
650 # TODO(https://github.com/pytorch/pytorch/issues/76750)
651 self._reset() # type: ignore[call-arg]
--> 652 data = self._next_data()
653 self._num_yielded += 1
654 if self._dataset_kind == _DatasetKind.Iterable and \
655 self._IterableDataset_len_called is not None and \
656 self._num_yielded > self._IterableDataset_len_called:
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1330, in _MultiProcessingDataLoaderIter._next_data(self)
1327 return self._process_data(data)
1329 assert not self._shutdown and self._tasks_outstanding > 0
-> 1330 idx, data = self._get_data()
1331 self._tasks_outstanding -= 1
1332 if self._dataset_kind == _DatasetKind.Iterable:
1333 # Check for _IterableDatasetStopIteration
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1296, in _MultiProcessingDataLoaderIter._get_data(self)
1292 # In this case, `self._data_queue` is a `queue.Queue`,. But we don't
1293 # need to call `.task_done()` because we don't use `.join()`.
1294 else:
1295 while True:
-> 1296 success, data = self._try_get_data()
1297 if success:
1298 return data
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1134, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout)
1121 def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
1122 # Tries to fetch data from `self._data_queue` once for a given timeout.
1123 # This can also be used as inner loop of fetching without timeout, with
(...)
1131 # Returns a 2-tuple:
1132 # (bool: whether successfully get data, any: data if successful else None)
1133 try:
-> 1134 data = self._data_queue.get(timeout=timeout)
1135 return (True, data)
1136 except Exception as e:
1137 # At timeout and error, we manually check whether any worker has
1138 # failed. Note that this is the only mechanism for Windows to detect
1139 # worker failures.
File /usr/lib/python3.9/multiprocessing/queues.py:122, in Queue.get(self, block, timeout)
120 self._rlock.release()
121 # unserialize the data after having released the lock
--> 122 return _ForkingPickler.loads(res)
File /usr/local/lib/python3.9/dist-packages/torch/multiprocessing/reductions.py:297, in rebuild_storage_fd(cls, df, size)
296 def rebuild_storage_fd(cls, df, size):
--> 297 fd = df.detach()
298 try:
299 storage = storage_from_cache(cls, fd_id(fd))
File /usr/lib/python3.9/multiprocessing/resource_sharer.py:58, in DupFd.detach(self)
56 '''Get the fd. This should only be called once.'''
57 with _resource_sharer.get_connection(self._id) as conn:
---> 58 return reduction.recv_handle(conn)
File /usr/lib/python3.9/multiprocessing/reduction.py:189, in recv_handle(conn)
187 '''Receive a handle over a local connection.'''
188 with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
--> 189 return recvfds(s, 1)[0]
File /usr/lib/python3.9/multiprocessing/reduction.py:159, in recvfds(sock, size)
157 msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_SPACE(bytes_size))
158 if not msg and not ancdata:
--> 159 raise EOFError
160 try:
161 if ACKNOWLEDGE:
EOFError:
I can’t make any sense of this message. Has anyone seen this kind of issue before?