I am trying to train a model using a vision_learner on a few thousand images. I have tried several different networks, different learning rates, and different machines: both my own physical machine and several different machines on Paperspace (I thought the errors were related to multiprocessing). When I call fine_tune, it sometimes runs one whole epoch, sometimes only a few batches, and sometimes fails immediately. It never completes all epochs before failing. It gives at least three different errors, but not systematically (as far as I can tell so far). The error messages themselves ("unable to resize file", EOFError, and "Bad file descriptor"; see the stack traces below) all suggest that something is wrong with my image files, and indeed when I run on a smaller subset it seems to run fine. However, I could use help interpreting what the errors actually mean, and I desperately need suggestions for how I can track down the cause(s):
Error stack 1:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [21], in <cell line: 1>()
----> 1 train(arch, dls)
Input In [20], in train(arch, dls, accum, finetune, epochs)
4 learn = vision_learner(dls, arch, metrics=error_rate, cbs=cbs, path='tifs3').to_fp16()
5 if finetune:
----> 6 learn.fine_tune(epochs, lr)
7 #return learn.tta(dl=dls.test_dl(djurs_test))
8 else:
9 learn.unfreeze()
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:165, in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
163 "Fine tune with `Learner.freeze` for `freeze_epochs`, then with `Learner.unfreeze` for `epochs`, using discriminative LR."
164 self.freeze()
--> 165 self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
166 base_lr /= 2
167 self.unfreeze()
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
118 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:256, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
254 self.opt.set_hypers(lr=self.lr if lr is None else lr)
255 self.n_epoch = n_epoch
--> 256 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:245, in Learner._do_fit(self)
243 for epoch in range(self.n_epoch):
244 self.epoch=epoch
--> 245 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:239, in Learner._do_epoch(self)
238 def _do_epoch(self):
--> 239 self._do_epoch_train()
240 self._do_epoch_validate()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:231, in Learner._do_epoch_train(self)
229 def _do_epoch_train(self):
230 self.dl = self.dls.train
--> 231 self._with_events(self.all_batches, 'train', CancelTrainException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner.all_batches(self)
197 def all_batches(self):
198 self.n_iter = len(self.dl)
--> 199 for o in enumerate(self.dl): self.one_batch(*o)
File /usr/local/lib/python3.9/dist-packages/fastai/data/load.py:129, in DataLoader.__iter__(self)
127 self.before_iter()
128 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 129 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
130 # pin_memory causes tuples to be converted to lists, so convert them back to tuples
131 if self.pin_memory and type(b) == list: b = tuple(b)
132 if self.device is not None: b = to_device(b, self.device)
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:652, in _BaseDataLoaderIter.__next__(self)
649 if self._sampler_iter is None:
650 # TODO(https://github.com/pytorch/pytorch/issues/76750)
651 self._reset() # type: ignore[call-arg]
--> 652 data = self._next_data()
653 self._num_yielded += 1
654 if self._dataset_kind == _DatasetKind.Iterable and \
655 self._IterableDataset_len_called is not None and \
656 self._num_yielded > self._IterableDataset_len_called:
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1330, in _MultiProcessingDataLoaderIter._next_data(self)
1327 return self._process_data(data)
1329 assert not self._shutdown and self._tasks_outstanding > 0
-> 1330 idx, data = self._get_data()
1331 self._tasks_outstanding -= 1
1332 if self._dataset_kind == _DatasetKind.Iterable:
1333 # Check for _IterableDatasetStopIteration
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1296, in _MultiProcessingDataLoaderIter._get_data(self)
1292 # In this case, `self._data_queue` is a `queue.Queue`,. But we don't
1293 # need to call `.task_done()` because we don't use `.join()`.
1294 else:
1295 while True:
-> 1296 success, data = self._try_get_data()
1297 if success:
1298 return data
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1134, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout)
1121 def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
1122 # Tries to fetch data from `self._data_queue` once for a given timeout.
1123 # This can also be used as inner loop of fetching without timeout, with
(...)
1131 # Returns a 2-tuple:
1132 # (bool: whether successfully get data, any: data if successful else None)
1133 try:
-> 1134 data = self._data_queue.get(timeout=timeout)
1135 return (True, data)
1136 except Exception as e:
1137 # At timeout and error, we manually check whether any worker has
1138 # failed. Note that this is the only mechanism for Windows to detect
1139 # worker failures.
File /usr/lib/python3.9/multiprocessing/queues.py:122, in Queue.get(self, block, timeout)
120 self._rlock.release()
121 # unserialize the data after having released the lock
--> 122 return _ForkingPickler.loads(res)
File /usr/local/lib/python3.9/dist-packages/torch/multiprocessing/reductions.py:302, in rebuild_storage_fd(cls, df, size)
300 if storage is not None:
301 return storage
--> 302 storage = cls._new_shared_fd_cpu(fd, size)
303 shared_cache[fd_id(fd)] = StorageWeakRef(storage)
304 return storage
RuntimeError: unable to resize file <filename not specified> to the right size: Invalid argument (22)
Error stack 2:
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
Input In [22], in <cell line: 1>()
----> 1 train(arch, dls)
Input In [20], in train(arch, dls, accum, finetune, epochs)
4 learn = vision_learner(dls, arch, metrics=error_rate, cbs=cbs, path='tifs3').to_fp16()
5 if finetune:
----> 6 learn.fine_tune(epochs, lr)
7 #return learn.tta(dl=dls.test_dl(djurs_test))
8 else:
9 learn.unfreeze()
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:165, in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
163 "Fine tune with `Learner.freeze` for `freeze_epochs`, then with `Learner.unfreeze` for `epochs`, using discriminative LR."
164 self.freeze()
--> 165 self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
166 base_lr /= 2
167 self.unfreeze()
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
118 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:256, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
254 self.opt.set_hypers(lr=self.lr if lr is None else lr)
255 self.n_epoch = n_epoch
--> 256 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:245, in Learner._do_fit(self)
243 for epoch in range(self.n_epoch):
244 self.epoch=epoch
--> 245 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:239, in Learner._do_epoch(self)
238 def _do_epoch(self):
--> 239 self._do_epoch_train()
240 self._do_epoch_validate()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:231, in Learner._do_epoch_train(self)
229 def _do_epoch_train(self):
230 self.dl = self.dls.train
--> 231 self._with_events(self.all_batches, 'train', CancelTrainException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner.all_batches(self)
197 def all_batches(self):
198 self.n_iter = len(self.dl)
--> 199 for o in enumerate(self.dl): self.one_batch(*o)
File /usr/local/lib/python3.9/dist-packages/fastai/data/load.py:129, in DataLoader.__iter__(self)
127 self.before_iter()
128 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 129 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
130 # pin_memory causes tuples to be converted to lists, so convert them back to tuples
131 if self.pin_memory and type(b) == list: b = tuple(b)
132 if self.device is not None: b = to_device(b, self.device)
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:652, in _BaseDataLoaderIter.__next__(self)
649 if self._sampler_iter is None:
650 # TODO(https://github.com/pytorch/pytorch/issues/76750)
651 self._reset() # type: ignore[call-arg]
--> 652 data = self._next_data()
653 self._num_yielded += 1
654 if self._dataset_kind == _DatasetKind.Iterable and \
655 self._IterableDataset_len_called is not None and \
656 self._num_yielded > self._IterableDataset_len_called:
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1330, in _MultiProcessingDataLoaderIter._next_data(self)
1327 return self._process_data(data)
1329 assert not self._shutdown and self._tasks_outstanding > 0
-> 1330 idx, data = self._get_data()
1331 self._tasks_outstanding -= 1
1332 if self._dataset_kind == _DatasetKind.Iterable:
1333 # Check for _IterableDatasetStopIteration
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1296, in _MultiProcessingDataLoaderIter._get_data(self)
1292 # In this case, `self._data_queue` is a `queue.Queue`,. But we don't
1293 # need to call `.task_done()` because we don't use `.join()`.
1294 else:
1295 while True:
-> 1296 success, data = self._try_get_data()
1297 if success:
1298 return data
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py:1134, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout)
1121 def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
1122 # Tries to fetch data from `self._data_queue` once for a given timeout.
1123 # This can also be used as inner loop of fetching without timeout, with
(...)
1131 # Returns a 2-tuple:
1132 # (bool: whether successfully get data, any: data if successful else None)
1133 try:
-> 1134 data = self._data_queue.get(timeout=timeout)
1135 return (True, data)
1136 except Exception as e:
1137 # At timeout and error, we manually check whether any worker has
1138 # failed. Note that this is the only mechanism for Windows to detect
1139 # worker failures.
File /usr/lib/python3.9/multiprocessing/queues.py:122, in Queue.get(self, block, timeout)
120 self._rlock.release()
121 # unserialize the data after having released the lock
--> 122 return _ForkingPickler.loads(res)
File /usr/local/lib/python3.9/dist-packages/torch/multiprocessing/reductions.py:297, in rebuild_storage_fd(cls, df, size)
296 def rebuild_storage_fd(cls, df, size):
--> 297 fd = df.detach()
298 try:
299 storage = storage_from_cache(cls, fd_id(fd))
File /usr/lib/python3.9/multiprocessing/resource_sharer.py:58, in DupFd.detach(self)
56 '''Get the fd. This should only be called once.'''
57 with _resource_sharer.get_connection(self._id) as conn:
---> 58 return reduction.recv_handle(conn)
File /usr/lib/python3.9/multiprocessing/reduction.py:189, in recv_handle(conn)
187 '''Receive a handle over a local connection.'''
188 with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
--> 189 return recvfds(s, 1)[0]
File /usr/lib/python3.9/multiprocessing/reduction.py:159, in recvfds(sock, size)
157 msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_SPACE(bytes_size))
158 if not msg and not ancdata:
--> 159 raise EOFError
160 try:
161 if ACKNOWLEDGE:
EOFError:
Error stack 3:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/connection.py", line 593, in __init__
self._socket.setsockopt(socket.SOL_SOCKET,
OSError: [Errno 9] Bad file descriptor
Bad file descriptor (src/signaler.cpp:184)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [9], in <cell line: 1>()
----> 1 train(arch, dls)
Input In [8], in train(arch, dls, accum, finetune, epochs)
4 learn = vision_learner(dls, arch, metrics=error_rate, cbs=cbs, path='tifs3').to_fp16()
5 if finetune:
----> 6 learn.fine_tune(epochs, lr)
7 #return learn.tta(dl=dls.test_dl(djurs_test))
8 else:
9 learn.unfreeze()
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:165, in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
163 "Fine tune with `Learner.freeze` for `freeze_epochs`, then with `Learner.unfreeze` for `epochs`, using discriminative LR."
164 self.freeze()
--> 165 self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
166 base_lr /= 2
167 self.unfreeze()
File /usr/local/lib/python3.9/dist-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
118 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:256, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
254 self.opt.set_hypers(lr=self.lr if lr is None else lr)
255 self.n_epoch = n_epoch
--> 256 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:245, in Learner._do_fit(self)
243 for epoch in range(self.n_epoch):
244 self.epoch=epoch
--> 245 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:240, in Learner._do_epoch(self)
238 def _do_epoch(self):
239 self._do_epoch_train()
--> 240 self._do_epoch_validate()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:236, in Learner._do_epoch_validate(self, ds_idx, dl)
234 if dl is None: dl = self.dls[ds_idx]
235 self.dl = dl
--> 236 with torch.no_grad(): self._with_events(self.all_batches, 'validate', CancelValidException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:199, in Learner.all_batches(self)
197 def all_batches(self):
198 self.n_iter = len(self.dl)
--> 199 for o in enumerate(self.dl): self.one_batch(*o)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:227, in Learner.one_batch(self, i, b)
225 b = self._set_device(b)
226 self._split(b)
--> 227 self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:195, in Learner._with_events(self, f, event_type, ex, final)
193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
--> 195 self(f'after_{event_type}'); final()
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:171, in Learner.__call__(self, event_name)
--> 171 def __call__(self, event_name): L(event_name).map(self._call_one)
File /usr/local/lib/python3.9/dist-packages/fastcore/foundation.py:156, in L.map(self, f, gen, *args, **kwargs)
--> 156 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:840, in map_ex(iterable, f, gen, *args, **kwargs)
838 res = map(g, iterable)
839 if gen: return res
--> 840 return list(res)
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:825, in bind.__call__(self, *args, **kwargs)
823 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
824 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 825 return self.func(*fargs, **kwargs)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:175, in Learner._call_one(self, event_name)
173 def _call_one(self, event_name):
174 if not hasattr(event, event_name): raise Exception(f'missing {event_name}')
--> 175 for cb in self.cbs.sorted('order'): cb(event_name)
File /usr/local/lib/python3.9/dist-packages/fastai/callback/core.py:62, in Callback.__call__(self, event_name)
60 try: res = getcallable(self, event_name)()
61 except (CancelBatchException, CancelBackwardException, CancelEpochException, CancelFitException, CancelStepException, CancelTrainException, CancelValidException): raise
---> 62 except Exception as e: raise modify_exception(e, f'Exception occured in `{self.__class__.__name__}` when calling event `{event_name}`:\n\t{e.args[0]}', replace=True)
63 if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
64 return res
File /usr/local/lib/python3.9/dist-packages/fastai/callback/core.py:60, in Callback.__call__(self, event_name)
58 res = None
59 if self.run and _run:
---> 60 try: res = getcallable(self, event_name)()
61 except (CancelBatchException, CancelBackwardException, CancelEpochException, CancelFitException, CancelStepException, CancelTrainException, CancelValidException): raise
62 except Exception as e: raise modify_exception(e, f'Exception occured in `{self.__class__.__name__}` when calling event `{event_name}`:\n\t{e.args[0]}', replace=True)
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:541, in Recorder.after_batch(self)
539 if len(self.yb) == 0: return
540 mets = self._train_mets if self.training else self._valid_mets
--> 541 for met in mets: met.accumulate(self.learn)
542 if not self.training: return
543 self.lrs.append(self.opt.hypers[-1]['lr'])
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:476, in AvgLoss.accumulate(self, learn)
474 def accumulate(self, learn):
475 bs = find_bs(learn.yb)
--> 476 self.total += learn.to_detach(learn.loss.mean())*bs
477 self.count += bs
File /usr/local/lib/python3.9/dist-packages/fastai/learner.py:336, in Learner.to_detach(self, b, cpu, gather)
335 def to_detach(self,b,cpu=True,gather=True):
--> 336 return self.dl.to_detach(b,cpu,gather) if hasattr(getattr(self,'dl',None),'to_detach') else to_detach(b,cpu,gather)
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:242, in to_detach(b, cpu, gather)
240 if gather: x = maybe_gather(x)
241 return x.cpu() if cpu else x
--> 242 return apply(_inner, b, cpu=cpu, gather=gather)
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:222, in apply(func, x, *args, **kwargs)
220 if is_listy(x): return type(x)([apply(func, o, *args, **kwargs) for o in x])
221 if isinstance(x,dict): return {k: apply(func, v, *args, **kwargs) for k,v in x.items()}
--> 222 res = func(x, *args, **kwargs)
223 return res if x is None else retain_type(res, x)
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:241, in to_detach.<locals>._inner(x, cpu, gather)
239 x = x.detach()
240 if gather: x = maybe_gather(x)
--> 241 return x.cpu() if cpu else x
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:376, in TensorBase.__torch_function__(cls, func, types, args, kwargs)
374 if cls.debug and func.__name__ not in ('__str__','__repr__'): print(func, types, args, kwargs)
375 if _torch_handled(args, cls._opt, func): types = (torch.Tensor,)
--> 376 res = super().__torch_function__(func, types, args, ifnone(kwargs, {}))
377 dict_objs = _find_args(args) if args else _find_args(list(kwargs.values()))
378 if issubclass(type(res),TensorBase) and dict_objs: res.set_meta(dict_objs[0],as_copy=True)
File /usr/local/lib/python3.9/dist-packages/torch/_tensor.py:1121, in Tensor.__torch_function__(cls, func, types, args, kwargs)
1118 return NotImplemented
1120 with _C.DisableTorchFunction():
-> 1121 ret = func(*args, **kwargs)
1122 if func in get_default_nowrap_functions():
1123 return ret
File /usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/signal_handling.py:66, in _set_SIGCHLD_handler.<locals>.handler(signum, frame)
63 def handler(signum, frame):
64 # This following call uses `waitid` with WNOHANG from C side. Therefore,
65 # Python can still get and update the process status successfully.
---> 66 _error_if_any_worker_fails()
67 if previous_handler is not None:
68 assert callable(previous_handler)
RuntimeError: Exception occured in `Recorder` when calling event `after_batch`:
DataLoader worker (pid 209) is killed by signal: Aborted.