Hey,
I’m trying to create my own callback for reporting experiment stats to ClearML (based on the existing Wandb callback, so this probably happens there too).
In the callback’s before_fit, I’m trying to run:
test_items = [getattr(self.dls.valid_ds.items, 'iloc', self.dls.valid_ds.items)[i] for i in idxs]
self.valid_dl = self.dls.test_dl(test_items, with_labels=True)
exactly as it is done in the Wandb callback.
When running distributed training, I’m getting the following error:
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/ubuntu/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/ubuntu/.vscode-server/extensions/ms-python.python-2021.12.1559732655/pythonFiles/lib/python/debugpy/__main__.py", line 45, in <module>
cli.main()
File "/home/ubuntu/.vscode-server/extensions/ms-python.python-2021.12.1559732655/pythonFiles/lib/python/debugpy/../debugpy/server/cli.py", line 444, in main
run()
File "/home/ubuntu/.vscode-server/extensions/ms-python.python-2021.12.1559732655/pythonFiles/lib/python/debugpy/../debugpy/server/cli.py", line 285, in run_file
runpy.run_path(target_as_str, run_name=compat.force_str("__main__"))
File "/home/ubuntu/anaconda3/lib/python3.7/runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "/home/ubuntu/anaconda3/lib/python3.7/runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "/home/ubuntu/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "distrib_test.py", line 36, in <module>
learn.fine_tune(10, 1e-3, cbs=[ClearMLCallback(), SaveModelCallback()])
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/callback/schedule.py", line 161, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/callback/schedule.py", line 116, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/learner.py", line 221, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/learner.py", line 163, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/learner.py", line 141, in __call__
def __call__(self, event_name): L(event_name).map(self._call_one)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastcore/foundation.py", line 155, in map
def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastcore/basics.py", line 698, in map_ex
return list(res)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastcore/basics.py", line 683, in __call__
return self.func(*fargs, **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/learner.py", line 145, in _call_one
for cb in self.cbs.sorted('order'): cb(event_name)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/callback/core.py", line 45, in __call__
if self.run and _run: res = getattr(self, event_name, noop)()
File "/home/ubuntu/wkdir/sagemaker_fastai_classifiers/basic_classifier/services/ClearMLCallback.py", line 88, in before_fit
self.valid_dl = self.dls.test_dl(test_items, with_labels=True)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/data/core.py", line 401, in test_dl
return self.valid.new(test_ds, **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/data/core.py", line 63, in new
res = super().new(dataset, cls, do_setup=False, **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/data/load.py", line 123, in new
cur_kwargs = dict(dataset=dataset, num_workers=self.fake_l.num_workers, pin_memory=self.pin_memory, timeout=self.timeout,
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastcore/basics.py", line 389, in __getattr__
if attr is not None: return getattr(attr,k)
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastai/data/core.py", line 335, in __getattr__
def __getattr__(self,k): return gather_attrs(self, k, 'tls')
File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/fastcore/transform.py", line 165, in gather_attrs
if not res: raise AttributeError(k)
AttributeError: pin_memory
I believe the error occurs because of inconsistencies between DistributedDL and DataLoader, and that it originates at the line shown in the traceback (fastai/data/load.py, line 123, in new), which passes pin_memory=self.pin_memory.
That should probably be self.fake_l.pin_memory, but I couldn’t find any documentation on what the FakeLoader is for, so I’m not sure.
I tried changing the library code a bit, but that just exposes other inconsistencies. For example, on the following line it tries to instantiate the DistributedDL, but its constructor takes different arguments from the ones being passed.
Please help