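`SaveModelCallback` seems to cause pickle errors when training on multiple GPUs with `to_distributed()`.
Traceback (frames repeat, apparently because output from several distributed processes is interleaved):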
File "/home/turgutluk/fastai/fastai/basic_train.py", line 111, in fit
fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
File "/home/turgutluk/fastai/fastai/basic_train.py", line 111, in fit
finally: cb_handler.on_train_end(exception)
File "/home/turgutluk/fastai/fastai/callback.py", line 322, in on_train_end
finally: cb_handler.on_train_end(exception)
File "/home/turgutluk/fastai/fastai/callback.py", line 322, in on_train_end
finally: cb_handler.on_train_end(exception)
File "/home/turgutluk/fastai/fastai/callback.py", line 322, in on_train_end
self('train_end', exception=exception)
File "/home/turgutluk/fastai/fastai/callback.py", line 250, in __call__
self('train_end', exception=exception)
File "/home/turgutluk/fastai/fastai/callback.py", line 250, in __call__
self('train_end', exception=exception)
File "/home/turgutluk/fastai/fastai/callback.py", line 250, in __call__
for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
File "/home/turgutluk/fastai/fastai/callback.py", line 240, in _call_and_update
for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
File "/home/turgutluk/fastai/fastai/callback.py", line 240, in _call_and_update
for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
File "/home/turgutluk/fastai/fastai/callback.py", line 240, in _call_and_update
new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
File "/home/turgutluk/fastai/fastai/callbacks/tracker.py", line 104, in on_train_end
new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
File "/home/turgutluk/fastai/fastai/callbacks/tracker.py", line 104, in on_train_end
new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
File "/home/turgutluk/fastai/fastai/callbacks/tracker.py", line 104, in on_train_end
self.learn.load(f'{self.name}', purge=False)
File "/home/turgutluk/fastai/fastai/basic_train.py", line 264, in load
self.learn.load(f'{self.name}', purge=False)
File "/home/turgutluk/fastai/fastai/basic_train.py", line 264, in load
self.learn.load(f'{self.name}', purge=False)
File "/home/turgutluk/fastai/fastai/basic_train.py", line 264, in load
state = torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device)
File "/home/turgutluk/.conda/envs/my_fastai/lib/python3.7/site-packages/torch/serialization.py", line 368, in load
state = torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device)
File "/home/turgutluk/.conda/envs/my_fastai/lib/python3.7/site-packages/torch/serialization.py", line 368, in load
state = torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device)
File "/home/turgutluk/.conda/envs/my_fastai/lib/python3.7/site-packages/torch/serialization.py", line 368, in load
return _load(f, map_location, pickle_module)
File "/home/turgutluk/.conda/envs/my_fastai/lib/python3.7/site-packages/torch/serialization.py", line 532, in _load
return _load(f, map_location, pickle_module)
File "/home/turgutluk/.conda/envs/my_fastai/lib/python3.7/site-packages/torch/serialization.py", line 532, in _load
return _load(f, map_location, pickle_module)
File "/home/turgutluk/.conda/envs/my_fastai/lib/python3.7/site-packages/torch/serialization.py", line 532, in _load
magic_number = pickle_module.load(f)
magic_number = pickle_module.load(f)
Here is the full script: https://github.com/KeremTurgutlu/hist_cancer_detection/blob/master/multi_gpu_training.py