I have been stuck on this for several days, so I thought I'd post it here.
There seems to be something odd with memory management after an epoch finishes. I tried to run the fastai U-Net on the SYNTHIA dataset, and below is the error I get despite using a batch size of 1 with a crop to 360 on an 11 GB K80. I must be missing something, because CamVid runs fine. (The error also does not seem to happen if I only load 10 examples into the dataset.)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-18-637f3e7802b9> in <module>()
----> 1 learn.fit_one_cycle(10,slice(1e-6,1e-3), cbs=WandbCallback())
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
88 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
89 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
---> 90 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
91
92 # Cell
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
290 self._do_epoch_validate()
291 except CancelEpochException: self('after_cancel_epoch')
--> 292 finally: self('after_epoch')
293
294 except CancelFitException: self('after_cancel_fit')
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in __call__(self, event_name)
221 def ordered_cbs(self, cb_func:str): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, cb_func)]
222
--> 223 def __call__(self, event_name): L(event_name).map(self._call_one)
224 def _call_one(self, event_name):
225 assert hasattr(event, event_name)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in map(self, f, *args, **kwargs)
360 else f.format if isinstance(f,str)
361 else f.__getitem__)
--> 362 return self._new(map(g, self))
363
364 def filter(self, f, negate=False, **kwargs):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in _new(self, items, *args, **kwargs)
313 @property
314 def _xtra(self): return None
--> 315 def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
316 def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
317 def copy(self): return self._new(self.items.copy())
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
39 return x
40
---> 41 res = super().__call__(*((x,) + args), **kwargs)
42 res._newchk = 0
43 return res
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
304 if items is None: items = []
305 if (use_list is not None) or not _is_array(items):
--> 306 items = list(items) if use_list else _listify(items)
307 if match is not None:
308 if is_coll(match): match = len(match)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in _listify(o)
240 if isinstance(o, list): return o
241 if isinstance(o, str) or _is_array(o): return [o]
--> 242 if is_iter(o): return list(o)
243 return [o]
244
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __call__(self, *args, **kwargs)
206 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
207 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 208 return self.fn(*fargs, **kwargs)
209
210 # Cell
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in _call_one(self, event_name)
224 def _call_one(self, event_name):
225 assert hasattr(event, event_name)
--> 226 [cb(event_name) for cb in sort_by_run(self.cbs)]
227
228 def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in <listcomp>(.0)
224 def _call_one(self, event_name):
225 assert hasattr(event, event_name)
--> 226 [cb(event_name) for cb in sort_by_run(self.cbs)]
227
228 def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in __call__(self, event_name)
23 _run = (event_name not in _inner_loop or (self.run_train and getattr(self, 'training', True)) or
24 (self.run_valid and not getattr(self, 'training', False)))
---> 25 if self.run and _run: getattr(self, event_name, noop)()
26
27 @property
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/callback/wandb.py in after_epoch(self)
64 if self.log_preds:
65 b = self.valid_dl.one_batch()
---> 66 self.learn.one_batch(0, b)
67 preds = getattr(self.loss_func, 'activation', noop)(self.pred)
68 out = getattr(self.loss_func, 'decodes', noop)(preds)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in one_batch(self, i, b)
246 try:
247 self._split(b); self('begin_batch')
--> 248 self.pred = self.model(*self.xb); self('after_pred')
249 if len(self.yb) == 0: return
250 self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
545 result = self._slow_forward(*input, **kwargs)
546 else:
--> 547 result = self.forward(*input, **kwargs)
548 for hook in self._forward_hooks.values():
549 hook_result = hook(self, input, result)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/layers.py in forward(self, x)
415 for l in self.layers:
416 res.orig = x
--> 417 nres = l(res)
418 # We have to remove res.orig to avoid hanging refs and therefore memory leaks
419 res.orig = None
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
545 result = self._slow_forward(*input, **kwargs)
546 else:
--> 547 result = self.forward(*input, **kwargs)
548 for hook in self._forward_hooks.values():
549 hook_result = hook(self, input, result)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/vision/models/unet.py in forward(self, up_in)
38 if ssh != up_out.shape[-2:]:
39 up_out = F.interpolate(up_out, s.shape[-2:], mode='nearest')
---> 40 cat_x = self.relu(torch.cat([up_out, self.bn(s)], dim=1))
41 return self.conv2(self.conv1(cat_x))
42
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
545 result = self._slow_forward(*input, **kwargs)
546 else:
--> 547 result = self.forward(*input, **kwargs)
548 for hook in self._forward_hooks.values():
549 hook_result = hook(self, input, result)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/activation.py in forward(self, input)
92
93 def forward(self, input):
---> 94 return F.relu(input, inplace=self.inplace)
95
96 def extra_repr(self):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/functional.py in relu(input, inplace)
911 result = torch.relu_(input)
912 else:
--> 913 result = torch.relu(input)
914 return result
915
RuntimeError: CUDA out of memory. Tried to allocate 508.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 4.81 MiB free; 419.63 MiB cached)
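For reference, this is roughly how the failing run is set up (a minimal sketch rather than my exact code; `path`, `codes`, and `label_func` are placeholders for my actual SYNTHIA location, class codes, and mask-lookup function):

```python
from fastai2.vision.all import *
from fastai2.callback.wandb import *
import wandb

wandb.init(project='synthia-unet')  # placeholder project name

# Segmentation DataBlock matching the failing run: bs=1, crop to 360
synthia = DataBlock(blocks=(ImageBlock, MaskBlock(codes)),
                    get_items=get_image_files,
                    get_y=label_func,
                    item_tfms=RandomCrop(360))
dls = synthia.dataloaders(path, bs=1)

# n_out=24 is the smallest value that runs (see the observations below),
# even though SYNTHIA should theoretically have 15 classes
learn = unet_learner(dls, resnet34, n_out=24)
learn.fit_one_cycle(10, slice(1e-6, 1e-3), cbs=WandbCallback())
```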
So far I have been unable to replicate this on CamVid, but I will keep trying. The odd thing is that on CamVid (essentially the same kind of dataset), with bs=8 and a much larger image size, GPU memory usage sits at around 50%.
Further interesting aspects:
- This is despite the fact that SYNTHIA takes ~80% of GPU memory with a much smaller batch size and resolution.
- One potential culprit I am investigating now is the number of classes: SYNTHIA does not seem to run with `n_out` below 24, even though the dataset should theoretically have 15 classes.
- WandbCallback() seems to have a substantial GPU memory footprint. I had noticed this before, but now it has a real impact. (This looks like a wandb problem rather than a fastai one, though, since it happens even without fastai when run from the CLI.) See the monitoring sketch below.
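To see whether memory actually accumulates across epochs, I am logging CUDA memory after every epoch with a small callback along these lines (a rough sketch; the `Callback` import path matches the fastai2 version in the traceback above, and `GPUMemCallback` is just my ad-hoc name):

```python
import torch
from fastai2.learner import Callback  # Callback is defined in learner.py in this fastai2 version

class GPUMemCallback(Callback):
    "Print allocated/cached CUDA memory after each epoch to spot accumulation."
    def after_epoch(self):
        alloc  = torch.cuda.memory_allocated() / 2**20  # MiB currently held by tensors
        cached = torch.cuda.memory_cached()    / 2**20  # MiB held by the caching allocator
        print(f'epoch {self.epoch}: allocated {alloc:.0f} MiB, cached {cached:.0f} MiB')

learn.fit_one_cycle(10, slice(1e-6, 1e-3), cbs=[WandbCallback(), GPUMemCallback()])
```

Since the traceback goes through the `log_preds` branch of `WandbCallback.after_epoch`, my next experiment is to check whether `WandbCallback(log_preds=False)` makes the OOM disappear.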