LRFinder with Tensorboard callback causes error

cheremushkin · October 9, 2019, 2:45am

As the title says, I am trying to use the Tensorboard callback as described in the docs:

learner = cnn_learner(data, models.resnet18).to_fp16()
learner.callback_fns.append(
    partial(LearnerTensorboardWriter, base_dir=Path('tensorboard/resnet18/'), name='1')
)

If I call learner.fit_one_cycle(), everything trains fine and I can see the results in Tensorboard. However, calling learner.lr_find() results in an error:

NotImplementedError: Got <class ‘NoneType’>, but expected numpy array or torch tensor.

Traceback

NotImplementedError Traceback (most recent call last)
in
----> 1 learner.lr_find()

/opt/anaconda3/lib/python3.7/site-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
30 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
31 epochs = int(np.ceil(num_it/len(learn.data.train_dl)))
—> 32 learn.fit(epochs, start_lr, callbacks=[cb], wd=wd)
33
34 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=True, clip:float=None,

/opt/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
200 callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
201 self.cb_fns_registered = True
→ 202 fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
203
204 def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/opt/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
106 cb_handler=cb_handler, pbar=pbar)
107 else: val_loss=None
→ 108 if cb_handler.on_epoch_end(val_loss): break
109 except Exception as e:
110 exception = e

/opt/anaconda3/lib/python3.7/site-packages/fastai/callback.py in on_epoch_end(self, val_loss)
315 “Epoch is done, process val_loss.”
316 self.state_dict[‘last_metrics’] = [val_loss] if val_loss is not None else [None]
→ 317 self(‘epoch_end’, call_mets = val_loss is not None)
318 self.state_dict[‘epoch’] += 1
319 return self.state_dict[‘stop_training’]

/opt/anaconda3/lib/python3.7/site-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
249 if call_mets:
250 for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
→ 251 for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
252
253 def set_dl(self, dl:DataLoader):

/opt/anaconda3/lib/python3.7/site-packages/fastai/callback.py in _call_and_update(self, cb, cb_name, **kwargs)
239 def _call_and_update(self, cb, cb_name, **kwargs)->None:
240 "Call cb_name on cb and update the inner state."
→ 241 new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
242 for k,v in new.items():
243 if k not in self.state_dict:

/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py in on_epoch_end(self, last_metrics, iteration, **kwargs)
99 def on_epoch_end(self, last_metrics:MetricsList, iteration:int, **kwargs)->None:
100 “Callback function that writes epoch end appropriate data to Tensorboard.”
→ 101 self._write_metrics(iteration=iteration, last_metrics=last_metrics)
102
103 # TODO: We’re overriding almost everything here. Seems like a good idea to question that (“is a” vs “has a”)

/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py in _write_metrics(self, iteration, last_metrics, start_idx)
77 if last_metrics is None or len(last_metrics) < i+1: return
78 scalar_value = last_metrics[i]
—> 79 self._write_scalar(name=name, scalar_value=scalar_value, iteration=iteration)
80
81 def on_train_begin(self, **kwargs: Any) → None:

/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py in _write_scalar(self, name, scalar_value, iteration)
68 “Writes single scalar value to Tensorboard.”
69 tag = self.metrics_root + name
—> 70 self.tbwriter.add_scalar(tag=tag, scalar_value=scalar_value, global_step=iteration)
71
72 #TODO: Relying on a specific hardcoded start_idx here isn’t great. Is there a better solution?

/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/writer.py in add_scalar(self, tag, scalar_value, global_step, walltime)
386 scalar_value = workspace.FetchBlob(scalar_value)
387 self._get_file_writer().add_summary(
→ 388 scalar(tag, scalar_value), global_step, walltime)
389
390 def add_scalars(self, main_tag, tag_scalar_dict, global_step=None, walltime=None):

/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/summary.py in scalar(name, scalar, collections)
137 “”"
138 name = _clean_tag(name)
→ 139 scalar = make_np(scalar)
140 assert(scalar.squeeze().ndim == 0), ‘scalar should be 0D’
141 scalar = float(scalar)

/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/x2num.py in make_np(x)
32 return check_nan(prepare_mxnet(x))
33 raise NotImplementedError(
—> 34 ‘Got {}, but expected numpy array or torch tensor.’.format(type(x)))
35
36

NotImplementedError: Got <class ‘NoneType’>, but expected numpy array or torch tensor.

Is there a way to fix that issue?

Edit: there is also a warning after calling learner.lr_find():

Warning

Exception in thread Thread-4:
Traceback (most recent call last):
File “/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py”, line 595, in run_mod_and_filter_tensor_outputs
outs = wrap_retval(mod(*_clone_inputs(inputs)))
RuntimeError: Input type (Variable[CUDAFloatType]) and weight type (Variable[CUDAHalfType]) should be the same
The above operation failed in interpreter, with the following stack trace:
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py(340): conv2d_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py(343): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torchvision/models/resnet.py(59): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py(92): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py(92): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py(92): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py(904): trace_module
/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py(772): trace
/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/pytorch_graph.py(275): graph
/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/writer.py(774): add_graph
/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py(417): write
/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py(227): _queue_processor
/opt/anaconda3/lib/python3.7/threading.py(870): run
/opt/anaconda3/lib/python3.7/threading.py(926): _bootstrap_inner
/opt/anaconda3/lib/python3.7/threading.py(890): _bootstrap

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File “/opt/anaconda3/lib/python3.7/threading.py”, line 926, in _bootstrap_inner
self.run()
File “/opt/anaconda3/lib/python3.7/threading.py”, line 870, in run
self._target(*self._args, **self._kwargs)
File “/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py”, line 227, in _queue_processor
request.write()
File “/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py”, line 417, in write
self.tbwriter.add_graph(model=self.model, input_to_model=self.input_to_model)
File “/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/writer.py”, line 774, in add_graph
self._get_file_writer().add_graph(graph(model, input_to_model, verbose, **kwargs))
File “/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/pytorch_graph.py”, line 275, in graph
trace = torch.jit.trace(model, args)
File “/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py”, line 772, in trace
check_tolerance, _force_outplace, _module_class)
File “/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py”, line 914, in trace_module
check_tolerance, _force_outplace, True, _module_class)
File “/opt/anaconda3/lib/python3.7/site-packages/torch/autograd/grad_mode.py”, line 49, in decorate_no_grad
return func(*args, **kwargs)
File “/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py”, line 633, in _check_trace
traced_outs = run_mod_and_filter_tensor_outputs(traced_func, inputs, ‘trace’)
File “/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py”, line 601, in run_mod_and_filter_tensor_outputs
’ with test inputs.\nException:\n’ + indent(str(e)))
torch.jit.TracingCheckError: Tracing failed sanity checks!
Encountered an exception while running the trace with test inputs.
Exception:
Input type (Variable[CUDAFloatType]) and weight type (Variable[CUDAHalfType]) should be the same
The above operation failed in interpreter, with the following stack trace:
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py(340): conv2d_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py(343): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torchvision/models/resnet.py(59): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py(92): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py(92): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py(92): forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(531): _slow_forward
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(545): call
/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py(904): trace_module
/opt/anaconda3/lib/python3.7/site-packages/torch/jit/init.py(772): trace
/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/pytorch_graph.py(275): graph
/opt/anaconda3/lib/python3.7/site-packages/tensorboardX/writer.py(774): add_graph
/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py(417): write
/opt/anaconda3/lib/python3.7/site-packages/fastai/callbacks/tensorboard.py(227): _queue_processor
/opt/anaconda3/lib/python3.7/threading.py(870): run
/opt/anaconda3/lib/python3.7/threading.py(926): _bootstrap_inner
/opt/anaconda3/lib/python3.7/threading.py(890): _bootstrap

TomB · October 9, 2019, 9:17am

There is this code in the callback:

if self.learn.data.valid_dl is None: return # Running learning rate finder, so return

But I guess subsequent fastai changes have broken it. So you’d need to find a way of detecting the LR finder. You don’t want to be logging to tensorboard then anyway. I haven’t looked into this yet.

To avoid this, I just add the callback in the call to fit()/fit_one_cycle() instead of attaching it to the learner.
So:

lrn = Learner(...) # Don't add tensorboard callback
lrn.lr_find()
lrn.recorder.plot()

tb_cb = TensorboardCallback(lrn, config=tb_config, writer=tb_writer)
lrn_cfn.fit_one_cycle(EPOCHS, LR, callbacks=[tb_cb])

This uses my own Tensorboard callback, but it should work fine with the built-in one. It also means you can run tests of the learner before a full training run without them being logged, so it is probably better overall. It would still be nice to fix the underlying issue.