The error is here:
TypeError: __init__() got an unexpected keyword argument 'fwd'
The fwd keyword argument gets passed down the stack when the Learner sets up its hooks, but Hook.__init__ doesn't accept it. What you have to do is make the following adjustments:
class Hook():
    def __init__(self, m, f, fwd=True):
        # fwd=True registers a forward hook, fwd=False a backward hook
        if fwd:
            self.hook = m.register_forward_hook(partial(f, self))
        else:
            self.hook = m.register_backward_hook(partial(f, self))
    def remove(self): self.hook.remove()
    def __del__(self): self.remove()

class Hooks(ListContainer):
    def __init__(self, ms, f, fwd=True):
        super().__init__([Hook(m, f, fwd) for m in ms])
    def __enter__(self, *args): return self
    def __exit__(self, *args): self.remove()
    def __del__(self): self.remove()
    def __delitem__(self, i):
        self[i].remove()
        super().__delitem__(i)
    def remove(self):
        for h in self: h.remove()
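This makes it possible to register backward hooks as well as forward ones. As a quick sanity check, here is a minimal sketch of the fwd=False path; the toy model and the printing hook are just illustrations (not part of the notebooks), and Hooks still relies on ListContainer from the course:

import torch
from torch import nn
from functools import partial  # needed by the Hook class above

def print_grad_stats(hook, mod, grad_inp, grad_out):
    # backward hooks receive tuples of gradients rather than tensors
    print(type(mod).__name__, grad_out[-1].abs().mean().item())

model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 1))
with Hooks(model, print_grad_stats, fwd=False) as hooks:
    model(torch.randn(4, 10)).sum().backward()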
Here's an updated version of the callbacks which should work (with the above adjustments made) in notebook 09b_learner as of today. I added functionality to plot the percentage of "small" activations, as in notebook 06, as well as the colour plot showing the binned distribution of activations.
def append_stats(hook, mod, inp, outp):
    # forward hook: outp is the module's output tensor
    if not hasattr(hook, 'stats'): hook.stats = ([], [], [])
    means, stds, hists = hook.stats
    if mod.training:
        means.append(outp.data.mean().item())
        stds.append(outp.data.std().item())
        max_bin = outp.abs().max().item()
        hists.append(outp.data.cpu().histc(60, -max_bin, max_bin))
def append_gradient_stats(hook, mod, inp, outp):
    # backward hook: inp/outp are the grad_input/grad_output tuples,
    # so outp[-1] is the gradient with respect to the module's output
    if not hasattr(hook, 'stats'): hook.stats = ([], [], [])
    means, stds, hists = hook.stats
    if mod.training:
        means.append(outp[-1].data.abs().mean().item())
        stds.append(outp[-1].data.abs().std().item())
        max_bin = outp[-1].data.abs().max().item()
        hists.append(outp[-1].data.cpu().histc(40, -max_bin, max_bin))
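Both plot_distributions methods below also rely on get_hist from notebook 06. If you don't have it exported, it is roughly the following (stack the per-iteration histograms into a (bins, iterations) tensor and log-scale it); treat this as a sketch from memory rather than the exact notebook code:

def get_hist(hook):
    return torch.stack(hook.stats[2]).t().float().log1p()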
class LayerActivations(Callback):
    _order = 0
    def __init__(self, ms=None):
        self.ms = ms
    def begin_fit(self):
        # by default, hook every top-level module with more than one parameter tensor
        if self.ms is None:
            self.ms = [m for m in self.model if len(list(m.parameters())) > 1]
        self.hooks = Hooks(self.ms, append_stats, fwd=True)
    def after_fit(self):
        self.hooks.remove()
    def plot_stats(self):
        for k, m in enumerate(self.ms):
            print('Block {}: {}'.format(k, m))
        fig, ax = plt.subplots(figsize=(12, 8))
        act_means = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[0])
                                  for k, hook in enumerate(self.hooks)})
        for act in act_means.columns:
            ax.plot(act_means[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Mean of activations by block')
        fig, ax = plt.subplots(figsize=(12, 8))
        act_stds = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[1])
                                 for k, hook in enumerate(self.hooks)})
        for act in act_stds.columns:
            ax.plot(act_stds[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Std of activations by block')
    def plot_distributions(self, num_batches=120):
        for k, h in enumerate(self.hooks):
            fig, ax = plt.subplots(figsize=(12, 8))
            hist = get_hist(h)
            if hist[:8].sum().item() == 0:
                hist = hist[29:]
            ax.imshow(hist[:, :num_batches], origin='lower', cmap='RdYlGn')
            ax.axis('off')
            plt.title('Block {}'.format(k))
    def plot_percent_small(self):
        for k, h in enumerate(self.hooks):
            fig, ax = plt.subplots(figsize=(8, 6))
            vals = torch.stack(h.stats[2]).t().float()
            # bins 29-30 are the two centre bins of the 60-bin histogram, i.e. the values closest to zero
            vals = vals[29:31].sum(dim=0) / vals.sum(dim=0)
            ax.plot(vals)
            ax.set_xlabel('Iteration')
            plt.title('Percent activations near zero: Block {}'.format(k))
class GradientNorms(Callback):
    _order = 0
    def __init__(self, ms=None):
        self.ms = ms
    def begin_fit(self):
        if self.ms is None:
            self.ms = [m for m in self.model if len(list(m.parameters())) > 1]
        self.hooks = Hooks(self.ms, append_gradient_stats, fwd=False)
    def after_fit(self):
        self.hooks.remove()
    def plot_stats(self):
        for k, m in enumerate(self.ms):
            print('Block {}: {}'.format(k, m))
        fig, ax = plt.subplots(figsize=(12, 8))
        act_means = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[0])
                                  for k, hook in enumerate(self.hooks)})
        for act in act_means.columns:
            ax.plot(act_means[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Mean of gradient norm by block')
        fig, ax = plt.subplots(figsize=(12, 8))
        act_stds = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[1])
                                 for k, hook in enumerate(self.hooks)})
        for act in act_stds.columns:
            ax.plot(act_stds[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Std of gradient norm by block')
    def plot_distributions(self, num_batches=100):
        for k, h in enumerate(self.hooks):
            fig, ax = plt.subplots(figsize=(12, 8))
            ax.imshow(get_hist(h)[:, :num_batches], origin='lower', cmap='RdYlGn')
            ax.axis('off')
            plt.title('Block {}'.format(k))
Usage is:

learn.layer_activations.plot_stats()
learn.layer_activations.plot_distributions()
learn.layer_activations.plot_percent_small()
learn.gradient_norms.plot_stats()
learn.gradient_norms.plot_distributions()
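To get the learn.layer_activations / learn.gradient_norms attributes in the first place, the callbacks have to be attached when the Learner is built. A minimal sketch, assuming the 09b-style Learner that registers each callback as an attribute named after its class; model, data and loss_func stand in for whatever you already have, and the argument names (lr, cb_funcs) may differ slightly in your copy of the notebook:

learn = Learner(model, data, loss_func, lr=0.4,
                cb_funcs=[LayerActivations, GradientNorms])
learn.fit(1)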