The error is here:
TypeError: __init__() got an unexpected keyword argument 'fwd'
The fwd keyword argument gets passed down the stack when the Learner sets up its hooks, but Hook.__init__ doesn't accept it. What you have to do is make the following adjustments:
class Hook():
    def __init__(self, m, f, fwd=True):
        # fwd=True registers a forward hook, fwd=False a backward hook
        if fwd:
            self.hook = m.register_forward_hook(partial(f, self))
        else:
            self.hook = m.register_backward_hook(partial(f, self))
    def remove(self): self.hook.remove()
    def __del__(self): self.remove()

class Hooks(ListContainer):
    def __init__(self, ms, f, fwd=True):
        super().__init__([Hook(m, f, fwd) for m in ms])
    def __enter__(self, *args): return self
    def __exit__(self, *args): self.remove()
    def __del__(self): self.remove()
    def __delitem__(self, i):
        self[i].remove()
        super().__delitem__(i)
    def remove(self):
        for h in self: h.remove()
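This makes it possible to register backward hooks as well as forward ones. As a quick sanity check, here is a minimal sketch of the fwd=False path; the toy model and the printing hook are just illustrations (not part of the notebooks), and Hooks still relies on ListContainer from the course:

import torch
from torch import nn
from functools import partial  # needed by the Hook class above

def print_grad_stats(hook, mod, grad_inp, grad_out):
    # backward hooks receive tuples of gradients rather than tensors
    print(type(mod).__name__, grad_out[-1].abs().mean().item())

model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 1))
with Hooks(model, print_grad_stats, fwd=False) as hooks:
    model(torch.randn(4, 10)).sum().backward()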
Here's an updated version of the callbacks which should work (with the above adjustments made) in notebook 09b_learner as of today. I added functionality to plot the percentage of "small" activations, as in notebook 06, as well as the colour plot showing the binned distribution of activations.
def append_stats(hook, mod, inp, outp):
    # forward hook: outp is the module's output tensor
    if not hasattr(hook, 'stats'): hook.stats = ([], [], [])
    means, stds, hists = hook.stats
    if mod.training:
        means.append(outp.data.mean().item())
        stds.append(outp.data.std().item())
        max_bin = outp.abs().max().item()
        hists.append(outp.data.cpu().histc(60, -max_bin, max_bin))
def append_gradient_stats(hook, mod, inp, outp):
    # backward hook: inp/outp are the grad_input/grad_output tuples,
    # so outp[-1] is the gradient with respect to the module's output
    if not hasattr(hook, 'stats'): hook.stats = ([], [], [])
    means, stds, hists = hook.stats
    if mod.training:
        means.append(outp[-1].data.abs().mean().item())
        stds.append(outp[-1].data.abs().std().item())
        max_bin = outp[-1].data.abs().max().item()
        hists.append(outp[-1].data.cpu().histc(40, -max_bin, max_bin))
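Both plot_distributions methods below also rely on get_hist from notebook 06. If you don't have it exported, it is roughly the following (stack the per-iteration histograms into a (bins, iterations) tensor and log-scale it); treat this as a sketch from memory rather than the exact notebook code:

def get_hist(hook):
    return torch.stack(hook.stats[2]).t().float().log1p()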
class LayerActivations(Callback):
    _order = 0
    def __init__(self, ms=None):
        self.ms = ms
    def begin_fit(self):
        # by default, hook every top-level module with more than one parameter tensor
        if self.ms is None:
            self.ms = [m for m in self.model if len(list(m.parameters())) > 1]
        self.hooks = Hooks(self.ms, append_stats, fwd=True)
    def after_fit(self):
        self.hooks.remove()
    def plot_stats(self):
        for k, m in enumerate(self.ms):
            print('Block {}: {}'.format(k, m))
        fig, ax = plt.subplots(figsize=(12, 8))
        act_means = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[0])
                                  for k, hook in enumerate(self.hooks)})
        for act in act_means.columns:
            ax.plot(act_means[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Mean of activations by block')
        fig, ax = plt.subplots(figsize=(12, 8))
        act_stds = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[1])
                                 for k, hook in enumerate(self.hooks)})
        for act in act_stds.columns:
            ax.plot(act_stds[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Std of activations by block')
    def plot_distributions(self, num_batches=120):
        for k, h in enumerate(self.hooks):
            fig, ax = plt.subplots(figsize=(12, 8))
            hist = get_hist(h)
            if hist[:8].sum().item() == 0:
                hist = hist[29:]
            ax.imshow(hist[:, :num_batches], origin='lower', cmap='RdYlGn')
            ax.axis('off')
            plt.title('Block {}'.format(k))
    def plot_percent_small(self):
        for k, h in enumerate(self.hooks):
            fig, ax = plt.subplots(figsize=(8, 6))
            vals = torch.stack(h.stats[2]).t().float()
            # bins 29-30 are the two centre bins of the 60-bin histogram, i.e. the values closest to zero
            vals = vals[29:31].sum(dim=0) / vals.sum(dim=0)
            ax.plot(vals)
            ax.set_xlabel('Iteration')
            plt.title('Percent activations near zero: Block {}'.format(k))
class GradientNorms(Callback):
    _order = 0
    def __init__(self, ms=None):
        self.ms = ms
    def begin_fit(self):
        if self.ms is None:
            self.ms = [m for m in self.model if len(list(m.parameters())) > 1]
        self.hooks = Hooks(self.ms, append_gradient_stats, fwd=False)
    def after_fit(self):
        self.hooks.remove()
    def plot_stats(self):
        for k, m in enumerate(self.ms):
            print('Block {}: {}'.format(k, m))
        fig, ax = plt.subplots(figsize=(12, 8))
        act_means = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[0])
                                  for k, hook in enumerate(self.hooks)})
        for act in act_means.columns:
            ax.plot(act_means[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Mean of gradient norm by block')
        fig, ax = plt.subplots(figsize=(12, 8))
        act_stds = pd.DataFrame({'Block_{}'.format(k): Smoother().process(hook.stats[1])
                                 for k, hook in enumerate(self.hooks)})
        for act in act_stds.columns:
            ax.plot(act_stds[act], label=act)
        ax.legend()
        ax.set_xlabel('Iteration')
        plt.title('Std of gradient norm by block')
    def plot_distributions(self, num_batches=100):
        for k, h in enumerate(self.hooks):
            fig, ax = plt.subplots(figsize=(12, 8))
            ax.imshow(get_hist(h)[:, :num_batches], origin='lower', cmap='RdYlGn')
            ax.axis('off')
            plt.title('Block {}'.format(k))
Usage is:

learn.layer_activations.plot_stats()
learn.layer_activations.plot_distributions()
learn.layer_activations.plot_percent_small()
learn.gradient_norms.plot_stats()
learn.gradient_norms.plot_distributions()
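To get the learn.layer_activations / learn.gradient_norms attributes in the first place, the callbacks have to be attached when the Learner is built. A minimal sketch, assuming the 09b-style Learner that registers each callback as an attribute named after its class; model, data and loss_func stand in for whatever you already have, and the argument names (lr, cb_funcs) may differ slightly in your copy of the notebook:

learn = Learner(model, data, loss_func, lr=0.4,
                cb_funcs=[LayerActivations, GradientNorms])
learn.fit(1)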