In [ ]:

#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [ ]:

#hide
from fastbook import *

CNN Interpretation with CAM¶

CAM and Hooks¶

In [ ]:

path = untar_data(URLs.PETS)/'images'
def is_cat(x): return x[0].isupper()
dls = ImageDataLoaders.from_name_func(
    path, get_image_files(path), valid_pct=0.2, seed=21,
    label_func=is_cat, item_tfms=Resize(224))
learn = vision_learner(dls, resnet34, metrics=error_rate)
learn.fine_tune(1)

In [ ]:

img = PILImage.create(image_cat())
x, = first(dls.test_dl([img]))

In [ ]:

class Hook():
    def hook_func(self, m, i, o): self.stored = o.detach().clone()

In [ ]:

hook_output = Hook()
hook = learn.model[0].register_forward_hook(hook_output.hook_func)

In [ ]:

with torch.no_grad(): output = learn.model.eval()(x)

In [ ]:

act = hook_output.stored[0]

In [ ]:

F.softmax(output, dim=-1)

In [ ]:

dls.vocab

In [ ]:

x.shape

In [ ]:

cam_map = torch.einsum('ck,kij->cij', learn.model[1][-1].weight, act)
cam_map.shape

In [ ]:

x_dec = TensorImage(dls.train.decode((x,))[0][0])
_,ax = plt.subplots()
x_dec.show(ctx=ax)
ax.imshow(cam_map[1].detach().cpu(), alpha=0.6, extent=(0,224,224,0),
              interpolation='bilinear', cmap='magma');

In [ ]:

hook.remove()

In [ ]:

class Hook():
    def __init__(self, m):
        self.hook = m.register_forward_hook(self.hook_func)   
    def hook_func(self, m, i, o): self.stored = o.detach().clone()
    def __enter__(self, *args): return self
    def __exit__(self, *args): self.hook.remove()

In [ ]:

with Hook(learn.model[0]) as hook:
    with torch.no_grad(): output = learn.model.eval()(x.cuda())
    act = hook.stored

Gradient CAM¶

In [ ]:

class HookBwd():
    def __init__(self, m):
        self.hook = m.register_backward_hook(self.hook_func)   
    def hook_func(self, m, gi, go): self.stored = go[0].detach().clone()
    def __enter__(self, *args): return self
    def __exit__(self, *args): self.hook.remove()

In [ ]:

cls = 1
with HookBwd(learn.model[0]) as hookg:
    with Hook(learn.model[0]) as hook:
        output = learn.model.eval()(x.cuda())
        act = hook.stored
    output[0,cls].backward()
    grad = hookg.stored

In [ ]:

w = grad[0].mean(dim=[1,2], keepdim=True)
cam_map = (w * act[0]).sum(0)

In [ ]:

_,ax = plt.subplots()
x_dec.show(ctx=ax)
ax.imshow(cam_map.detach().cpu(), alpha=0.6, extent=(0,224,224,0),
              interpolation='bilinear', cmap='magma');

In [ ]:

with HookBwd(learn.model[0][-2]) as hookg:
    with Hook(learn.model[0][-2]) as hook:
        output = learn.model.eval()(x.cuda())
        act = hook.stored
    output[0,cls].backward()
    grad = hookg.stored

In [ ]:

w = grad[0].mean(dim=[1,2], keepdim=True)
cam_map = (w * act[0]).sum(0)

In [ ]:

_,ax = plt.subplots()
x_dec.show(ctx=ax)
ax.imshow(cam_map.detach().cpu(), alpha=0.6, extent=(0,224,224,0),
              interpolation='bilinear', cmap='magma');

Conclusion¶

Questionnaire¶

What is a "hook" in PyTorch?
Which layer does CAM use the outputs of?
Why does CAM require a hook?
Look at the source code of the ActivationStats class and see how it uses hooks.
Write a hook that stores the activations of a given layer in a model (without peeking, if possible).
Why do we call eval before getting the activations? Why do we use no_grad?
Use torch.einsum to compute the "dog" or "cat" score of each of the locations in the last activation of the body of the model.
How do you check which order the categories are in (i.e., the correspondence of index->category)?
Why are we using decode when displaying the input image?
What is a "context manager"? What special methods need to be defined to create one?
Why can't we use plain CAM for the inner layers of a network?
Why do we need to register a hook on the backward pass in order to do Grad-CAM?
Why can't we call output.backward() when output is a rank-2 tensor of output activations per image per class?

Further Research¶

Try removing keepdim and see what happens. Look up this parameter in the PyTorch docs. Why do we need it in this notebook?
Create a notebook like this one, but for NLP, and use it to find which words in a movie review are most significant in assessing the sentiment of a particular movie review.

In [ ]: