Commit c4fbdaf
Core: Multiple Inputs and Keyword Arguments
- torch 2.0.0 allows us to pass multiple args and kwargs to hooks
- handle multiple inputs and outputs in core.Hook and core.BasicHook, by passing all required grad_outputs and inputs to the backward implementation
- BasicHook still only processes a single input
- Hook checks the function signature to allow backwards-compatibility
- added a basic test that uses the kwargs-signature
- added a note that keyword arguments are supported in the documentation

Notes:
- This stands in conflict with #168, but promises a better implementation by handling inputs and outputs as common to a single function, rather than individually as proposed in #168
- This does not deal with parameter gradients, which are better left to a separate PR

implements #176
1 parent 4b73ea4 commit c4fbdaf

File tree: 5 files changed, +224 -60 lines

docs/source/how-to/write-custom-rules.rst

Lines changed: 12 additions & 4 deletions

@@ -85,15 +85,23 @@ for the layer-wise relevance propagation (LRP)-based **Composites**, used for
 all activations.
 :py:class:`~zennit.core.Hook` has a dictionary attribute ``stored_tensors``,
 which is used to store the output gradient as ``stored_tensors['grad_output']``.
-:py:meth:`~zennit.core.Hook.forward` has 3 arguments:
+:py:meth:`~zennit.core.Hook.forward` can have one of two signatures, one with keyword arguments and one without.
+If the rule does not need to handle keyword arguments:
 
 * ``module``, which is the current module the hook has been registered to,
-* ``input``, which is the module's input tensor, and
-* ``output``, which is the module's output tensor.
+* ``input``, which are the module's input tensors, and
+* ``output``, which are the module's output tensors.
+
+If the rule should also handle keyword arguments (new in version 1.0.0), the following signature may be used:
+
+* ``module``, which is the current module the hook has been registered to,
+* ``args``, which are the module's positional inputs (mixed tensors and parameters allowed),
+* ``kwargs``, which are the module's keyword inputs (tensors unsupported), and
+* ``output``, which are the module's output tensors.
 
 :py:meth:`~zennit.core.Hook.forward` is always called *after* the forward has
 been called, thus making ``output`` available.
-Using the notation above, ``input`` is :math:`x` and ``output`` is :math:`f(x)`.
+Using the first notation above, ``input`` is :math:`x` and ``output`` is :math:`f(x)`.
 
 A layer-wise *gradient times input* can be implemented by storing the input
 tensor in the forward pass and directly using ``grad_input`` in the backward
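
A minimal sketch of a rule written against the keyword-argument signature described above; the class name is hypothetical and follows the gradient-times-input pattern used in this guide, assuming the ``Hook`` interface from ``zennit.core``:

import torch
from zennit.core import Hook


class GradTimesInputKwargs(Hook):
    '''Hypothetical sketch: layer-wise gradient times input with the keyword-argument signature.'''

    def forward(self, module, args, kwargs, output):
        # store the positional input tensors; tensors passed as keyword arguments are unsupported
        self.stored_tensors['input'] = args

    def backward(self, module, grad_input, grad_output):
        # multiply the stored first input element-wise with the incoming output gradient
        return (self.stored_tensors['input'][0] * grad_output[0],)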

src/zennit/core.py

Lines changed: 156 additions & 46 deletions

@@ -20,6 +20,8 @@
 import weakref
 from contextlib import contextmanager
 from typing import Generator, Iterator
+from itertools import compress, repeat
+from inspect import signature
 
 import torch
 
@@ -235,6 +237,44 @@ def modifier_wrapper(input, name):
     return zero_params_wrapper
 
 
+def uncompress(data, selector, compressed) -> Generator:
+    '''Generator which, given a compressed iterable produced by :py:obj:`itertools.compress` and (some iterable similar
+    to) the original data and selector used for :py:obj:`~itertools.compress`, yields values from `compressed` or
+    `data` depending on `selector`. `True` values in `selector` skip `data` one ahead and yield a value from
+    `compressed`, while `False` values yield one value from `data`.
+
+    Parameters
+    ----------
+    data : iterable
+        The iterable (similar to the) original data. `False` values in the `selector` will be filled with values from
+        this iterator, while `True` values will cause this iterable to be skipped.
+    selector : iterable of bool
+        The original selector used to produce `compressed`. Chooses whether elements from `data` or from `compressed`
+        will be yielded.
+    compressed : iterable
+        The results of :py:obj:`itertools.compress`. Will be yielded for each `True` element in `selector`.
+
+    Yields
+    ------
+    object
+        An element of `data` if the associated element of `selector` is `False`, otherwise an element of `compressed`
+        while skipping `data` one ahead.
+
+    '''
+    its = iter(selector)
+    itc = iter(compressed)
+    itd = iter(data)
+    for select in its:
+        try:
+            if select:
+                next(itd)
+                yield next(itc)
+            else:
+                yield next(itd)
+        except StopIteration:
+            break
+
+
 class ParamMod:
     '''Class to produce a context manager to temporarily modify parameter attributes (all by default) of a module.
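
A brief usage sketch of the new ``uncompress`` helper (the values are only illustrative): it undoes ``itertools.compress`` by yielding replacement values at selected positions and the original data elsewhere.

from itertools import compress

from zennit.core import uncompress  # module-level helper added in this commit

data = ['a', 1, 'b', 2]                      # mixed positional arguments
selector = [False, True, False, True]        # True marks e.g. the tensor positions
compressed = list(compress(data, selector))  # [1, 2]

# re-insert processed values at the selected positions, keep the rest from `data`
restored = list(uncompress(data, selector, [10, 20]))
assert restored == ['a', 10, 'b', 20]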
@@ -394,6 +434,7 @@ def forward(ctx, *inputs):
         inputs: tuple of :py:obj:`torch.Tensor`
             The unmodified inputs.
         '''
+        ctx.mark_non_differentiable(*[elem for elem in inputs if not elem.requires_grad])
         return inputs
 
     @staticmethod
@@ -422,15 +463,41 @@ def __init__(self):
         self.active = True
         self.tensor_handles = RemovableHandleList()
 
-    def pre_forward(self, module, input):
+    @staticmethod
+    def _inject_grad_fn(args):
+        tensor_mask = tuple(isinstance(elem, torch.Tensor) for elem in args)
+        tensors = tuple(compress(args, tensor_mask))
+
+        # only if gradient required
+        if not any(tensor.requires_grad for tensor in tensors):
+            return None, args, tensor_mask
+
+        # add identity to ensure .grad_fn exists and all tensors share the same .grad_fn
+        post_tensors = Identity.apply(*tensors)
+        grad_fn = next((tensor.grad_fn for tensor in post_tensors if tensor.grad_fn is not None), None)
+        if grad_fn is None:
+            # sanity check, should never happen because the check above already catches cases in which no input
+            # tensor requires a gradient, and in normal conditions, we will always obtain a grad_fn from `Identity`
+            # for each tensor with requires_grad=True
+            raise RuntimeError('Backward hook could not be registered!')  # pragma: no cover
+
+        # work-around to support in-place operations
+        post_tensors = tuple(elem.clone() for elem in post_tensors)
+        post_args = tuple(uncompress(args, tensor_mask, post_tensors))
+        return grad_fn, post_args, tensor_mask
+
+    def pre_forward(self, module, args, kwargs):
         '''Apply an Identity to the input before the module to register a backward hook.
 
         Parameters
         ----------
         module: :py:obj:`torch.nn.Module`
             The module to which this hook is attached.
-        input: :py:obj:`torch.Tensor`
-            The input tensor.
+        args: tuple of :py:obj:`torch.Tensor`
+            The input tensors passed to ``module.forward``.
+        kwargs: dict
+            The keyword arguments passed to ``module.forward``.
 
         Returns
         -------
@@ -440,40 +507,41 @@ def pre_forward(self, module, input):
         '''
         hook_ref = weakref.ref(self)
 
+        grad_fn, post_args, input_tensor_mask = self._inject_grad_fn(args)
+        if grad_fn is None:
+            return None
+
         @functools.wraps(self.backward)
         def wrapper(grad_input, grad_output):
             hook = hook_ref()
             if hook is not None and hook.active:
-                return hook.backward(module, grad_input, hook.stored_tensors['grad_output'])
+                return hook.backward(
+                    module,
+                    list(uncompress(
+                        repeat(None),
+                        input_tensor_mask,
+                        grad_input,
+                    )),
+                    hook.stored_tensors['grad_output'],
+                )
             return None
 
-        if not isinstance(input, tuple):
-            input = (input,)
+        # register the input tensor gradient hook
+        self.tensor_handles.append(grad_fn.register_hook(wrapper))
 
-        # only if gradient required
-        if input[0].requires_grad:
-            # add identity to ensure .grad_fn exists
-            post_input = Identity.apply(*input)
-            # register the input tensor gradient hook
-            self.tensor_handles.append(
-                post_input[0].grad_fn.register_hook(wrapper)
-            )
-            # work around to support in-place operations
-            post_input = tuple(elem.clone() for elem in post_input)
-        else:
-            # no gradient required
-            post_input = input
-        return post_input[0] if len(post_input) == 1 else post_input
-
-    def post_forward(self, module, input, output):
+        return post_args, kwargs
+
+    def post_forward(self, module, args, kwargs, output):
         '''Register a backward-hook to the resulting tensor right after the forward.
 
         Parameters
         ----------
         module: :py:obj:`torch.nn.Module`
             The module to which this hook is attached.
-        input: :py:obj:`torch.Tensor`
-            The input tensor.
+        args: tuple of :py:obj:`torch.Tensor`
+            The input tensors passed to ``module.forward``.
+        kwargs: dict
+            The keyword arguments passed to ``module.forward``.
         output: :py:obj:`torch.Tensor`
             The output tensor.
@@ -484,23 +552,35 @@ def post_forward(self, module, input, output):
         '''
         hook_ref = weakref.ref(self)
 
+        single = not isinstance(output, tuple)
+        if single:
+            output = (output,)
+
+        grad_fn, post_output, output_tensor_mask = self._inject_grad_fn(output)
+        if grad_fn is None:
+            return None
+
         @functools.wraps(self.pre_backward)
         def wrapper(grad_input, grad_output):
             hook = hook_ref()
             if hook is not None and hook.active:
-                return hook.pre_backward(module, grad_input, grad_output)
+                return hook.pre_backward(
+                    module,
+                    grad_input,
+                    tuple(uncompress(
+                        repeat(None),
+                        output_tensor_mask,
+                        grad_output
+                    ))
+                )
             return None
 
-        if not isinstance(output, tuple):
-            output = (output,)
+        # register the output tensor gradient hook
+        self.tensor_handles.append(grad_fn.register_hook(wrapper))
 
-        # only if gradient required
-        if output[0].grad_fn is not None:
-            # register the output tensor gradient hook
-            self.tensor_handles.append(
-                output[0].grad_fn.register_hook(wrapper)
-            )
-        return output[0] if len(output) == 1 else output
+        if single:
+            return post_output[0]
+        return post_output
 
     def pre_backward(self, module, grad_input, grad_output):
         '''Store the grad_output for the backward hook.
@@ -516,15 +596,17 @@ def pre_backward(self, module, grad_input, grad_output):
         '''
         self.stored_tensors['grad_output'] = grad_output
 
-    def forward(self, module, input, output):
+    def forward(self, module, args, kwargs, output):
         '''Hook applied during forward-pass.
 
         Parameters
         ----------
         module: :py:obj:`torch.nn.Module`
             The module to which this hook is attached.
-        input: :py:obj:`torch.Tensor`
-            The input tensor.
+        args: tuple of :py:obj:`torch.Tensor`
+            The input tensors passed to ``module.forward``.
+        kwargs: dict
+            The keyword arguments passed to ``module.forward``.
         output: :py:obj:`torch.Tensor`
             The output tensor.
         '''
@@ -573,11 +655,34 @@ def register(self, module):
             A list of removable handles, one for each registered hook.
 
         '''
+        def with_kwargs(method, has_output=True):
+            '''Check whether the method uses args/kwargs, or only inputs. This ensures compatibility with rules
+            that do not consider kwargs, and reduces code clutter.
+
+            Parameters
+            ----------
+            method: function
+                Function to check.
+            has_output: bool
+                Whether `method` expects the module output as its final argument.
+
+            Returns
+            -------
+            bool
+                True if `method` uses kwargs.
+            '''
+            params = signature(method).parameters
+            # assume with_kwargs if forward has not 3 parameters and 3rd is not called 'output'
+            if has_output:
+                return len(params) != 3 and list(params)[2] != 'output'
+            # e.g., pre_forward has no output, so we expect 2 parameters
+            return len(params) != 2
+
         return RemovableHandleList([
             RemovableHandle(self),
-            module.register_forward_pre_hook(self.pre_forward),
-            module.register_forward_hook(self.post_forward),
-            module.register_forward_hook(self.forward),
+            module.register_forward_pre_hook(self.pre_forward, with_kwargs=with_kwargs(self.pre_forward, False)),
+            module.register_forward_hook(self.post_forward, with_kwargs=with_kwargs(self.post_forward)),
+            module.register_forward_hook(self.forward, with_kwargs=with_kwargs(self.forward)),
         ])
 
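A small illustration of the signature check performed by the ``with_kwargs`` helper above (the function names are hypothetical): a hook method is treated as keyword-argument aware unless it has exactly three parameters with the third named ``output``.

from inspect import signature


def forward_plain(module, input, output):
    '''Legacy signature without keyword arguments.'''


def forward_kwargs(module, args, kwargs, output):
    '''New signature with keyword arguments.'''


def uses_kwargs(method):
    # mirrors with_kwargs(method, has_output=True) from Hook.register
    params = signature(method).parameters
    return len(params) != 3 and list(params)[2] != 'output'


assert not uses_kwargs(forward_plain)   # registered with with_kwargs=False
assert uses_kwargs(forward_kwargs)      # registered with with_kwargs=True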

@@ -645,19 +750,22 @@ def __init__(
         self.gradient_mapper = gradient_mapper
         self.reducer = reducer
 
-    def forward(self, module, input, output):
+    def forward(self, module, args, kwargs, output):
         '''Forward hook to save module in-/outputs.
 
         Parameters
         ----------
         module: :py:obj:`torch.nn.Module`
             The module to which this hook is attached.
-        input: :py:obj:`torch.Tensor`
-            The input tensor.
+        args: tuple of :py:obj:`torch.Tensor`
+            The input tensors passed to ``module.forward``.
+        kwargs: dict
+            The keyword arguments passed to ``module.forward``.
         output: :py:obj:`torch.Tensor`
             The output tensor.
         '''
-        self.stored_tensors['input'] = input
+        self.stored_tensors['input'] = args
+        self.stored_tensors['kwargs'] = kwargs
 
     def backward(self, module, grad_input, grad_output):
         '''Backward hook to compute LRP based on the class attributes.
@@ -676,13 +784,15 @@ def backward(self, module, grad_input, grad_output):
         tuple of :py:obj:`torch.nn.Module`
             The modified input gradient tensors.
         '''
-        original_input = self.stored_tensors['input'][0].clone()
+        original_input, *original_args = self.stored_tensors['input']
+        original_input = original_input.clone()
+        original_kwargs = self.stored_tensors['kwargs']
         inputs = []
         outputs = []
         for in_mod, param_mod, out_mod in zip(self.input_modifiers, self.param_modifiers, self.output_modifiers):
             input = in_mod(original_input).requires_grad_()
             with ParamMod.ensure(param_mod)(module) as modified, torch.autograd.enable_grad():
-                output = modified.forward(input)
+                output = modified.forward(input, *original_args, **original_kwargs)
                 output = out_mod(output)
             inputs.append(input)
             outputs.append(output)
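
To sketch the effect on ``BasicHook``-based rules: the stored keyword arguments are now replayed when the modified forward pass is recomputed. The module below is a hypothetical example (not part of this commit) whose forward takes a non-tensor keyword argument; the existing ``Epsilon`` rule is used only for illustration.

import torch
from zennit.rules import Epsilon


class ScaledLinear(torch.nn.Linear):
    '''Hypothetical module whose forward accepts an extra keyword argument.'''

    def forward(self, input, scale=1.0):
        return super().forward(input) * scale


module = ScaledLinear(4, 4)
hook = Epsilon()
handles = hook.register(module)

input = torch.randn(1, 4, requires_grad=True)
# `scale` is stored by BasicHook.forward and replayed inside BasicHook.backward
output = module(input, scale=2.0)
relevance, = torch.autograd.grad(output, input, torch.ones_like(output))

handles.remove()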

tests/unit/.pytest.ini

Lines changed: 3 additions & 0 deletions

@@ -10,3 +10,6 @@ testpaths = tests/unit
 # with output, (a) all except passed (p/P), or (A) all
 # --showlocals: Show local variables in tracebacks
 addopts = -ra --showlocals
+
+markers =
+    extended: do tests with multiple seeds (deselect with '-m "not extended"')

tests/unit/conftest.py

Lines changed: 2 additions & 3 deletions

@@ -43,9 +43,8 @@ def pytest_generate_tests(metafunc):
         scope='session',
         params=[
             0xdeadbeef,
-            0xd0c0ffee,
             *[pytest.param(seed, marks=pytest.mark.extended) for seed in [
-                0xc001bee5, 0xc01dfee7, 0xbe577001, 0xca7b0075, 0x1057b0a7, 0x900ddeed
+                0xd0c0ffee, 0xc001bee5, 0xc01dfee7, 0xbe577001, 0xca7b0075, 0x1057b0a7, 0x900ddeed
             ]],
         ],
         ids=hex
@@ -261,7 +260,7 @@ def partial_name_map_composite(name_map_composite, pyrng):
 
 @pytest.fixture(scope='session')
 def mixed_composite(partial_name_map_composite, special_first_layer_map_composite):
-    '''Fixture to create NameLayerMapComposites based on an explicit NameMapComposite and
+    '''Fixture to create mixtures of explicit NameMapComposite and
     SpecialFirstLayerMapComposites.
     '''
     composites = [partial_name_map_composite, special_first_layer_map_composite]
