
Commit 7a95b96

test(autogram): Add a way to test against no cross-terms (#467)
* Add CloneParams context to consider each parameter usage on a per-module-usage basis.
* Add _get_losses_and_params_with_cross_terms, _get_losses_and_params_without_cross_terms, and _get_losses_and_params to select between both.
1 parent 6ea78c0 commit 7a95b96
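
For context, the "cross-terms" in question are the Gramian contributions that couple two different usages of the same (shared) parameter: when a weight is used twice, the gradient of each loss with respect to that weight is the sum of its two per-usage gradients, so the resulting Gramian contains usage-A/usage-B cross products, and cloning the parameter per usage drops them. Below is a minimal, self-contained sketch of that distinction; the toy model and names are made up and not part of the commit.

# Toy illustration of "cross-terms" in a loss Gramian, using plain PyTorch only.
import torch
from torch import nn
from torch.autograd import grad
from torch.nn.functional import linear

torch.manual_seed(0)
shared = nn.Linear(3, 3, bias=False)  # this weight is used twice below
x = torch.randn(4, 3)

# With cross-terms: differentiate w.r.t. the single shared weight.
losses = (shared(shared(x)) ** 2).mean(dim=1)  # one loss per sample
rows = [grad(loss, shared.weight, retain_graph=True)[0].flatten() for loss in losses]
jac = torch.stack(rows)
gramian_with = jac @ jac.T  # includes usage-A/usage-B cross products

# Without cross-terms: one clone of the weight per usage, which is what CloneParams
# automates per module call.
w1 = shared.weight.detach().clone().requires_grad_()
w2 = shared.weight.detach().clone().requires_grad_()
losses = (linear(linear(x, w1), w2) ** 2).mean(dim=1)
rows = [torch.cat([g.flatten() for g in grad(loss, [w1, w2], retain_graph=True)]) for loss in losses]
jac = torch.stack(rows)
gramian_without = jac @ jac.T  # only usage-A/usage-A and usage-B/usage-B blocks

print(gramian_with - gramian_without)  # generally non-zero: exactly the cross-terms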

File tree: 2 files changed (+108 −5 lines)

tests/unit/autogram/test_engine.py

Lines changed: 35 additions & 5 deletions
@@ -6,9 +6,10 @@
 import torch
 from pytest import mark, param
 from torch import Tensor
-from torch.nn import BatchNorm2d, InstanceNorm2d, Linear
+from torch.nn import BatchNorm2d, InstanceNorm2d, Linear, Module, Parameter
 from torch.optim import SGD
 from torch.testing import assert_close
+from torch.utils._pytree import PyTree
 from utils.architectures import (
     AlexNet,
     Cifar10Model,
@@ -64,6 +65,7 @@
 )
 from utils.dict_assertions import assert_tensor_dicts_are_close
 from utils.forward_backwards import (
+    CloneParams,
     autograd_forward_backward,
     autogram_forward_backward,
     compute_gramian,
@@ -148,15 +150,42 @@ def _assert_gramian_is_equivalent_to_autograd(
     inputs, targets = make_inputs_and_targets(model_autograd, batch_size)
     loss_fn = make_mse_loss_fn(targets)

-    losses = forward_pass(model_autograd, inputs, loss_fn, reduce_to_vector)
-    autograd_gramian = compute_gramian_with_autograd(losses, list(model_autograd.parameters()))
+    losses, params = _get_losses_and_params(model_autograd, inputs, loss_fn, reduce_to_vector)
+    autograd_gramian = compute_gramian_with_autograd(losses, params)

     losses = forward_pass(model_autogram, inputs, loss_fn, reduce_to_vector)
     autogram_gramian = engine.compute_gramian(losses)

     assert_close(autogram_gramian, autograd_gramian, rtol=1e-4, atol=3e-5)


+def _get_losses_and_params_with_cross_terms(
+    model: Module,
+    inputs: PyTree,
+    loss_fn: Callable[[PyTree], list[Tensor]],
+    reduction: Callable[[list[Tensor]], Tensor],
+) -> tuple[Tensor, list[Parameter]]:
+    losses = forward_pass(model, inputs, loss_fn, reduction)
+    params = list(model.parameters())
+    return losses, params
+
+
+def _get_losses_and_params_without_cross_terms(
+    model: Module,
+    inputs: PyTree,
+    loss_fn: Callable[[PyTree], list[Tensor]],
+    reduction: Callable[[list[Tensor]], Tensor],
+) -> tuple[Tensor, list[Parameter]]:
+    # Not considering cross-terms (except intra-module parameter reuse):
+    with CloneParams(model) as params:
+        losses = forward_pass(model, inputs, loss_fn, reduction)
+
+    return losses, params
+
+
+_get_losses_and_params = _get_losses_and_params_with_cross_terms
+
+
 @mark.parametrize(["factory", "batch_size"], PARAMETRIZATIONS)
 @mark.parametrize("batch_dim", [0, None])
 def test_compute_gramian(factory: ModuleFactory, batch_size: int, batch_dim: int | None):
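
The `_get_losses_and_params` alias at the end of this hunk is what the equivalence tests call; as committed it keeps the historical behaviour (cross-terms included). A hedged sketch of the toggle (this reassignment is not part of the commit) that would make the same tests compare against autograd Gramians computed on per-usage parameter clones:

# Hypothetical toggle: run the equivalence tests without cross-terms between
# different usages of a shared parameter.
_get_losses_and_params = _get_losses_and_params_without_cross_terms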
@@ -250,11 +279,11 @@ def test_compute_gramian_various_output_shapes(
     inputs, targets = make_inputs_and_targets(model_autograd, batch_size)
     loss_fn = make_mse_loss_fn(targets)

-    losses = forward_pass(model_autograd, inputs, loss_fn, reduction)
+    losses, params = _get_losses_and_params(model_autograd, inputs, loss_fn, reduction)
     reshaped_losses = torch.movedim(losses, movedim_source, movedim_destination)
     # Go back to a vector so that compute_gramian_with_autograd works
     loss_vector = reshaped_losses.reshape([-1])
-    autograd_gramian = compute_gramian_with_autograd(loss_vector, list(model_autograd.parameters()))
+    autograd_gramian = compute_gramian_with_autograd(loss_vector, params)
     expected_gramian = reshape_gramian(autograd_gramian, list(reshaped_losses.shape))

     engine = Engine(model_autogram, batch_dim=batch_dim)
@@ -289,6 +318,7 @@ def test_compute_partial_gramian(gramian_module_names: set[str], batch_dim: int
     for m in gramian_modules:
         gramian_params += list(m.parameters())

+    # This includes cross-terms, but the model has no parameter reuse.
     losses = forward_pass(model, inputs, loss_fn, reduce_to_vector)
     autograd_gramian = compute_gramian_with_autograd(losses, gramian_params, retain_graph=True)

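Why the comment added above is sufficient here: without parameter reuse each parameter has exactly one usage, so its per-usage gradient equals its ordinary gradient and the Gramians with and without cross-terms coincide; the partial-Gramian test therefore does not need the CloneParams path.
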
tests/utils/forward_backwards.py

Lines changed: 73 additions & 0 deletions
@@ -4,6 +4,7 @@
 from torch import Tensor, nn, vmap
 from torch.nn.functional import mse_loss
 from torch.utils._pytree import PyTree, tree_flatten, tree_map
+from torch.utils.hooks import RemovableHandle
 from utils.architectures import get_in_out_shapes
 from utils.contexts import fork_rng

@@ -144,3 +145,75 @@ def compute_gramian(matrix: Tensor) -> Tensor:
     indices = list(range(matrix.ndim))
     transposed_matrix = matrix.movedim(indices, indices[::-1])
     return torch.tensordot(matrix, transposed_matrix, dims=([-1], [0]))
+
+
+class CloneParams:
+    """
+    ContextManager enabling the computation of per-usage gradients.
+
+    For each submodule with direct trainable parameters, registers:
+    - A pre-hook that clones the params before using them, so that gradients will be computed with
+      respect to the cloned params.
+    - A post-hook that restores the original params.
+
+    The list of clones is returned so that we know where to find the .grad values corresponding to
+    each individual usage of a parameter.
+
+    Exiting this context manager takes care of removing hooks and restoring the original params (in
+    case an exception occurred before the post-hook could do it).
+
+    Note that this does not work for intra-module parameter reuse, which would require a node-based
+    algorithm rather than a module-based algorithm.
+    """
+
+    def __init__(self, model: nn.Module):
+        self.model = model
+        self.clones = list[nn.Parameter]()
+        self._module_to_original_params = dict[nn.Module, dict[str, nn.Parameter]]()
+        self._handles: list[RemovableHandle] = []
+
+    def __enter__(self) -> list[nn.Parameter]:
+        """Register hooks and return the list of cloned parameters."""
+
+        def pre_hook(module: nn.Module, _) -> None:
+            self._module_to_original_params[module] = {}
+            for name, param in module.named_parameters():
+                if param is None or not param.requires_grad:
+                    continue
+                self._module_to_original_params[module][name] = param
+                clone = nn.Parameter(param.detach().clone().requires_grad_())
+                self._set_module_param(module, name, clone)
+                self.clones.append(clone)
+
+        def post_hook(module: nn.Module, _, __) -> None:
+            self._restore_original_params(module)
+
+        # Register hooks on all modules with direct trainable params
+        for mod in self.model.modules():
+            if any(p.requires_grad for p in mod.parameters(recurse=False)):
+                self._handles.append(mod.register_forward_pre_hook(pre_hook))
+                self._handles.append(mod.register_forward_hook(post_hook))
+
+        return self.clones
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Remove hooks and restore parameters."""
+        for handle in self._handles:
+            handle.remove()
+        for module in self.model.modules():
+            self._restore_original_params(module)
+
+        return False  # don't suppress exceptions
+
+    def _restore_original_params(self, module: nn.Module):
+        original_params = self._module_to_original_params.pop(module, {})
+        for name, param in original_params.items():
+            self._set_module_param(module, name, param)
+
+    @staticmethod
+    def _set_module_param(module: nn.Module, name: str, param: nn.Parameter) -> None:
+        name_parts = name.split(".")
+        for module_name in name_parts[:-1]:
+            module = module.get_submodule(module_name)
+        param_name = name_parts[-1]
+        setattr(module, param_name, param)
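
A minimal usage sketch of the class added above. It assumes CloneParams is importable from utils.forward_backwards, as in this test suite; the TwiceApplied toy model and all names below are made up for illustration, not part of the repo.

# Hypothetical sketch: per-usage gradients for a submodule that is called twice.
import torch
from torch import nn

from utils.forward_backwards import CloneParams  # import path as used by the tests


class TwiceApplied(nn.Module):
    """Toy model (not from the repo) that calls the same Linear twice."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(3, 3, bias=False)

    def forward(self, x):
        return self.layer(self.layer(x))


model = TwiceApplied()
x = torch.randn(4, 3)

with CloneParams(model) as clones:
    loss = model(x).square().mean()

loss.backward()

# The pre-hook fired once per call of `layer`, so there is one clone per usage,
# each holding the gradient of that individual usage.
assert len(clones) == 2
per_usage_grads = [clone.grad for clone in clones]

# The ordinary gradient on the shared weight is the sum of the per-usage gradients.
model.zero_grad()
model(x).square().mean().backward()
torch.testing.assert_close(model.layer.weight.grad, per_usage_grads[0] + per_usage_grads[1])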
