
Commit 87b4da0

Simplify engine:

* Remove FunctionalJacobianComputer.
* Remove args and kwargs from the interface of JacobianComputer, GramianComputer and JacobianAccumulator, because they were only needed for the functional interface.
* Remove kwargs from the interface of Hook and stop registering it with with_kwargs=True (args are still mandatory, so rename them to _).
* Change JacobianComputer to compute generalized Jacobians (shape [m0, ..., mk, n]) and change GramianComputer to compute optional generalized Gramians (shape [m0, ..., mk, mk, ..., m0]).
* Change engine.compute_gramian to always simply do one vmap level per dimension of the output, without caring about the batch_dim.
* Remove all reshapes and movedims in engine.compute_gramian: we don't need reshape anymore since the Gramian is directly a generalized Gramian, and we don't need movedim anymore since we vmap over all dimensions in the same way, without having to put the non-batched dim in front. Merge compute_gramian and _compute_square_gramian.
* Use a DiagonalSparseTensor as the initial jac_output of compute_gramian.
1 parent: efa8019 · commit: 87b4da0

File tree

4 files changed (+50, -242 lines)
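To make the shape conventions from the commit message concrete, here is a small standalone sketch (not code from this commit; the sizes m0 = 3, m1 = 4, n = 5 are arbitrary) of how a generalized Jacobian of shape [m0, ..., mk, n] is contracted over its parameter dimension into a generalized Gramian of shape [m0, ..., mk, mk, ..., m0], mirroring the _to_gramian change below:

import torch

# Hypothetical sizes: output dimensions m0 = 3, m1 = 4, and n = 5 flattened parameters.
m0, m1, n = 3, 4, 5
jacobian = torch.randn(m0, m1, n)  # generalized Jacobian, shape [m0, m1, n]

# Contract the parameter dimension against a fully reversed copy, as the new _to_gramian does.
indices = list(range(jacobian.ndim))
transposed = jacobian.movedim(indices, indices[::-1])  # shape [n, m1, m0]
gramian = torch.tensordot(jacobian, transposed, dims=([-1], [0]))  # shape [m0, m1, m1, m0]

# Sanity check: this equals the ordinary matrix Gramian, up to reshaping and reordering dims.
flat = jacobian.reshape(m0 * m1, n)
expected = (flat @ flat.T).reshape(m0, m1, m0, m1).permute(0, 1, 3, 2)
assert torch.allclose(gramian, expected)

The reversed ordering of the second half of the Gramian's dimensions comes from reversing all dimensions of the Jacobian before the contraction.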

src/torchjd/autogram/_engine.py

Lines changed: 24 additions & 70 deletions
@@ -7,13 +7,9 @@
 from ._edge_registry import EdgeRegistry
 from ._gramian_accumulator import GramianAccumulator
 from ._gramian_computer import GramianComputer, JacobianBasedGramianComputerWithCrossTerms
-from ._gramian_utils import movedim_gramian, reshape_gramian
-from ._jacobian_computer import (
-    AutogradJacobianComputer,
-    FunctionalJacobianComputer,
-    JacobianComputer,
-)
+from ._jacobian_computer import AutogradJacobianComputer
 from ._module_hook_manager import ModuleHookManager
+from .diagonal_sparse_tensor import DiagonalSparseTensor
 
 _MODULES_INCOMPATIBLE_WITH_BATCHED = (
     nn.BatchNorm1d,
@@ -202,11 +198,7 @@ def _hook_module_recursively(self, module: nn.Module) -> None:
             self._hook_module_recursively(child)
 
     def _make_gramian_computer(self, module: nn.Module) -> GramianComputer:
-        jacobian_computer: JacobianComputer
-        if self._batch_dim is not None:
-            jacobian_computer = FunctionalJacobianComputer(module)
-        else:
-            jacobian_computer = AutogradJacobianComputer(module)
+        jacobian_computer = AutogradJacobianComputer(module)
         gramian_computer = JacobianBasedGramianComputerWithCrossTerms(jacobian_computer)
 
         return gramian_computer
@@ -261,33 +253,31 @@ def compute_gramian(self, output: Tensor) -> Tensor:
         - etc.
         """
 
-        if self._batch_dim is not None:
-            # move batched dim to the end
-            ordered_output = output.movedim(self._batch_dim, -1)
-            ordered_shape = list(ordered_output.shape)
-            batch_size = ordered_shape[-1]
-            has_non_batch_dim = len(ordered_shape) > 1
-            target_shape = [batch_size]
-        else:
-            ordered_output = output
-            ordered_shape = list(ordered_output.shape)
-            has_non_batch_dim = len(ordered_shape) > 0
-            target_shape = []
+        self._module_hook_manager.gramian_accumulation_phase.value = True
 
-        if has_non_batch_dim:
-            target_shape = [-1] + target_shape
+        try:
+            leaf_targets = list(self._target_edges.get_leaf_edges({get_gradient_edge(output)}))
+
+            def differentiation(_grad_output: Tensor) -> tuple[Tensor, ...]:
+                return torch.autograd.grad(
+                    outputs=output,
+                    inputs=leaf_targets,
+                    grad_outputs=_grad_output,
+                    retain_graph=True,
+                )
 
-        reshaped_output = ordered_output.reshape(target_shape)
-        # There are four different cases for the shape of reshaped_output:
-        # - Not batched and not non-batched: scalar of shape []
-        # - Batched only: vector of shape [batch_size]
-        # - Non-batched only: vector of shape [dim]
-        # - Batched and non-batched: matrix of shape [dim, batch_size]
+            output_dims = list(range(output.ndim))
+            jac_output = DiagonalSparseTensor(torch.ones_like(output), output_dims * 2)
 
-        self._module_hook_manager.gramian_accumulation_phase.value = True
+            vmapped_diff = differentiation
+            for _ in output_dims:
+                vmapped_diff = vmap(vmapped_diff)
 
-        try:
-            square_gramian = self._compute_square_gramian(reshaped_output, has_non_batch_dim)
+            _ = vmapped_diff(jac_output)
+
+            # If the gramian were None, then leaf_targets would be empty, so autograd.grad would
+            # have failed. So gramian is necessarily a valid Tensor here.
+            gramian = cast(Tensor, self._gramian_accumulator.gramian)
         finally:
             # Reset everything that has a state, even if the previous call raised an exception
            self._module_hook_manager.gramian_accumulation_phase.value = False
@@ -296,40 +286,4 @@ def compute_gramian(self, output: Tensor) -> Tensor:
             for gramian_computer in self._gramian_computers.values():
                 gramian_computer.reset()
 
-        unordered_gramian = reshape_gramian(square_gramian, ordered_shape)
-
-        if self._batch_dim is not None:
-            gramian = movedim_gramian(unordered_gramian, [-1], [self._batch_dim])
-        else:
-            gramian = unordered_gramian
-
-        return gramian
-
-    def _compute_square_gramian(self, output: Tensor, has_non_batch_dim: bool) -> Tensor:
-        leaf_targets = list(self._target_edges.get_leaf_edges({get_gradient_edge(output)}))
-
-        def differentiation(_grad_output: Tensor) -> tuple[Tensor, ...]:
-            return torch.autograd.grad(
-                outputs=output,
-                inputs=leaf_targets,
-                grad_outputs=_grad_output,
-                retain_graph=True,
-            )
-
-        if has_non_batch_dim:
-            # There is one non-batched dimension, it is the first one
-            non_batch_dim_len = output.shape[0]
-            identity_matrix = torch.eye(non_batch_dim_len, device=output.device, dtype=output.dtype)
-            ones = torch.ones_like(output[0])
-            jac_output = torch.einsum("ij, ... -> ij...", identity_matrix, ones)
-
-            _ = vmap(differentiation)(jac_output)
-        else:
-            grad_output = torch.ones_like(output)
-            _ = differentiation(grad_output)
-
-        # If the gramian were None, then leaf_targets would be empty, so autograd.grad would
-        # have failed. So gramian is necessarily a valid Tensor here.
-        gramian = cast(Tensor, self._gramian_accumulator.gramian)
-
         return gramian
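For illustration, a minimal standalone sketch of the new compute_gramian pattern, with a dense identity-like tensor standing in for DiagonalSparseTensor (assuming that class represents a tensor with ones on its generalized diagonal), and assuming, as the engine above does, that the PyTorch version in use supports calling torch.autograd.grad under vmap for the ops involved:

import torch

x = torch.randn(3, 4, requires_grad=True)
output = (2 * x).sin()  # toy "model output" of shape [3, 4]

def differentiation(grad_output: torch.Tensor) -> torch.Tensor:
    return torch.autograd.grad(output, x, grad_outputs=grad_output, retain_graph=True)[0]

# Dense stand-in for DiagonalSparseTensor(torch.ones_like(output), [0, 1, 0, 1]):
# jac_output[i, j] is the one-hot tensor that selects output element (i, j).
jac_output = torch.eye(output.numel()).reshape(3, 4, 3, 4)

vmapped_diff = differentiation
for _ in range(output.ndim):  # one vmap level per output dimension
    vmapped_diff = torch.vmap(vmapped_diff)

jacobian = vmapped_diff(jac_output)  # shape [3, 4, 3, 4]: d(output) / d(x), output dims in front

Because jac_output one-hot-selects every output element and each vmap level prepends one output dimension to the result, no reshape or movedim is needed afterwards, which is what the removed code above used to handle.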

src/torchjd/autogram/_gramian_computer.py

Lines changed: 12 additions & 13 deletions
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Optional
 
+import torch
 from torch import Tensor
-from torch.utils._pytree import PyTree
 
 from torchjd.autogram._jacobian_computer import JacobianComputer
 
@@ -13,8 +13,6 @@ def __call__(
         self,
         rg_outputs: tuple[Tensor, ...],
         grad_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
     ) -> Optional[Tensor]:
         """Compute what we can for a module and optionally return the gramian if it's ready."""
 
@@ -30,8 +28,12 @@ def __init__(self, jacobian_computer):
         self.jacobian_computer = jacobian_computer
 
     @staticmethod
-    def _to_gramian(jacobian: Tensor) -> Tensor:
-        return jacobian @ jacobian.T
+    def _to_gramian(matrix: Tensor) -> Tensor:
+        """Contracts the last dimension of matrix to make it into a Gramian."""
+
+        indices = list(range(matrix.ndim))
+        transposed_matrix = matrix.movedim(indices, indices[::-1])
+        return torch.tensordot(matrix, transposed_matrix, dims=([-1], [0]))
 
 
 class JacobianBasedGramianComputerWithCrossTerms(JacobianBasedGramianComputer):
@@ -53,20 +55,17 @@ def track_forward_call(self) -> None:
         self.remaining_counter += 1
 
     def __call__(
-        self,
-        rg_outputs: tuple[Tensor, ...],
-        grad_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
+        self, rg_outputs: tuple[Tensor, ...], grad_outputs: tuple[Tensor, ...]
     ) -> Optional[Tensor]:
         """Compute what we can for a module and optionally return the gramian if it's ready."""
 
-        jacobian_matrix = self.jacobian_computer(rg_outputs, grad_outputs, args, kwargs)
+        batched_jacobian = self.jacobian_computer(rg_outputs, grad_outputs)
+        jacobian = torch.func.debug_unwrap(batched_jacobian, recurse=True)
 
         if self.summed_jacobian is None:
-            self.summed_jacobian = jacobian_matrix
+            self.summed_jacobian = jacobian
        else:
-            self.summed_jacobian += jacobian_matrix
+            self.summed_jacobian += jacobian
 
         self.remaining_counter -= 1
 
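As a quick note on why JacobianBasedGramianComputerWithCrossTerms sums the per-call Jacobians before contracting them (rather than summing per-call Gramians): the Gramian of the summed Jacobian contains the cross terms between repeated calls to the same module. A tiny standalone check with plain matrices (not library code):

import torch

# Jacobians of the same module's parameters from two forward calls (arbitrary sizes).
J1, J2 = torch.randn(3, 5), torch.randn(3, 5)

gramian_of_sum = (J1 + J2) @ (J1 + J2).T   # what summing Jacobians first gives
sum_of_gramians = J1 @ J1.T + J2 @ J2.T    # what summing per-call Gramians would give
cross_terms = J1 @ J2.T + J2 @ J1.T        # the difference between the two
assert torch.allclose(gramian_of_sum, sum_of_gramians + cross_terms)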

src/torchjd/autogram/_jacobian_computer.py

Lines changed: 7 additions & 138 deletions
@@ -1,11 +1,9 @@
-from abc import ABC, abstractmethod
-from collections.abc import Callable
-from typing import cast
+from abc import ABC
 
 import torch
 from torch import Tensor, nn
 from torch.nn import Parameter
-from torch.utils._pytree import PyTree, tree_flatten, tree_map, tree_map_only
+from torch.utils._pytree import tree_flatten
 
 # Note about import from protected _pytree module:
 # PyTorch maintainers plan to make pytree public (see
@@ -25,112 +23,26 @@ class JacobianComputer(ABC):
 
     def __init__(self, module: nn.Module):
         self.module = module
-
         self.rg_params = dict[str, Parameter]()
-        self.frozen_params = dict[str, Parameter]()
 
         for name, param in module.named_parameters(recurse=True):
             if param.requires_grad:
                 self.rg_params[name] = param
-            else:
-                self.frozen_params[name] = param
-
-    def __call__(
-        self,
-        rg_outputs: tuple[Tensor, ...],
-        grad_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
-    ) -> Tensor:
-        # This makes __call__ vmappable.
-        return ComputeModuleJacobians.apply(
-            self._compute_jacobian, rg_outputs, grad_outputs, args, kwargs
-        )
 
-    @abstractmethod
-    def _compute_jacobian(
-        self,
-        rg_outputs: tuple[Tensor, ...],
-        grad_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
-    ) -> Tensor:
+    def __call__(self, rg_outputs: tuple[Tensor, ...], grad_outputs: tuple[Tensor, ...]) -> Tensor:
         """
-        Computes and returns the Jacobian. The output must be a matrix (2D Tensor).
+        Computes and returns the Jacobian. The output must be a generalized Jacobian with param
+        dimensions grouped.
         """
 
 
-class FunctionalJacobianComputer(JacobianComputer):
-    """
-    JacobianComputer using the functional differentiation API. This requires to use vmap, so it's
-    not compatible with every module, and it requires to have an extra forward pass to create the
-    vjp function.
-    """
-
-    def _compute_jacobian(
-        self,
-        _: tuple[Tensor, ...],
-        grad_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
-    ) -> Tensor:
-        grad_outputs_in_dims = (0,) * len(grad_outputs)
-        args_in_dims = tree_map(lambda t: 0 if isinstance(t, Tensor) else None, args)
-        kwargs_in_dims = tree_map(lambda t: 0 if isinstance(t, Tensor) else None, kwargs)
-        in_dims = (grad_outputs_in_dims, args_in_dims, kwargs_in_dims)
-        vmapped_vjp = torch.vmap(self._call_on_one_instance, in_dims=in_dims)
-
-        return vmapped_vjp(grad_outputs, args, kwargs)
-
-    def _call_on_one_instance(
-        self,
-        grad_outputs_j: tuple[Tensor, ...],
-        args_j: tuple[PyTree, ...],
-        kwargs_j: dict[str, PyTree],
-    ) -> Tensor:
-        # Note: we use unsqueeze(0) to turn a single activation (or grad_output) into a
-        # "batch" of 1 activation (or grad_output). This is because some layers (e.g.
-        # nn.Flatten) do not work equivalently if they're provided with a batch or with
-        # an element of a batch. We thus always provide them with batches, just of a
-        # different size.
-        args_j = tree_map_only(torch.Tensor, lambda x: x.unsqueeze(0), args_j)
-        kwargs_j = tree_map_only(torch.Tensor, lambda x: x.unsqueeze(0), kwargs_j)
-        grad_outputs_j_ = tuple(x.unsqueeze(0) for x in grad_outputs_j)
-
-        def functional_model_call(rg_params: dict[str, Parameter]) -> tuple[Tensor, ...]:
-            all_state = [
-                cast(dict[str, Tensor], rg_params),
-                dict(self.module.named_buffers()),
-                cast(dict[str, Tensor], self.frozen_params),
-            ]
-            output = torch.func.functional_call(self.module, all_state, args_j, kwargs_j)
-            flat_outputs = tree_flatten(output)[0]
-            rg_outputs = tuple(t for t in flat_outputs if isinstance(t, Tensor) and t.requires_grad)
-            return rg_outputs
-
-        vjp_func = torch.func.vjp(functional_model_call, self.rg_params)[1]
-
-        # vjp_func is a function that computes the vjp w.r.t. to the primals (tuple). Here the
-        # functional has a single primal which is dict(module.named_parameters()). We therefore take
-        # the 0'th element to obtain the dict of gradients w.r.t. the module's named_parameters.
-        gradients = vjp_func(grad_outputs_j_)[0]
-        gradient = torch.cat([t.reshape(-1) for t in gradients.values()])
-        return gradient
-
-
 class AutogradJacobianComputer(JacobianComputer):
     """
     JacobianComputer using the autograd engine. The main advantage of using this method is that it
     doesn't require making an extra forward pass.
     """
 
-    def _compute_jacobian(
-        self,
-        rg_outputs: tuple[Tensor, ...],
-        grad_outputs: tuple[Tensor, ...],
-        _: tuple[PyTree, ...],
-        __: dict[str, PyTree],
-    ) -> Tensor:
+    def __call__(self, rg_outputs: tuple[Tensor, ...], grad_outputs: tuple[Tensor, ...]) -> Tensor:
        flat_rg_params, ___ = tree_flatten(self.rg_params)
         grads = torch.autograd.grad(
             rg_outputs,
@@ -141,47 +53,4 @@ def _compute_jacobian(
             materialize_grads=True,
         )
         flattened_grads = torch.cat([g.reshape(-1) for g in grads])
-        jacobian = flattened_grads.unsqueeze(0)
-        return jacobian
-
-
-class ComputeModuleJacobians(torch.autograd.Function):
-    @staticmethod
-    def forward(
-        compute_jacobian_fn: Callable[
-            [tuple[Tensor, ...], tuple[Tensor, ...], tuple[PyTree, ...], dict[str, PyTree]], Tensor
-        ],
-        rg_outputs: tuple[Tensor, ...],
-        grad_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
-    ) -> Tensor:
-        # There is no non-batched dimension
-        jacobian = compute_jacobian_fn(rg_outputs, grad_outputs, args, kwargs)
-        return jacobian
-
-    @staticmethod
-    def vmap(
-        _,
-        in_dims: tuple[None, None, tuple[int, ...], None, None],
-        compute_jacobian_fn: Callable,
-        rg_outputs: tuple[Tensor, ...],
-        jac_outputs: tuple[Tensor, ...],
-        args: tuple[PyTree, ...],
-        kwargs: dict[str, PyTree],
-    ) -> tuple[Tensor, None]:
-        # There is a non-batched dimension
-        # We do not vmap over the args, kwargs, or rg_outputs for the non-batched dimension
-        generalized_jacobian = torch.vmap(compute_jacobian_fn, in_dims=in_dims[1:])(
-            rg_outputs,
-            jac_outputs,
-            args,
-            kwargs,
-        )
-        shape = generalized_jacobian.shape
-        jacobian = generalized_jacobian.reshape([shape[0] * shape[1], -1])
-        return jacobian, None
-
-    @staticmethod
-    def setup_context(*_) -> None:
-        pass
+        return flattened_grads
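For context, a minimal standalone sketch (not library code) of what the core of AutogradJacobianComputer.__call__ now returns for a single grad_output: the flattened gradient of the module's requires-grad parameters, i.e. one slice of the generalized Jacobian; the surrounding output dimensions are added by the vmap levels applied in the engine. The module and shapes here are made up for illustration:

import torch
from torch import nn
from torch.utils._pytree import tree_flatten

module = nn.Linear(4, 3)                  # hypothetical module, 3 * 4 + 3 = 15 parameters in total
x = torch.randn(2, 4)
rg_output = module(x)                     # output requiring grad, shape [2, 3]
grad_output = torch.ones_like(rg_output)  # stands in for one slice of the engine's jac_output

rg_params = {name: p for name, p in module.named_parameters() if p.requires_grad}
flat_rg_params, _ = tree_flatten(rg_params)
grads = torch.autograd.grad(
    rg_output,
    flat_rg_params,
    grad_outputs=grad_output,
    retain_graph=True,
    materialize_grads=True,
)
flattened_grads = torch.cat([g.reshape(-1) for g in grads])  # shape [15]: one Jacobian slice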
