Instance-Wise Multi-Task Learning (IWMTL)
=========================================

When training a model on multiple tasks, the gradients of the individual tasks are likely to
conflict. This is particularly true when looking at the individual (per-sample) gradients.
The :doc:`autogram engine <../docs/autogram/engine>` can be used to efficiently compute the Gramian
of the Jacobian of the matrix of per-sample and per-task losses. Weights can then be extracted from
this Gramian to reweight the gradients and resolve the conflict entirely.

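
As a rough sketch of the underlying idea, write :math:`L \in \mathbb{R}^{b \times t}` for the
matrix of losses of a batch of :math:`b` samples on :math:`t` tasks, and
:math:`J \in \mathbb{R}^{bt \times n}` for the Jacobian of its flattened entries with respect to
the :math:`n` shared parameters (these symbols are introduced here only for illustration). The
Gramian is then

.. math::
    G = J J^\top \in \mathbb{R}^{bt \times bt},

where each entry is the inner product between two of the individual gradients; a negative entry
signals a conflict. A weighting such as UPGrad uses :math:`G` to choose weights
:math:`w \in \mathbb{R}^{bt}` such that the combined update direction :math:`J^\top w` has a
non-negative inner product with every individual gradient.
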

The following example shows how to do that.

.. code-block:: python
    :emphasize-lines: 5-6, 18-20, 31-32, 34-35, 37-38, 41-42

    import torch
    from torch.nn import Linear, MSELoss, ReLU, Sequential
    from torch.optim import SGD

    from torchjd.aggregation import Flattening, UPGradWeighting
    from torchjd.autogram import Engine

    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
    task1_module = Linear(3, 1)
    task2_module = Linear(3, 1)
    params = [
        *shared_module.parameters(),
        *task1_module.parameters(),
        *task2_module.parameters(),
    ]

    optimizer = SGD(params, lr=0.1)
    mse = MSELoss(reduction="none")
    weighting = Flattening(UPGradWeighting())
    engine = Engine(shared_module.modules(), batch_dim=0)

    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
    task1_targets = torch.randn(8, 16)  # 8 batches of 16 targets for the first task
    task2_targets = torch.randn(8, 16)  # 8 batches of 16 targets for the second task

    for input, target1, target2 in zip(inputs, task1_targets, task2_targets):
        features = shared_module(input)  # shape: [16, 3]
        out1 = task1_module(features).squeeze(1)  # shape: [16]
        out2 = task2_module(features).squeeze(1)  # shape: [16]

        # Compute the matrix of losses: one loss per element of the batch and per task
        losses = torch.stack([mse(out1, target1), mse(out2, target2)], dim=1)  # shape: [16, 2]

        # Compute the gramian (inner products between pairs of gradients of the losses)
        gramian = engine.compute_gramian(losses)  # shape: [16, 2, 2, 16]

        # Obtain the weights that lead to no conflict between reweighted gradients
        weights = weighting(gramian)  # shape: [16, 2]

        optimizer.zero_grad()
        # Do the standard backward pass, but weighted using the obtained weights
        losses.backward(weights)
        optimizer.step()

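
Because each entry of the Gramian is an inner product between two per-sample, per-task gradients,
it can also be inspected directly, for instance to measure how much conflict there is in a batch
before reweighting. The sketch below does this with a hypothetical helper,
``count_conflicting_pairs``, which is not part of torchjd.

.. code-block:: python

    import torch

    def count_conflicting_pairs(gramian: torch.Tensor) -> int:
        """Count pairs of individual gradients whose inner product is negative.

        ``gramian`` is expected to have the [b, t, t, b] shape returned by
        ``engine.compute_gramian`` for a [b, t] matrix of losses. Diagonal entries are
        squared norms (never negative) and, by symmetry, each conflicting pair of
        gradients shows up twice, so halving the count of negative entries gives the
        number of conflicting pairs.
        """
        return int((gramian < 0.0).sum().item()) // 2

    # Tiny hand-made Gramian: one sample, two tasks, gradient inner product -0.5.
    # In the training loop above, the same helper could be applied to ``gramian``.
    g = torch.tensor([[1.0, -0.5], [-0.5, 2.0]]).reshape(1, 2, 2, 1)
    print(count_conflicting_pairs(g))  # 1
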

.. note::
    In this example, the tensor of losses is a matrix rather than a vector. The Gramian is thus a
    4D tensor rather than a matrix, and a
    :class:`~torchjd.aggregation._weighting_bases.GeneralizedWeighting`, such as
    :class:`~torchjd.aggregation._flattening.Flattening`, has to be used to extract a matrix of
    weights from it. More information about ``GeneralizedWeighting`` can be found on the
    :doc:`../../docs/aggregation/index` page.

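
To build intuition for what :class:`~torchjd.aggregation._flattening.Flattening` provides, one can
think of it as flattening the generalized Gramian into an ordinary square Gramian, applying the
wrapped weighting to that matrix, and reshaping the resulting vector of weights back into a matrix.
The sketch below only illustrates this picture: it assumes that the last two dimensions of the 4D
Gramian mirror the first two in reverse order, the helper ``flatten_then_weight`` is hypothetical,
and none of this is torchjd's actual implementation.

.. code-block:: python

    from typing import Callable

    import torch

    def flatten_then_weight(
        gramian: torch.Tensor,
        matrix_weighting: Callable[[torch.Tensor], torch.Tensor],
    ) -> torch.Tensor:
        """Illustrative reduction of a [b, t, t, b] Gramian to a standard weighting problem.

        ``matrix_weighting`` stands for anything that maps a square [m, m] Gramian to a
        vector of m weights.
        """
        b, t = gramian.shape[0], gramian.shape[1]
        # Reorder the axes so that rows and columns both index (sample, task) pairs,
        # then collapse them into a [b * t, b * t] matrix.
        square = gramian.permute(0, 1, 3, 2).reshape(b * t, b * t)
        weights = matrix_weighting(square)  # shape: [b * t]
        return weights.reshape(b, t)  # one weight per sample and per task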