@@ -141,47 +141,69 @@ def linear_process_input(x: Tensor, layer: Linear, kfac_approx: str) -> Tensor:
141141
142142
def process_grad_output(
    grad_output: Tensor,
    module: Module,
    loss_average: Union[None, str],
    kfac_approx: str,
) -> Tensor:
    """Reshape output gradients into matrices and apply scaling.

    Args:
        grad_output: The gradient w.r.t. the output of the module.
        module: The module.
        loss_average: Whether the loss function is a mean over per-sample
            losses and if yes, over which dimensions the mean is taken.
            If `"batch"`, the loss function is a mean over as many terms as
            the size of the mini-batch. If `"batch+sequence"`, the loss
            function is a mean over as many terms as the size of the
            mini-batch times the sequence length, e.g. in the case of
            language modeling. If `None`, the loss function is a sum. This
            argument is used to ensure that the preconditioner is scaled
            consistently with the loss and the gradient. Default: `"batch"`.
        kfac_approx: The KFAC approximation to use for linear weight-sharing
            layers. Possible values are `"expand"` and `"reduce"`.

    Returns:
        The processed output gradient.

    Raises:
        AssertionError: If `loss_average` is not `None`, `"batch"`, or
            `"batch+sequence"`.
        AssertionError: If `kfac_approx` is neither `"expand"` nor `"reduce"`.
        NotImplementedError: If the module is not supported.
    """
    assert loss_average in {None, "batch", "batch+sequence"}
    assert kfac_approx in {"expand", "reduce"}

    # No extra gradient scaling is applied at this level; the layer-specific
    # processors adjust it further based on `loss_average`.
    grad_scaling = 1.0

    # Dispatch to the layer-specific processor.
    if isinstance(module, Conv2d):
        processor = conv2d_process_grad_output
    elif isinstance(module, Linear):
        processor = linear_process_grad_output
    else:
        raise NotImplementedError(f"Can't process grad_output for {module}.")

    return processor(grad_output, loss_average, grad_scaling, kfac_approx)
176190def conv2d_process_grad_output (
177- g : Tensor , batch_averaged : bool , scaling : float , kfac_approx : str
191+ g : Tensor , loss_average : Union [ None , str ] , scaling : float , kfac_approx : str
178192) -> Tensor :
179193 """Process the output gradient of a convolution before the self-inner product.
180194
181195 Args:
182196 g: Gradient w.r.t. the output of a convolution. Has shape
183197 `[batch_size, C_out, O1, O2]`.
184- batch_averaged: Whether to multiply with the batch size.
198+ loss_average: Whether the loss function is a mean over per-sample
199+ losses and if yes, over which dimensions the mean is taken.
200+ If `"batch"`, the loss function is a mean over as many terms as
201+ the size of the mini-batch. If `"batch+sequence"`, the loss
202+ function is a mean over as many terms as the size of the
203+ mini-batch times the sequence length, e.g. in the case of
204+ language modeling. If `None`, the loss function is a sum. This
205+ argument is used to ensure that the preconditioner is scaled
206+ consistently with the loss and the gradient. Default: `"batch"`.
185207 scaling: An additional scaling that will be applied to the gradient.
186208 kfac_approx: The KFAC approximation to use. Possible values are
187209 `"expand"` and `"reduce"`.
@@ -190,11 +212,14 @@ def conv2d_process_grad_output(
190212 The processed scaled gradient. Has shape `[batch_size, C_out]` for
191213 `"reduce"` and `[batch_size * O1 * O2, C_out]` for `"expand"`.
192214 """
193- # The scaling by `sqrt(batch_size)` when `batch_averaged=True` assumes
194- # that we are in the reduce setting, i.e. the number of loss terms equals
195- # the batch size.
196- batch_size = g .shape [0 ]
197- scaling = scaling * sqrt (batch_size ) if batch_averaged else scaling
215+ # We have to adjust the scaling to account for the mean reduction of the
216+ # loss used for computing the gradients when loss_average is not None.
217+ if loss_average is not None :
218+ num_loss_terms = g .shape [0 ] # batch_size
219+ if loss_average == "batch+sequence" :
220+ num_loss_terms *= g .shape [2 :].numel () # spatial size = O1 * O2
221+
222+ scaling *= sqrt (num_loss_terms )
198223
199224 if kfac_approx == "expand" :
200225 # KFAC-expand approximation
@@ -207,15 +232,23 @@ def conv2d_process_grad_output(
207232
208233
def linear_process_grad_output(
    g: Tensor, loss_average: Union[None, str], scaling: float, kfac_approx: str
) -> Tensor:
    """Process the output gradient of a linear layer before the self-inner product.

    Args:
        g: Gradient w.r.t. the output of a linear layer. Has shape
            `[batch_size, ..., d_out]` where `...` is an arbitrary number of
            weight-shared dimensions.
        loss_average: Whether the loss function is a mean over per-sample
            losses and if yes, over which dimensions the mean is taken.
            If `"batch"`, the loss function is a mean over as many terms as
            the size of the mini-batch. If `"batch+sequence"`, the loss
            function is a mean over as many terms as the size of the
            mini-batch times the sequence length, e.g. in the case of
            language modeling. If `None`, the loss function is a sum. This
            argument is used to ensure that the preconditioner is scaled
            consistently with the loss and the gradient. Default: `"batch"`.
        scaling: An additional scaling that will be applied to the gradient.
        kfac_approx: The KFAC approximation to use for linear weight-sharing
            layers. Possible values are `"expand"` and `"reduce"`.

    Returns:
        The processed gradient. Has shape `[batch_size, d_out]` for `"reduce"`
        and `[batch_size * ..., d_out]` for `"expand"`.
    """
    # Undo the mean reduction of the loss so the preconditioner is scaled
    # consistently with the loss and the gradient. Shapes must be read off
    # BEFORE any reshaping below.
    if loss_average is not None:
        batch_size = g.shape[0]
        # Product of all weight-shared dimensions (1 if there are none or
        # the mean is taken over the batch only).
        shared_terms = (
            g.shape[1:-1].numel() if loss_average == "batch+sequence" else 1
        )
        scaling = scaling * sqrt(batch_size * shared_terms)

    if kfac_approx == "expand":
        # KFAC-expand: treat every weight-shared location as its own sample.
        flattened = rearrange(g, "b ... d_out -> (b ...) d_out")
        return flattened * scaling

    # KFAC-reduce: sum the gradient over all weight-shared dimensions.
    summed = reduce(g, "b ... d_out -> b d_out", "sum")
    return summed * scaling
0 commit comments