 
 import itertools
 from copy import deepcopy
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional, Union
 
 import torch
-from torch import Tensor
 from torch.optim.swa_utils import AveragedModel
 from typing_extensions import override
 
@@ -35,32 +34,63 @@ class WeightAveraging(Callback):
     r"""A callback that updates an averaged model for Stochastic Weight Averaging (SWA) or Exponential Moving Average
     (EMA) after each training step.
 
-    The user can customize when the average model is updated by overriding the ``should_update()`` method.
+    Arguments given to the constructor will be passed to the :class:`AveragedModel` constructor. There are, however,
+    a couple of differences from the default values. By default, the average model is stored on the CPU. If
+    ``device`` is set to ``None``, the device will be inferred from the original model. By default, the callback will
+    compute running averages for both the parameters and the buffers of the model. Setting ``use_buffers`` to
+    ``False`` will cause only the model parameters to be averaged, leaving the update of the batch normalization
+    statistics to the user (using ``torch.optim.swa_utils.update_bn()``).
+
+    You can provide a custom averaging function with the ``avg_fn`` or ``multi_avg_fn`` parameter. See the
+    :class:`AveragedModel` class for details. If no averaging function is provided, the default is to compute an
+    equally weighted average of the weights (SWA).
+
+    You can customize when the average model is updated by overriding the ``should_update()`` method. The callback
+    calls it with either ``step_idx`` or ``epoch_idx``, and the method returns a boolean indicating whether to update
+    after the given step or epoch. The default is to update after every step.
 
     During validation and after the training finishes, the current model parameters will be replaced with the averaged
     values.
 
+    Example::
+
+        from lightning.pytorch.callbacks import WeightAveraging
+        from torch.optim.swa_utils import get_ema_avg_fn
+
+        class EMAWeightAveraging(WeightAveraging):
+            def __init__(self):
+                super().__init__(avg_fn=get_ema_avg_fn())
+
+            def should_update(self, step_idx=None, epoch_idx=None):
+                # Start after 100 steps.
+                return (step_idx is not None) and (step_idx >= 100)
+
+        trainer = Trainer(callbacks=EMAWeightAveraging(), max_epochs=10)
+        trainer.fit(model, dataloader)
+
     Args:
         device: If provided, the :class:`AveragedModel` will be stored on the ``device``. If ``None``, the device will
             be inferred from the original model.
-        avg_fn: The averaging function used to update the parameters. The function must take in an
-            :class:`AveragedModel` parameter, a current model parameter, and the number of models already averaged. If
-            ``None``, an equally weighted average will be used.
+        use_buffers: If ``False``, the buffers of the model will not be averaged.
+        kwargs: Additional keyword arguments to be passed to the :class:`AveragedModel` constructor, such as
+            ``avg_fn`` or ``multi_avg_fn``.
 
     """
 
     def __init__(
         self,
         device: Optional[Union[torch.device, str, int]] = "cpu",
-        avg_fn: Optional[Callable[[Tensor, Tensor, Union[Tensor, int]], Tensor]] = None,
-    ):
+        use_buffers: bool = True,
+        **kwargs: Any,
+    ) -> None:
         # The default value is a string so that jsonargparse knows how to serialize it.
         if isinstance(device, str):
             self._device: Optional[Union[torch.device, int]] = torch.device(device)
         else:
             self._device = device
+        self._use_buffers = use_buffers
+        self._kwargs = kwargs
 
-        self._avg_fn = avg_fn
         self._average_model: Optional[AveragedModel] = None
 
         # Number of optimizer steps taken when the average model was last updated. Initializing this with zero ensures
@@ -76,8 +106,9 @@ def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int]
         """Called after every optimizer step and after every training epoch to check whether the average model should
         be updated.
 
-        One of the arguments is set to the zero-based index of the last training step or epoch. The user can customize
-        when the average model gets updated by overriding this method.
+        One of the arguments is set to the zero-based index of the last training step or epoch. The default
+        implementation returns ``True`` whenever ``step_idx`` is provided. The user can customize when the average
+        model gets updated by overriding this method.
 
         Args:
             step_idx: Index of the last optimizer step, or ``None`` when called at the epoch end.
@@ -103,7 +134,9 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: s
         """
         if stage == "fit":
             device = self._device or pl_module.device
-            self._average_model = AveragedModel(model=pl_module, device=device, avg_fn=self._avg_fn, use_buffers=True)
+            self._average_model = AveragedModel(
+                model=pl_module, device=device, use_buffers=self._use_buffers, **self._kwargs
+            )
 
     @override
     def on_train_batch_end(
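
When ``use_buffers`` is set to ``False``, the docstring above leaves updating the batch normalization statistics to
the user. A minimal sketch of that workflow, assuming a hypothetical ``model`` and ``train_loader``, and using
``torch.optim.swa_utils.update_bn()`` as the docstring suggests::

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import WeightAveraging
    from torch.optim.swa_utils import update_bn

    # Average only the model parameters; buffers such as the BatchNorm running
    # statistics are left to the user.
    trainer = Trainer(callbacks=WeightAveraging(use_buffers=False), max_epochs=10)
    trainer.fit(model, train_loader)

    # After training finishes, the callback has replaced the model parameters with
    # the averaged values, so one pass over the data recomputes the BatchNorm
    # statistics for the averaged weights.
    update_bn(train_loader, model)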
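The removed docstring lines also spelled out the ``avg_fn`` contract, which the new text defers to
:class:`AveragedModel`: the function receives an averaged parameter, the current model parameter, and the number of
models already averaged. For reference, a sketch of a custom averaging function under that contract (the helper name
is illustrative), reproducing the equally weighted SWA default::

    from lightning.pytorch.callbacks import WeightAveraging

    def equal_weight_avg_fn(averaged_param, current_param, num_averaged):
        # Incremental form of the equally weighted average over all updates so far.
        return averaged_param + (current_param - averaged_param) / (num_averaged + 1)

    # Forwarded through **kwargs to the AveragedModel constructor.
    callback = WeightAveraging(avg_fn=equal_weight_avg_fn)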