
Commit 5a69057

Author: Seppo Enarvi (committed)
WeightAveraging calls the configure_model hook but issues a warning
1 parent 5deb0bb commit 5a69057

File tree: 2 files changed (+51 −2 lines)

src/lightning/pytorch/callbacks/weight_averaging.py

Lines changed: 18 additions & 0 deletions
@@ -26,6 +26,7 @@

 import lightning.pytorch as pl
 from lightning.pytorch.callbacks.callback import Callback
+from lightning.pytorch.utilities.model_helpers import is_overridden
 from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn
 from lightning.pytorch.utilities.types import STEP_OUTPUT

@@ -55,6 +56,13 @@ class WeightAveraging(Callback):
     See also the documentation on the :ref:`weight averaging callbacks <advanced/training_tricks:Weight Averaging>`
     provided by Lightning.

+    Note:
+        To ensure that the :class:`AveragedModel` will contain all layers,
+        :meth:`~lightning.pytorch.callbacks.weight_averaging.WeightAveraging.setup` will call
+        :meth:`~lightning.pytorch.core.hooks.ModelHooks.configure_model` before instantiating the
+        :class:`AveragedModel`. However, that hook is not called in a strategy aware context, sharded models do not work
+        with weight averaging, and a warning will be issued.
+
     Example::

         from lightning.pytorch.callbacks import WeightAveraging
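
The new docstring note targets models that defer building their layers to configure_model. A minimal sketch of such a model (the class name, layer sizes, and training step are illustrative, not part of this commit):

    import torch
    from torch import nn
    from lightning.pytorch import LightningModule


    class LazyLayersModel(LightningModule):
        """Illustrative model: layers are created in configure_model, not in __init__."""

        def __init__(self):
            super().__init__()
            self.layer = None  # built later, e.g. so a sharding strategy could place it

        def configure_model(self):
            # Guard against repeated calls: the hook may run more than once.
            if self.layer is None:
                self.layer = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))

        def training_step(self, batch, batch_idx):
            return self.layer(batch).sum()

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.01)

If the AveragedModel were built from such a module before configure_model had run, the averaged copy would be missing every layer, which is what the new setup behaviour below avoids.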
@@ -137,6 +145,16 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None:
         """
         if stage == "fit":
             device = self._device or pl_module.device
+
+            # If the configure_model hook is overridden, call it to create the layers before constructing the
+            # AveragedModel. However, sharding will not be done and a warning will be issued.
+            if is_overridden("configure_model", pl_module):
+                rank_zero_warn(
+                    "You're using the WeightAveraging callback with a model that overrides the configure_model "
+                    "callback. WeightAveraging doesn't support sharding model layers, so you may run out of memory."
+                )
+                pl_module.configure_model()
+
             self._average_model = AveragedModel(
                 model=pl_module, device=device, use_buffers=self._use_buffers, **self._kwargs
             )
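
To see the mechanism outside of a Trainer run, here is a small torch-only sketch (LazyLayersModel is the illustrative class sketched after the docstring note above; this is not code from the commit):

    import torch
    from torch.optim.swa_utils import AveragedModel

    model = LazyLayersModel()

    # Before configure_model runs, the module has no parameters at all, so an AveragedModel
    # created at this point could never track the real layers.
    assert len(list(model.parameters())) == 0

    model.configure_model()
    average_model = AveragedModel(model=model, use_buffers=True)

    # After the hook, the averaged copy is a deep copy that contains the full Sequential.
    assert isinstance(average_model.module.layer, torch.nn.Sequential)

The new branch in setup() does this automatically when it detects an overridden configure_model, and the rank-zero warning is a reminder that the hook runs outside the strategy, so FSDP-style sharding of those layers is not applied.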

tests/tests_pytorch/callbacks/test_weight_averaging.py

Lines changed: 33 additions & 2 deletions
@@ -47,6 +47,19 @@ def configure_optimizers(self) -> None:
         return torch.optim.SGD(self.layer.parameters(), lr=0.1)


+class LargeTestModel(BoringModel):
+    def __init__(self):
+        super().__init__()
+        self.layer = None
+
+    def configure_model(self):
+        print("XXX configure_model")
+        self.layer = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))
+
+    def configure_optimizers(self):
+        return torch.optim.SGD(self.parameters(), lr=0.01)
+
+
 class EMAAveragingFunction:
     """EMA averaging function.

@@ -252,8 +265,26 @@ def test_swa(tmp_path):
     _train(model, dataset, tmp_path, SWATestCallback())


+@pytest.mark.parametrize(
+    ("strategy", "accelerator", "devices"),
+    [
+        ("auto", "cpu", 1),
+        pytest.param("auto", "gpu", 1, marks=RunIf(min_cuda_gpus=1)),
+        pytest.param("fsdp", "gpu", 1, marks=RunIf(min_cuda_gpus=1)),
+        pytest.param("ddp", "gpu", 2, marks=RunIf(min_cuda_gpus=2)),
+        pytest.param("fsdp", "gpu", 2, marks=RunIf(min_cuda_gpus=2)),
+    ],
+)
+def test_ema_configure_model(tmp_path, strategy, accelerator, devices):
+    model = LargeTestModel()
+    dataset = RandomDataset(32, 32)
+    callback = EMATestCallback()
+    _train(model, dataset, tmp_path, callback, strategy=strategy, accelerator=accelerator, devices=devices)
+    assert isinstance(callback._average_model.module.layer, nn.Sequential)
+
+
 def _train(
-    model: TestModel,
+    model: BoringModel,
     dataset: Dataset,
     tmp_path: str,
     callback: WeightAveraging,
@@ -262,7 +293,7 @@ def _train(
     devices: int = 1,
     checkpoint_path: Optional[str] = None,
     will_crash: bool = False,
-) -> TestModel:
+) -> None:
     deterministic = accelerator == "cpu"
     trainer = Trainer(
         accelerator=accelerator,
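
The new test relies on helpers that are local to this test module (EMATestCallback, _train, RunIf) and are not shown in the diff. A rough standalone equivalent of the single-device CPU case, using only the public WeightAveraging API (the EMA averaging function and the Trainer flags are assumptions, not taken from the test helpers):

    import torch
    from torch import nn
    from torch.optim.swa_utils import get_ema_avg_fn
    from torch.utils.data import DataLoader
    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import WeightAveraging
    from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset


    class LargeTestModel(BoringModel):
        """Same shape as the test model added above: the layer only exists after configure_model."""

        def __init__(self):
            super().__init__()
            self.layer = None

        def configure_model(self):
            self.layer = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.01)


    callback = WeightAveraging(avg_fn=get_ema_avg_fn())
    trainer = Trainer(
        max_epochs=1,
        accelerator="cpu",
        devices=1,
        callbacks=[callback],
        logger=False,
        enable_checkpointing=False,
        enable_progress_bar=False,
    )
    trainer.fit(LargeTestModel(), DataLoader(RandomDataset(32, 32), batch_size=4))

    # With this commit, setup() calls configure_model first, so the averaged copy holds the full Sequential.
    assert isinstance(callback._average_model.module.layer, nn.Sequential)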

0 commit comments