
Commit 0df63f7

Merge branch 'master' into update-pyproject-py310
2 parents 0e4ed01 + f7692a6 commit 0df63f7

File tree: 13 files changed, +249 / -39 lines


.github/CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions
@@ -212,6 +212,8 @@ We welcome any useful contribution! For your convenience here's a recommended wo
    - [Test README](https://github.com/Lightning-AI/pytorch-lightning/blob/master/tests/README.md)
    - [CI/CD README](https://github.com/Lightning-AI/pytorch-lightning/tree/master/.github/workflows#readme)

+1. Once you have a PR opened (and thereby a PR number), please update the respective changelog for [fabric](https://github.com/Lightning-AI/pytorch-lightning/blob/master/src/lightning/fabric/CHANGELOG.md) or [pytorch](https://github.com/Lightning-AI/pytorch-lightning/blob/master/src/lightning/pytorch/CHANGELOG.md) subpackage depending on where you made your changes.
+
 1. When you feel ready for integrating your work, mark your PR "Ready for review".

    - Your code should be readable and follow the project's design principles.

.github/workflows/ci-tests-fabric.yml

Lines changed: 5 additions & 8 deletions
@@ -40,23 +40,20 @@ jobs:
       matrix:
         os: [macOS-14, ubuntu-22.04, windows-2022]
         config:
-          # only run PyTorch latest
+          # Test unified "lightning" package with PyTorch 2.1-2.5
           - { pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" }
           - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
           - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
           - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }

-          # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues
+          # Test "fabric" package with PyTorch 2.6-2.9
           - { pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" }
-
-          # "fabric" installs the standalone package
-          - { pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.7" }
-
-          # adding recently cut Torch 2.7 - FUTURE
+          - { pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" }
           - { pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.8" }
+          - { pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.9" }

-          # "oldest" versions tests, only on minimum Python
+          # Test minimum supported versions (oldest)
           - { pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" }
       timeout-minutes: 25 # because of building grpcio on Mac
     env:

.github/workflows/ci-tests-pytorch.yml

Lines changed: 5 additions & 8 deletions
@@ -44,23 +44,20 @@ jobs:
       matrix:
         os: [macOS-14, ubuntu-22.04, windows-2022]
         config:
-          # only run PyTorch latest
+          # Test unified "lightning" package with PyTorch 2.1-2.5
          - { pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
          - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" }
          - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
          - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
          - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }

-          # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues
+          # Test "pytorch" package with PyTorch 2.6-2.9
          - { pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" }
-
-          # "pytorch" installs the standalone package
-          - { pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.7" }
-
-          # adding recently cut Torch 2.7 - FUTURE
+          - { pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" }
          - { pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.8" }
+          - { pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.9" }

-          # "oldest" versions tests, only on minimum Python
+          # Test minimum supported versions (oldest)
          - { pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" }
       timeout-minutes: 50
     env:
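Both matrices pin an exact interpreter and PyTorch version per entry. CI jobs for such matrices often sanity-check that the resolved environment actually matches the requested entry; the snippet below is only a hedged illustration of that kind of check, and the `PYTORCH_VERSION` environment variable and comparison logic are assumptions, not taken from these workflows.

```python
import os

import torch
from packaging.version import Version

# Hypothetical guard: compare the installed torch against the matrix entry,
# ignoring local build suffixes such as "+cu121".
expected = os.environ.get("PYTORCH_VERSION", "2.9")
installed = Version(torch.__version__).base_version
assert installed.startswith(expected), f"expected torch {expected}.*, got {installed}"
```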

src/lightning/fabric/CHANGELOG.md

Lines changed: 5 additions & 1 deletion
@@ -25,7 +25,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

--
+- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
+
+
+- Learning rate scheduler is stepped at the end of epoch when `on_train_batch_start` returns -1 ([#21296](https://github.com/Lightning-AI/pytorch-lightning/issues/21296)).
+


 ---
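The `EADDRINUSE` entry refers to distributed tests racing for an already-bound TCP port. The changelog does not include the port-manager code itself; the sketch below only illustrates the general bind-to-port-0-and-retry technique, and every name in it is illustrative rather than the PR's actual API.

```python
import socket


def find_free_port(max_retries: int = 5) -> int:
    """Illustrative helper: ask the OS for an unused TCP port.

    This is NOT the implementation from PR #21309; it only sketches the common
    bind-to-port-0 trick that avoids EADDRINUSE races in tests.
    """
    last_error: OSError | None = None
    for _ in range(max_retries):
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                sock.bind(("127.0.0.1", 0))  # port 0 -> the OS picks a free port
                return sock.getsockname()[1]
        except OSError as exc:  # unlikely here; kept to show the retry pattern
            last_error = exc
    raise RuntimeError("could not reserve a free port") from last_error
```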

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for variable batch size in `ThroughputMonitor` ([#20236](https://github.com/Lightning-AI/pytorch-lightning/pull/20236))


+- Added `EMAWeightAveraging` callback that wraps Lightning's `WeightAveraging` class ([#21260](https://github.com/Lightning-AI/pytorch-lightning/pull/21260))
+
+
 ### Changed

 - Expose `weights_only` argument for `Trainer.{fit,validate,test,predict}` and let `torch` handle default value ([#21072](https://github.com/Lightning-AI/pytorch-lightning/pull/21072))

src/lightning/pytorch/callbacks/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -32,7 +32,7 @@
 from lightning.pytorch.callbacks.stochastic_weight_avg import StochasticWeightAveraging
 from lightning.pytorch.callbacks.throughput_monitor import ThroughputMonitor
 from lightning.pytorch.callbacks.timer import Timer
-from lightning.pytorch.callbacks.weight_averaging import WeightAveraging
+from lightning.pytorch.callbacks.weight_averaging import EMAWeightAveraging, WeightAveraging

 __all__ = [
     "BackboneFinetuning",
@@ -59,5 +59,6 @@
     "ThroughputMonitor",
     "Timer",
     "TQDMProgressBar",
+    "EMAWeightAveraging",
     "WeightAveraging",
 ]
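With `EMAWeightAveraging` exported from `lightning.pytorch.callbacks`, it can be attached like any other callback. A minimal usage sketch, assuming a Lightning install that contains this commit; `TinyRegressor` and the random data are placeholders, not part of the diff.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.callbacks import EMAWeightAveraging


class TinyRegressor(LightningModule):
    """Minimal placeholder module used only to demonstrate the callback wiring."""

    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


data = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=16)
# Keep an EMA copy of the weights, updated every optimizer step with decay 0.999.
trainer = Trainer(max_epochs=2, callbacks=[EMAWeightAveraging(decay=0.999)], logger=False)
trainer.fit(TinyRegressor(), train_dataloaders=data)
```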

src/lightning/pytorch/callbacks/weight_averaging.py

Lines changed: 53 additions & 1 deletion
@@ -21,7 +21,7 @@
 from typing import Any

 import torch
-from torch.optim.swa_utils import AveragedModel
+from torch.optim.swa_utils import AveragedModel, get_ema_avg_fn
 from typing_extensions import override

 import lightning.pytorch as pl
@@ -361,3 +361,55 @@ def _copy_average_to_current(self, pl_module: "pl.LightningModule") -> None:
         current_params = itertools.chain(pl_module.parameters(), pl_module.buffers())
         for average_param, current_param in zip(average_params, current_params):
             current_param.data.copy_(average_param.data)
+
+
+class EMAWeightAveraging(WeightAveraging):
+    """Exponential Moving Average (EMA) Weight Averaging callback."""
+
+    def __init__(
+        self,
+        device: Optional[Union[torch.device, str, int]] = None,
+        use_buffers: bool = True,
+        decay: float = 0.999,
+        update_every_n_steps: int = 1,
+        update_starting_at_step: Optional[int] = None,
+        update_starting_at_epoch: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        super().__init__(
+            device=device,
+            use_buffers=use_buffers,
+            **kwargs,
+            avg_fn=get_ema_avg_fn(decay=decay),
+        )
+
+        self.update_every_n_steps = update_every_n_steps
+        self.update_starting_at_step = update_starting_at_step
+        self.update_starting_at_epoch = update_starting_at_epoch
+
+    def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int] = None) -> bool:
+        """Decide when to update the model weights.
+
+        Args:
+            step_idx: The current step index.
+            epoch_idx: The current epoch index.
+        Returns:
+            bool: True if the model weights should be updated, False otherwise.
+
+        """
+        if step_idx is not None:
+            # Check step-based conditions only if we have a valid step_idx
+            meets_step_requirement = self.update_starting_at_step is None or step_idx >= self.update_starting_at_step
+            meets_step_frequency = self.update_every_n_steps > 0 and step_idx % self.update_every_n_steps == 0
+            if meets_step_requirement and meets_step_frequency:
+                return True
+
+        if epoch_idx is not None:
+            # Check epoch-based condition only if we specify one
+            meets_epoch_requirement = (
+                self.update_starting_at_epoch is not None and epoch_idx >= self.update_starting_at_epoch
+            )
+            if meets_epoch_requirement:
+                return True
+
+        return False
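`get_ema_avg_fn(decay)` from `torch.optim.swa_utils` supplies the averaging rule `avg = decay * avg + (1 - decay) * new`, which `AveragedModel` applies on every `update_parameters` call after the first (the first call simply copies the weights). A small sketch of that behaviour and of the step gating added by `should_update`; the concrete numbers are illustrative only.

```python
import torch
from torch.optim.swa_utils import AveragedModel, get_ema_avg_fn

model = torch.nn.Linear(4, 2)
ema_model = AveragedModel(model, avg_fn=get_ema_avg_fn(decay=0.9))
ema_model.update_parameters(model)      # first update: plain copy of the current weights

with torch.no_grad():
    model.weight.add_(1.0)              # pretend an optimizer step moved the weights
ema_model.update_parameters(model)      # now: average = 0.9 * average + 0.1 * new weights

# Step gating as implemented by EMAWeightAveraging.should_update (illustrative values):
from lightning.pytorch.callbacks import EMAWeightAveraging

cb = EMAWeightAveraging(update_every_n_steps=4, update_starting_at_step=100)
assert cb.should_update(step_idx=104)       # multiple of 4 and past the start step
assert not cb.should_update(step_idx=50)    # before the start step
assert not cb.should_update(step_idx=102)   # past the start step but not a multiple of 4
```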

src/lightning/pytorch/core/hooks.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ def on_train_batch_start(self, batch: Any, batch_idx: int) -> int | None:
         """Called in the training loop before anything happens for that batch.

         If you return -1 here, you will skip training for the rest of the current epoch.
+        Learning rate scheduler will still be stepped at the end of epoch.

         Args:
             batch: The batched data as it is returned by the training DataLoader.
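The extended docstring states the contract: returning -1 from `on_train_batch_start` skips the rest of the current epoch, and epoch-level learning-rate schedulers still step at the end of that epoch. A minimal sketch of a module that uses this hook; the fixed batch cutoff is purely illustrative.

```python
import torch
from lightning.pytorch import LightningModule


class EarlyEpochExit(LightningModule):
    """Illustrative module: skip the rest of the epoch after a fixed number of batches."""

    def __init__(self, max_batches_per_epoch: int = 10) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)
        self.max_batches_per_epoch = max_batches_per_epoch

    def on_train_batch_start(self, batch, batch_idx):
        # Returning -1 skips training for the rest of the current epoch;
        # epoch-level LR schedulers are still stepped at the end of the epoch.
        if batch_idx >= self.max_batches_per_epoch:
            return -1
        return None

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [scheduler]
```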

src/lightning/pytorch/loops/training_epoch_loop.py

Lines changed: 24 additions & 17 deletions
@@ -323,30 +323,33 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
         trainer._logger_connector.on_batch_start(batch)

         batch_output: _OPTIMIZER_LOOP_OUTPUTS_TYPE | _MANUAL_LOOP_OUTPUTS_TYPE | None = None  # for mypy
+        should_skip_rest_of_epoch = False
+
         if batch is None and not using_dataloader_iter:
             self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...")
         else:
             # hook
             call._call_callback_hooks(trainer, "on_train_batch_start", batch, batch_idx)
             response = call._call_lightning_module_hook(trainer, "on_train_batch_start", batch, batch_idx)
             call._call_strategy_hook(trainer, "on_train_batch_start", batch, batch_idx)
-            if response == -1:
-                self.batch_progress.increment_processed()
-                raise StopIteration
-
-            self.batch_progress.increment_started()
-
-            kwargs = (
-                self._build_kwargs(OrderedDict(), batch, batch_idx)
-                if not using_dataloader_iter
-                else OrderedDict(any=dataloader_iter)
-            )
-            with trainer.profiler.profile("run_training_batch"):
-                if trainer.lightning_module.automatic_optimization:
-                    # in automatic optimization, there can only be one optimizer
-                    batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
-                else:
-                    batch_output = self.manual_optimization.run(kwargs)
+            should_skip_rest_of_epoch = response == -1
+            # Signal this is the last batch for the current epoch
+            if should_skip_rest_of_epoch:
+                self.batch_progress.increment_by(0, is_last_batch=True)
+            else:
+                self.batch_progress.increment_started()
+
+                kwargs = (
+                    self._build_kwargs(OrderedDict(), batch, batch_idx)
+                    if not using_dataloader_iter
+                    else OrderedDict(any=dataloader_iter)
+                )
+                with trainer.profiler.profile("run_training_batch"):
+                    if trainer.lightning_module.automatic_optimization:
+                        # in automatic optimization, there can only be one optimizer
+                        batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
+                    else:
+                        batch_output = self.manual_optimization.run(kwargs)

         self.batch_progress.increment_processed()

@@ -356,6 +359,10 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
         if self._num_ready_batches_reached():
             self.update_lr_schedulers("epoch", update_plateau_schedulers=False)

+        if should_skip_rest_of_epoch:
+            # Only raise StopIteration now so that the training epoch loop can finish
+            raise StopIteration
+
         if using_dataloader_iter:
             # update the hook kwargs now that the step method might have consumed the iterator
             batch = data_fetcher._batch
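The refactor replaces the immediate `StopIteration` with a `should_skip_rest_of_epoch` flag: the batch is marked as the last one, the epoch-level scheduler update runs, and only then does the loop raise. The toy sketch below mirrors that control flow in isolation; it is a simplified illustration, not Lightning's actual loop code.

```python
# Simplified illustration of deferring the stop signal until end-of-epoch work has run.
def run_epoch(batches, on_train_batch_start, run_training_batch, step_epoch_schedulers):
    for batch_idx, batch in enumerate(batches):
        should_skip_rest_of_epoch = on_train_batch_start(batch, batch_idx) == -1
        if should_skip_rest_of_epoch:
            break  # previously this was an immediate "raise StopIteration"
        run_training_batch(batch, batch_idx)
    # With the deferred stop, the epoch-level scheduler step is no longer skipped.
    step_epoch_schedulers()
```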

src/lightning/pytorch/utilities/imports.py

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@
 _TORCHMETRICS_GREATER_EQUAL_0_11 = RequirementCache("torchmetrics>=0.11.0")  # using new API with task
 _TORCHMETRICS_GREATER_EQUAL_1_0_0 = RequirementCache("torchmetrics>=1.0.0")
 _TORCH_EQUAL_2_8 = RequirementCache("torch>=2.8.0,<2.9.0")
+_TORCH_EQUAL_2_9 = RequirementCache("torch>=2.9.0,<2.10.0")

 _OMEGACONF_AVAILABLE = package_available("omegaconf")
 _TORCHVISION_AVAILABLE = RequirementCache("torchvision")
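`RequirementCache` (from `lightning_utilities`) checks the installed distribution against the requirement string and is truthy when it is satisfied, so flags such as the new `_TORCH_EQUAL_2_9` act as plain boolean guards. A hedged usage sketch; the guarded branch is an illustrative assumption, not code from this commit.

```python
from lightning_utilities.core.imports import RequirementCache

# Truthy only when the installed torch falls in the 2.9.x series.
_TORCH_EQUAL_2_9 = RequirementCache("torch>=2.9.0,<2.10.0")

if _TORCH_EQUAL_2_9:
    # enable a code path or test that only applies to torch 2.9.x
    print("running against torch 2.9.x")
else:
    print("torch 2.9.x not installed")
```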
