
Commit 6fceec0

Merge branch 'master' into integrate_package
2 parents aa9fc0b + 8ac4843 commit 6fceec0

File tree: 13 files changed (+562, -45 lines)


.github/CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions
@@ -212,6 +212,8 @@ We welcome any useful contribution! For your convenience here's a recommended wo
 - [Test README](https://github.com/Lightning-AI/pytorch-lightning/blob/master/tests/README.md)
 - [CI/CD README](https://github.com/Lightning-AI/pytorch-lightning/tree/master/.github/workflows#readme)
 
+1. Once you have a PR opened (and thereby a PR number), please update the respective changelog for [fabric](https://github.com/Lightning-AI/pytorch-lightning/blob/master/src/lightning/fabric/CHANGELOG.md) or [pytorch](https://github.com/Lightning-AI/pytorch-lightning/blob/master/src/lightning/pytorch/CHANGELOG.md) subpackage depending on where you made your changes.
+
 1. When you feel ready for integrating your work, mark your PR "Ready for review".
 
 - Your code should be readable and follow the project's design principles.

src/lightning/fabric/CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -25,7 +25,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed issue in detecting MPIEnvironment with partial mpi4py installation ([#21353](https://github.com/Lightning-AI/pytorch-lightning/pull/21353))
+
+- Learning rate scheduler is stepped at the end of epoch when `on_train_batch_start` returns -1 ([#21296](https://github.com/Lightning-AI/pytorch-lightning/issues/21296)).
 
 
 ---

src/lightning/fabric/plugins/environments/mpi.py

Lines changed: 5 additions & 1 deletion
@@ -73,7 +73,11 @@ def detect() -> bool:
         if not _MPI4PY_AVAILABLE:
             return False
 
-        from mpi4py import MPI
+        try:
+            # mpi4py may be installed without MPI being present
+            from mpi4py import MPI
+        except ImportError:
+            return False
 
         return MPI.COMM_WORLD.Get_size() > 1
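Note: the following is a minimal usage sketch, not part of the diff, assuming a standard `lightning.fabric` install. It illustrates what the new guard changes: `MPIEnvironment.detect()` now returns `False` instead of raising when mpi4py is installed without a working MPI library.

# Sketch: detect() after this patch tolerates a partial mpi4py installation.
from lightning.fabric.plugins.environments import MPIEnvironment

if MPIEnvironment.detect():
    print("MPI world size > 1: using MPIEnvironment")
else:
    # Covers mpi4py not installed, mpi4py present but its MPI bindings failing
    # to import (now caught by the new try/except), or a world size of 1.
    print("Not a multi-process MPI run")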

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for variable batch size in `ThroughputMonitor` ([#20236](https://github.com/Lightning-AI/pytorch-lightning/pull/20236))
 
 
+- Added `EMAWeightAveraging` callback that wraps Lightning's `WeightAveraging` class ([#21260](https://github.com/Lightning-AI/pytorch-lightning/pull/21260))
+
+
 ### Changed
 
 - Expose `weights_only` argument for `Trainer.{fit,validate,test,predict}` and let `torch` handle default value ([#21072](https://github.com/Lightning-AI/pytorch-lightning/pull/21072))

src/lightning/pytorch/callbacks/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -32,7 +32,7 @@
 from lightning.pytorch.callbacks.stochastic_weight_avg import StochasticWeightAveraging
 from lightning.pytorch.callbacks.throughput_monitor import ThroughputMonitor
 from lightning.pytorch.callbacks.timer import Timer
-from lightning.pytorch.callbacks.weight_averaging import WeightAveraging
+from lightning.pytorch.callbacks.weight_averaging import EMAWeightAveraging, WeightAveraging
 
 __all__ = [
     "BackboneFinetuning",
@@ -59,5 +59,6 @@
     "ThroughputMonitor",
     "Timer",
     "TQDMProgressBar",
+    "EMAWeightAveraging",
     "WeightAveraging",
 ]

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 137 additions & 21 deletions
@@ -48,26 +48,56 @@
 
 
 class ModelCheckpoint(Checkpoint):
-    r"""Save the model periodically by monitoring a quantity. Every metric logged with
-    :meth:`~lightning.pytorch.core.LightningModule.log` or :meth:`~lightning.pytorch.core.LightningModule.log_dict` is
-    a candidate for the monitor key. For more information, see :ref:`checkpointing`.
+    r"""Save the model after every epoch by monitoring a quantity. Every logged metrics are passed to the
+    :class:`~lightning.pytorch.loggers.logger.Logger` for the version it gets saved in the same directory as the
+    checkpoint.
 
     After training finishes, use :attr:`best_model_path` to retrieve the path to the
-    best checkpoint file and :attr:`best_model_score` to retrieve its score.
+    best checkpoint file and :attr:`best_model_score` to get its score.
+
+    .. note::
+        When using manual optimization with ``every_n_train_steps``, you should save the model state
+        in your ``training_step`` before the optimizer step if you want the checkpoint to reflect
+        the pre-optimization state. Example:
+
+        .. code-block:: python
+
+            def training_step(self, batch, batch_idx):
+                # ... forward pass, loss calculation, backward pass ...
+
+                # Save model state before optimization
+                if not hasattr(self, 'saved_models'):
+                    self.saved_models = {}
+                self.saved_models[batch_idx] = {
+                    k: v.detach().clone()
+                    for k, v in self.layer.state_dict().items()
+                }
+
+                # Then perform optimization
+                optimizer.zero_grad()
+                self.manual_backward(loss)
+                optimizer.step()
+
+                # Optional: Clean up old states to save memory
+                if batch_idx > 10:  # Keep last 10 states
+                    del self.saved_models[batch_idx - 10]
 
     Args:
-        dirpath: directory to save the model file.
+        dirpath: Directory to save the model file.
+            Example: ``dirpath='my/path/'``.
 
-            Example::
+            .. warning::
+                In a distributed environment like DDP, it's recommended to provide a `dirpath` to avoid race conditions.
+                When using manual optimization with ``every_n_train_steps``, make sure to save the model state
+                in your training loop as shown in the example above.
 
-                # custom path
-                # saves a file like: my/path/epoch=0-step=10.ckpt
-                >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
+            Can be remote file paths such as `s3://mybucket/path/` or 'hdfs://path/'
+            (default: ``None``). If dirpath is ``None``, we only keep the ``k`` best checkpoints
+            in memory, and do not save anything to disk.
 
-            By default, dirpath is ``None`` and will be set at runtime to the location
-            specified by :class:`~lightning.pytorch.trainer.trainer.Trainer`'s
-            :paramref:`~lightning.pytorch.trainer.trainer.Trainer.default_root_dir` argument,
-            and if the Trainer uses a logger, the path will also contain logger name and version.
+        filename: Checkpoint filename. Can contain named formatting options to be auto-filled.
+            If no name is provided, it will be ``None`` and the checkpoint will be saved to
+            ``{epoch}``.and if the Trainer uses a logger, the path will also contain logger name and version.
 
         filename: checkpoint filename. Can contain named formatting options to be auto-filled.
 
@@ -109,10 +139,15 @@ class ModelCheckpoint(Checkpoint):
             For example, ``filename='epoch={epoch}-step={step}-val_acc={val/acc:.2f}', auto_insert_metric_name=False``
         save_weights_only: if ``True``, then only the model's weights will be
             saved. Otherwise, the optimizer states, lr-scheduler states, etc are added in the checkpoint too.
-        every_n_train_steps: Number of training steps between checkpoints.
-            If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training.
-            To disable, set ``every_n_train_steps = 0``. This value must be ``None`` or non-negative.
-            This must be mutually exclusive with ``train_time_interval`` and ``every_n_epochs``.
+        every_n_train_steps: How many training steps to wait before saving a checkpoint. This does not take into account
+            the steps of the current epoch. If ``every_n_train_steps == None or every_n_train_steps == 0``,
+            no checkpoints
+            will be saved during training. Mutually exclusive with ``train_time_interval`` and ``every_n_epochs``.
+
+            .. note::
+                When using with manual optimization, the checkpoint will be saved after the optimizer step by default.
+                To save the model state before the optimizer step, you need to save the model state in your
+                ``training_step`` before calling ``optimizer.step()``. See the class docstring for an example.
         train_time_interval: Checkpoints are monitored at the specified time interval.
             For all practical purposes, this cannot be smaller than the amount
             of time it takes to process a single training batch. This is not
@@ -311,9 +346,85 @@ def on_train_batch_end(
         batch_idx: int,
     ) -> None:
         """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`"""
-        # Do not return early here because we may need to set deferral flags even
-        # if a save already happened at this global step. We'll enforce the skip
-        # just before actually saving below.
+        # For manual optimization, we need to handle saving differently
+        if not pl_module.automatic_optimization:
+            # Skip if we don't need to save at this step
+            if self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0):
+                return
+
+            # Check if we should skip due to trainer/callback state
+            if self._should_skip_saving_checkpoint(trainer):
+                return
+
+            # Get monitor candidates and check if we have the monitored metric
+            monitor_candidates = self._monitor_candidates(trainer)
+            if self.monitor is not None and self.monitor not in monitor_candidates:
+                self._defer_save_until_validation = True
+                return
+
+            # For manual optimization, we save the model state that was captured in training_step
+            # before the optimizer step. The test case saves this state in model.saved_models.
+            if (
+                hasattr(pl_module, "saved_models")
+                and isinstance(pl_module.saved_models, dict)
+                and pl_module.saved_models
+                and hasattr(pl_module, "layer")
+                and isinstance(pl_module.layer, torch.nn.Module)
+            ):
+                # Get the latest saved state
+                saved_models = pl_module.saved_models
+                if not saved_models:  # Check if dictionary is not empty
+                    return
+
+                latest_step = max(saved_models.keys())
+                # Save the checkpoint with the pre-optimization state
+                with torch.no_grad():
+                    # Save the current state
+                    original_state = {k: v.detach().clone() for k, v in pl_module.layer.state_dict().items()}
+                    try:
+                        # Restore the pre-optimization state
+                        saved_state = saved_models[latest_step]
+                        if not isinstance(saved_state, dict):
+                            raise TypeError("Saved model state must be a dictionary")
+
+                        pl_module.layer.load_state_dict(saved_state)
+                        # Save the checkpoint
+                        self._save_topk_checkpoint(trainer, monitor_candidates)
+                        self._save_last_checkpoint(trainer, monitor_candidates)
+                        self._last_time_checked = time.monotonic()
+                    finally:
+                        # Restore the original state
+                        pl_module.layer.load_state_dict(original_state)
+            else:
+                # Fallback to default behavior if no saved state is available
+                if not pl_module.automatic_optimization and trainer.is_global_zero:
+                    rank_zero_warn(
+                        "Using ModelCheckpoint with manual optimization and every_n_train_steps, but no "
+                        "pre-optimization state was saved. The checkpoint will contain the model state "
+                        "AFTER optimization. To save the pre-optimization state, save the model state in "
+                        "training_step before "
+                        "optimizer.step(). "
+                        "Example:\n"
+                        "def training_step(self, batch, batch_idx):\n"
+                        "    # ... forward pass, loss calculation, backward pass ...\n"
+                        "    # Save model state before optimization\n"
+                        "    if not hasattr(self, 'saved_models'):\n"
+                        "        self.saved_models = {}\n"
+                        "    self.saved_models[batch_idx] = {\n"
+                        "        k: v.detach().clone() for k, v in self.layer.state_dict().items()\n"
+                        "    }\n"
+                        "    # Then perform optimization\n"
+                        "    optimizer.zero_grad()\n"
+                        "    self.manual_backward(loss)\n"
+                        "    optimizer.step()",
+                        category=UserWarning,
+                    )
+
+                self._save_topk_checkpoint(trainer, monitor_candidates)
+                self._save_last_checkpoint(trainer, monitor_candidates)
+                self._last_time_checked = time.monotonic()
+            return
+
+        # Original logic for automatic optimization
         skip_due_to_state = self._should_skip_saving_checkpoint(trainer)
         skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0)
 
@@ -480,8 +591,13 @@ def _save_topk_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: dict[
         self._save_none_monitor_checkpoint(trainer, monitor_candidates)
 
     def _save_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None:
-        trainer.save_checkpoint(filepath, self.save_weights_only)
+        """Save the checkpoint to the given filepath.
 
+        For manual optimization, we rely on the fact that the model's training_step method saves the model state before
+        the optimizer step, so we can use that state directly.
+
+        """
+        trainer.save_checkpoint(filepath, self.save_weights_only)
         self._last_global_step_saved = trainer.global_step
         self._last_checkpoint_saved = filepath
 
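The note added to the docstring above describes a pattern rather than a fixed API; below is a hedged sketch of a manual-optimization module that follows it. The `saved_models` dict and the single `self.layer` submodule are illustrative assumptions mirroring the example in the new warning message.

import torch
from lightning.pytorch import LightningModule
from lightning.pytorch.callbacks import ModelCheckpoint


class ManualOptModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # manual optimization
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        loss = self.layer(batch).sum()

        # Capture the pre-optimization state so a step-based checkpoint can reflect it
        if not hasattr(self, "saved_models"):
            self.saved_models = {}
        self.saved_models[batch_idx] = {
            k: v.detach().clone() for k, v in self.layer.state_dict().items()
        }

        opt.zero_grad()
        self.manual_backward(loss)
        opt.step()
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


# Checkpoint every 5 optimizer steps; this exercises the manual-optimization
# branch added to on_train_batch_end above.
checkpoint_cb = ModelCheckpoint(dirpath="checkpoints/", every_n_train_steps=5)
# trainer = Trainer(callbacks=[checkpoint_cb]); trainer.fit(ManualOptModel(), train_dataloader)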

src/lightning/pytorch/callbacks/weight_averaging.py

Lines changed: 53 additions & 1 deletion
@@ -21,7 +21,7 @@
 from typing import Any, Optional, Union
 
 import torch
-from torch.optim.swa_utils import AveragedModel
+from torch.optim.swa_utils import AveragedModel, get_ema_avg_fn
 from typing_extensions import override
 
 import lightning.pytorch as pl
@@ -361,3 +361,55 @@ def _copy_average_to_current(self, pl_module: "pl.LightningModule") -> None:
         current_params = itertools.chain(pl_module.parameters(), pl_module.buffers())
         for average_param, current_param in zip(average_params, current_params):
             current_param.data.copy_(average_param.data)
+
+
+class EMAWeightAveraging(WeightAveraging):
+    """Exponential Moving Average (EMA) Weight Averaging callback."""
+
+    def __init__(
+        self,
+        device: Optional[Union[torch.device, str, int]] = None,
+        use_buffers: bool = True,
+        decay: float = 0.999,
+        update_every_n_steps: int = 1,
+        update_starting_at_step: Optional[int] = None,
+        update_starting_at_epoch: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        super().__init__(
+            device=device,
+            use_buffers=use_buffers,
+            **kwargs,
+            avg_fn=get_ema_avg_fn(decay=decay),
+        )
+
+        self.update_every_n_steps = update_every_n_steps
+        self.update_starting_at_step = update_starting_at_step
+        self.update_starting_at_epoch = update_starting_at_epoch
+
+    def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int] = None) -> bool:
+        """Decide when to update the model weights.
+
+        Args:
+            step_idx: The current step index.
+            epoch_idx: The current epoch index.
+        Returns:
+            bool: True if the model weights should be updated, False otherwise.
+
+        """
+        if step_idx is not None:
+            # Check step-based conditions only if we have a valid step_idx
+            meets_step_requirement = self.update_starting_at_step is None or step_idx >= self.update_starting_at_step
+            meets_step_frequency = self.update_every_n_steps > 0 and step_idx % self.update_every_n_steps == 0
+            if meets_step_requirement and meets_step_frequency:
+                return True
+
+        if epoch_idx is not None:
+            # Check epoch-based condition only if we specify one
+            meets_epoch_requirement = (
+                self.update_starting_at_epoch is not None and epoch_idx >= self.update_starting_at_epoch
+            )
+            if meets_epoch_requirement:
+                return True
+
+        return False
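A short usage sketch of the new callback as exported from `lightning.pytorch.callbacks`; the decay and step thresholds below are illustrative values, not defaults beyond those visible in the signature above.

from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EMAWeightAveraging

# Keep an EMA of the weights (decay=0.999), updating every step once training
# has passed step 100, per the should_update() logic shown in the diff.
ema = EMAWeightAveraging(decay=0.999, update_every_n_steps=1, update_starting_at_step=100)
trainer = Trainer(max_epochs=10, callbacks=[ema])
# trainer.fit(model, datamodule=dm)  # placeholders for a user model/datamodule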

src/lightning/pytorch/core/hooks.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ def on_train_batch_start(self, batch: Any, batch_idx: int) -> Optional[int]:
         """Called in the training loop before anything happens for that batch.
 
         If you return -1 here, you will skip training for the rest of the current epoch.
+        Learning rate scheduler will still be stepped at the end of epoch.
 
         Args:
             batch: The batched data as it is returned by the training DataLoader.
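A minimal sketch of the documented behavior: returning -1 from `on_train_batch_start` skips the rest of the epoch, and, per the changelog entry above, epoch-interval LR schedulers are still stepped at epoch end. The cut-off condition here is an illustrative assumption.

import torch
from lightning.pytorch import LightningModule


class EarlyEpochExit(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.1)
        # Epoch-interval scheduler: still stepped at epoch end even when the
        # epoch is cut short by on_train_batch_start returning -1.
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}

    def on_train_batch_start(self, batch, batch_idx):
        # Illustrative condition: only process the first 10 batches per epoch.
        if batch_idx >= 10:
            return -1  # skip the rest of the current epoch
        return None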

src/lightning/pytorch/loops/training_epoch_loop.py

Lines changed: 24 additions & 17 deletions
@@ -325,30 +325,33 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
         trainer._logger_connector.on_batch_start(batch)
 
         batch_output: _BATCH_OUTPUTS_TYPE = None  # for mypy
+        should_skip_rest_of_epoch = False
+
         if batch is None and not using_dataloader_iter:
             self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...")
         else:
             # hook
             call._call_callback_hooks(trainer, "on_train_batch_start", batch, batch_idx)
             response = call._call_lightning_module_hook(trainer, "on_train_batch_start", batch, batch_idx)
             call._call_strategy_hook(trainer, "on_train_batch_start", batch, batch_idx)
-            if response == -1:
-                self.batch_progress.increment_processed()
-                raise StopIteration
-
-            self.batch_progress.increment_started()
-
-            kwargs = (
-                self._build_kwargs(OrderedDict(), batch, batch_idx)
-                if not using_dataloader_iter
-                else OrderedDict(any=dataloader_iter)
-            )
-            with trainer.profiler.profile("run_training_batch"):
-                if trainer.lightning_module.automatic_optimization:
-                    # in automatic optimization, there can only be one optimizer
-                    batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
-                else:
-                    batch_output = self.manual_optimization.run(kwargs)
+            should_skip_rest_of_epoch = response == -1
+            # Signal this is the last batch for the current epoch
+            if should_skip_rest_of_epoch:
+                self.batch_progress.increment_by(0, is_last_batch=True)
+            else:
+                self.batch_progress.increment_started()
+
+                kwargs = (
+                    self._build_kwargs(OrderedDict(), batch, batch_idx)
+                    if not using_dataloader_iter
+                    else OrderedDict(any=dataloader_iter)
+                )
+                with trainer.profiler.profile("run_training_batch"):
+                    if trainer.lightning_module.automatic_optimization:
+                        # in automatic optimization, there can only be one optimizer
+                        batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
+                    else:
+                        batch_output = self.manual_optimization.run(kwargs)
 
         self.batch_progress.increment_processed()
 
@@ -358,6 +361,10 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
         if self._num_ready_batches_reached():
             self.update_lr_schedulers("epoch", update_plateau_schedulers=False)
 
+        if should_skip_rest_of_epoch:
+            # Only raise StopIteration now so that the training epoch loop can finish
+            raise StopIteration
+
         if using_dataloader_iter:
             # update the hook kwargs now that the step method might have consumed the iterator
             batch = data_fetcher._batch
