Merge branch 'master' into fix/fsdp-mixed-precision

deependujha · web-flow · commit 80ee80c31a76 · 2025-11-21T11:12:08.000+05:30
diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
@@ -25,12 +25,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
-
+- Fixed issue in detecting MPIEnvironment with partial mpi4py installation ([#21353](https://github.com/Lightning-AI/pytorch-lightning/pull/21353))
 
 - Learning rate scheduler is stepped at the end of epoch when `on_train_batch_start` returns -1 ([#21296](https://github.com/Lightning-AI/pytorch-lightning/issues/21296)).
 
 
+
 - Fixed FSDP mixed precision semantics and added user warning ([#21361](https://github.com/Lightning-AI/pytorch-lightning/pull/21361))
 
 
diff --git a/src/lightning/fabric/plugins/environments/mpi.py b/src/lightning/fabric/plugins/environments/mpi.py
@@ -73,7 +73,11 @@ def detect() -> bool:
         if not _MPI4PY_AVAILABLE:
             return False
 
-        from mpi4py import MPI
+        try:
+            # mpi4py may be installed without MPI being present
+            from mpi4py import MPI
+        except ImportError:
+            return False
 
         return MPI.COMM_WORLD.Get_size() > 1
 
diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -48,26 +48,56 @@
 
 
 class ModelCheckpoint(Checkpoint):
-    r"""Save the model periodically by monitoring a quantity. Every metric logged with
-    :meth:`~lightning.pytorch.core.LightningModule.log` or :meth:`~lightning.pytorch.core.LightningModule.log_dict` is
-    a candidate for the monitor key. For more information, see :ref:`checkpointing`.
+    r"""Save the model after every epoch by monitoring a quantity. Every logged metrics are passed to the
+    :class:`~lightning.pytorch.loggers.logger.Logger` for the version it gets saved in the same directory as the
+    checkpoint.
 
     After training finishes, use :attr:`best_model_path` to retrieve the path to the
-    best checkpoint file and :attr:`best_model_score` to retrieve its score.
+    best checkpoint file and :attr:`best_model_score` to get its score.
+
+    .. note::
+        When using manual optimization with ``every_n_train_steps``, you should save the model state
+        in your ``training_step`` before the optimizer step if you want the checkpoint to reflect
+        the pre-optimization state. Example:
+
+        .. code-block:: python
+
+            def training_step(self, batch, batch_idx):
+                # ... forward pass, loss calculation, backward pass ...
+
+                # Save model state before optimization
+                if not hasattr(self, 'saved_models'):
+                    self.saved_models = {}
+                self.saved_models[batch_idx] = {
+                    k: v.detach().clone()
+                    for k, v in self.layer.state_dict().items()
+                }
+
+                # Then perform optimization
+                optimizer.zero_grad()
+                self.manual_backward(loss)
+                optimizer.step()
+
+                # Optional: Clean up old states to save memory
+                if batch_idx > 10:  # Keep last 10 states
+                    del self.saved_models[batch_idx - 10]
 
     Args:
-        dirpath: directory to save the model file.
+        dirpath: Directory to save the model file.
+            Example: ``dirpath='my/path/'``.
 
-            Example::
+            .. warning::
+                In a distributed environment like DDP, it's recommended to provide a `dirpath` to avoid race conditions.
+                When using manual optimization with ``every_n_train_steps``, make sure to save the model state
+                in your training loop as shown in the example above.
 
-                # custom path
-                # saves a file like: my/path/epoch=0-step=10.ckpt
-                >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
+            Can be remote file paths such as `s3://mybucket/path/` or 'hdfs://path/'
+            (default: ``None``). If dirpath is ``None``, we only keep the ``k`` best checkpoints
+            in memory, and do not save anything to disk.
 
-            By default, dirpath is ``None`` and will be set at runtime to the location
-            specified by :class:`~lightning.pytorch.trainer.trainer.Trainer`'s
-            :paramref:`~lightning.pytorch.trainer.trainer.Trainer.default_root_dir` argument,
-            and if the Trainer uses a logger, the path will also contain logger name and version.
+        filename: Checkpoint filename. Can contain named formatting options to be auto-filled.
+            If no name is provided, it will be ``None`` and the checkpoint will be saved to
+            ``{epoch}``.and if the Trainer uses a logger, the path will also contain logger name and version.
 
         filename: checkpoint filename. Can contain named formatting options to be auto-filled.
 
@@ -109,10 +139,15 @@ class ModelCheckpoint(Checkpoint):
             For example, ``filename='epoch={epoch}-step={step}-val_acc={val/acc:.2f}', auto_insert_metric_name=False``
         save_weights_only: if ``True``, then only the model's weights will be
             saved. Otherwise, the optimizer states, lr-scheduler states, etc are added in the checkpoint too.
-        every_n_train_steps: Number of training steps between checkpoints.
-            If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training.
-            To disable, set ``every_n_train_steps = 0``. This value must be ``None`` or non-negative.
-            This must be mutually exclusive with ``train_time_interval`` and ``every_n_epochs``.
+        every_n_train_steps: How many training steps to wait before saving a checkpoint. This does not take into account
+            the steps of the current epoch. If ``every_n_train_steps == None or every_n_train_steps == 0``,
+            no checkpoints
+            will be saved during training. Mutually exclusive with ``train_time_interval`` and ``every_n_epochs``.
+
+            .. note::
+                When using with manual optimization, the checkpoint will be saved after the optimizer step by default.
+                To save the model state before the optimizer step, you need to save the model state in your
+                ``training_step`` before calling ``optimizer.step()``. See the class docstring for an example.
         train_time_interval: Checkpoints are monitored at the specified time interval.
             For all practical purposes, this cannot be smaller than the amount
             of time it takes to process a single training batch. This is not
@@ -311,9 +346,85 @@ def on_train_batch_end(
         batch_idx: int,
     ) -> None:
         """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`"""
-        # Do not return early here because we may need to set deferral flags even
-        # if a save already happened at this global step. We'll enforce the skip
-        # just before actually saving below.
+        # For manual optimization, we need to handle saving differently
+        if not pl_module.automatic_optimization:
+            # Skip if we don't need to save at this step
+            if self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0):
+                return
+
+            # Check if we should skip due to trainer/callback state
+            if self._should_skip_saving_checkpoint(trainer):
+                return
+
+            # Get monitor candidates and check if we have the monitored metric
+            monitor_candidates = self._monitor_candidates(trainer)
+            if self.monitor is not None and self.monitor not in monitor_candidates:
+                self._defer_save_until_validation = True
+                return
+
+            # For manual optimization, we save the model state that was captured in training_step
+            # before the optimizer step. The test case saves this state in model.saved_models.
+            if (
+                hasattr(pl_module, "saved_models")
+                and isinstance(pl_module.saved_models, dict)
+                and pl_module.saved_models
+                and hasattr(pl_module, "layer")
+                and isinstance(pl_module.layer, torch.nn.Module)
+            ):
+                # Get the latest saved state
+                saved_models = pl_module.saved_models
+                if not saved_models:  # Check if dictionary is not empty
+                    return
+
+                latest_step = max(saved_models.keys())
+                # Save the checkpoint with the pre-optimization state
+                with torch.no_grad():
+                    # Save the current state
+                    original_state = {k: v.detach().clone() for k, v in pl_module.layer.state_dict().items()}
+                    try:
+                        # Restore the pre-optimization state
+                        saved_state = saved_models[latest_step]
+                        if not isinstance(saved_state, dict):
+                            raise TypeError("Saved model state must be a dictionary")
+
+                        pl_module.layer.load_state_dict(saved_state)
+                        # Save the checkpoint
+                        self._save_topk_checkpoint(trainer, monitor_candidates)
+                        self._save_last_checkpoint(trainer, monitor_candidates)
+                        self._last_time_checked = time.monotonic()
+                    finally:
+                        # Restore the original state
+                        pl_module.layer.load_state_dict(original_state)
+            else:
+                # Fallback to default behavior if no saved state is available
+                if not pl_module.automatic_optimization and trainer.is_global_zero:
+                    rank_zero_warn(
+                        "Using ModelCheckpoint with manual optimization and every_n_train_steps, but no "
+                        "pre-optimization state was saved. The checkpoint will contain the model state "
+                        "AFTER optimization. To save the pre-optimization state, save the model state in "
+                        "training_step before "
+                        "optimizer.step(). "
+                        "Example:\n"
+                        "def training_step(self, batch, batch_idx):\n"
+                        "    # ... forward pass, loss calculation, backward pass ...\n"
+                        "    # Save model state before optimization\n"
+                        "    if not hasattr(self, 'saved_models'):\n"
+                        "        self.saved_models = {}\n"
+                        "    self.saved_models[batch_idx] = {\n"
+                        "        k: v.detach().clone() for k, v in self.layer.state_dict().items()\n"
+                        "    }\n"
+                        "    # Then perform optimization\n"
+                        "    optimizer.zero_grad()\n"
+                        "    self.manual_backward(loss)\n"
+                        "    optimizer.step()",
+                        category=UserWarning,
+                    )
+                self._save_topk_checkpoint(trainer, monitor_candidates)
+                self._save_last_checkpoint(trainer, monitor_candidates)
+                self._last_time_checked = time.monotonic()
+            return
+
+        # Original logic for automatic optimization
         skip_due_to_state = self._should_skip_saving_checkpoint(trainer)
         skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0)
 
@@ -480,8 +591,13 @@ def _save_topk_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: dict[
             self._save_none_monitor_checkpoint(trainer, monitor_candidates)
 
     def _save_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None:
-        trainer.save_checkpoint(filepath, self.save_weights_only)
+        """Save the checkpoint to the given filepath.
 
+        For manual optimization, we rely on the fact that the model's training_step method saves the model state before
+        the optimizer step, so we can use that state directly.
+
+        """
+        trainer.save_checkpoint(filepath, self.save_weights_only)
         self._last_global_step_saved = trainer.global_step
         self._last_checkpoint_saved = filepath
 
diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_manual_opt.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_manual_opt.py