
Commit 2a827f3

Docs fixes (#19529)
1 parent 2e512d4 commit 2a827f3

5 files changed (+22 additions, -23 deletions)


docs/source-pytorch/common/checkpointing_intermediate.rst

Lines changed: 5 additions & 3 deletions
@@ -167,9 +167,11 @@ In distributed training cases where a model is running across many machines, Lig
     trainer = Trainer(strategy="ddp")
     model = MyLightningModule(hparams)
     trainer.fit(model)
+
     # Saves only on the main process
+    # Handles strategy-specific saving logic like XLA, FSDP, DeepSpeed etc.
     trainer.save_checkpoint("example.ckpt")
 
-Not using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` can lead to unexpected behavior and potential deadlock. Using other saving functions will result in all devices attempting to save the checkpoint. As a result, we highly recommend using the Trainer's save functionality.
-If using custom saving functions cannot be avoided, we recommend using the :func:`~lightning.pytorch.utilities.rank_zero.rank_zero_only` decorator to ensure saving occurs only on the main process. Note that this will only work if all ranks hold the exact same state and won't work when using
-model parallel distributed strategies such as deepspeed or sharded training.
+
+By using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` instead of ``torch.save``, you make your code agnostic to the distributed training strategy being used.
+It will ensure that checkpoints are saved correctly in a multi-process setting, avoiding race conditions, deadlocks and other common issues that normally require boilerplate code to handle properly.
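For context, a minimal sketch of the strategy-agnostic pattern the reworded paragraph describes. The module, dataset and Trainer flags below are made up for illustration and are not part of this commit:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    import lightning.pytorch as pl


    class TinyModule(pl.LightningModule):
        """Illustrative stand-in for MyLightningModule from the docs example."""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(8, 1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return torch.nn.functional.mse_loss(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.01)


    if __name__ == "__main__":
        data = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=16)
        model = TinyModule()
        trainer = pl.Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
        trainer.fit(model, data)

        # Safe to call from every process: Lightning routes the write through the
        # active strategy, so only the rank that should write the file actually does.
        trainer.save_checkpoint("example.ckpt")

        # Restoring is equally strategy-agnostic.
        restored = TinyModule.load_from_checkpoint("example.ckpt")

Running the same script under ``Trainer(strategy="ddp", devices=2)`` would not require any change to the saving call, which is the point of the docs edit.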

docs/source-pytorch/starter/installation.rst

Lines changed: 6 additions & 4 deletions
@@ -16,7 +16,9 @@ Install lightning inside a virtual env or conda environment with pip
 
     python -m pip install lightning
 
---------------
+
+----
+
 
 ******************
 Install with Conda
@@ -66,17 +68,17 @@ Install future patch releases from the source. Note that the patch release conta
 ^^^^^^^^^^^^^^^^^^^^^^
 Custom PyTorch Version
 ^^^^^^^^^^^^^^^^^^^^^^
-To use any PyTorch version visit the `PyTorch Installation Page <https://pytorch.org/get-started/locally/#start-locally>`_.
 
+To use any PyTorch version visit the `PyTorch Installation Page <https://pytorch.org/get-started/locally/#start-locally>`_.
 You can find the list of supported PyTorch versions in our :ref:`compatibility matrix <versioning:Compatibility matrix>`.
 
 ----
 
 
 *******************************************
-Optimized for ML workflows (lightning Apps)
+Optimized for ML workflows (Lightning Apps)
 *******************************************
-If you are deploying workflows built with Lightning in production and require fewer dependencies, try using the optimized `lightning[apps]` package:
+If you are deploying workflows built with Lightning in production and require fewer dependencies, try using the optimized ``lightning[apps]`` package:
 
 .. code-block:: bash
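As a quick sanity check after installing (not part of the commit), the versions actually picked up by the environment can be printed and compared against the compatibility matrix mentioned above:

    # Print the installed versions to compare against the compatibility matrix.
    import lightning
    import torch

    print("lightning:", lightning.__version__)
    print("torch:", torch.__version__)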

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 5 additions & 6 deletions
@@ -89,13 +89,12 @@ class ModelCheckpoint(Checkpoint):
     in a deterministic manner. Default: ``None``.
 save_top_k: if ``save_top_k == k``,
     the best k models according to the quantity monitored will be saved.
-    if ``save_top_k == 0``, no models are saved.
-    if ``save_top_k == -1``, all models are saved.
+    If ``save_top_k == 0``, no models are saved.
+    If ``save_top_k == -1``, all models are saved.
     Please note that the monitors are checked every ``every_n_epochs`` epochs.
-    if ``save_top_k >= 2`` and the callback is called multiple
-    times inside an epoch, the name of the saved file will be
-    appended with a version count starting with ``v1``
-    unless ``enable_version_counter`` is set to False.
+    If ``save_top_k >= 2`` and the callback is called multiple times inside an epoch, and the filename remains
+    unchanged, the name of the saved file will be appended with a version count starting with ``v1`` to avoid
+    collisions unless ``enable_version_counter`` is set to False.
 mode: one of {min, max}.
     If ``save_top_k != 0``, the decision to overwrite the current save file is made
     based on either the maximization or the minimization of the monitored quantity.
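A small usage sketch of the ``save_top_k`` behaviour described in the reworded docstring; the monitored key and filename are illustrative assumptions, not from the commit:

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import ModelCheckpoint

    # Keep the two best checkpoints by validation loss. Because save_top_k >= 2 and
    # the filename is static, saves that would collide inside one epoch get a version
    # suffix starting at v1, unless enable_version_counter is set to False.
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",  # assumes the LightningModule logs "val_loss"
        mode="min",
        save_top_k=2,
        filename="best",
    )

    trainer = Trainer(callbacks=[checkpoint_callback])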

src/lightning/pytorch/core/hooks.py

Lines changed: 4 additions & 0 deletions
@@ -85,6 +85,10 @@ def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int) -
     batch: The batched data as it is returned by the training DataLoader.
     batch_idx: the index of the batch
 
+Note:
+    The value ``outputs["loss"]`` here will be the normalized value w.r.t ``accumulate_grad_batches`` of the
+    loss returned from ``training_step``.
+
 """
 
 def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
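A hedged sketch of what the new note means in practice, reusing ``TinyModule`` from the earlier checkpointing sketch; the logged key is made up:

    class AccumulationAwareModule(TinyModule):  # TinyModule defined in the first sketch
        def on_train_batch_end(self, outputs, batch, batch_idx):
            # outputs["loss"] has already been divided by accumulate_grad_batches,
            # so multiply it back to recover the value training_step returned.
            raw_loss = outputs["loss"] * self.trainer.accumulate_grad_batches
            self.log("train/unnormalized_loss", raw_loss)

With ``Trainer(accumulate_grad_batches=4)``, ``outputs["loss"]`` at this hook is one quarter of the returned loss, so the multiplication restores it.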

src/lightning/pytorch/core/module.py

Lines changed: 2 additions & 10 deletions
@@ -1285,20 +1285,12 @@ def optimizer_step(
 
 Examples::
 
-    # DEFAULT
     def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-        optimizer.step(closure=optimizer_closure)
+        # Add your custom logic to run directly before `optimizer.step()`
 
-    # Learning rate warm-up
-    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-        # update params
         optimizer.step(closure=optimizer_closure)
 
-        # manually warm up lr without a scheduler
-        if self.trainer.global_step < 500:
-            lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
-            for pg in optimizer.param_groups:
-                pg["lr"] = lr_scale * self.learning_rate
+        # Add your custom logic to run directly after `optimizer.step()`
 
 """
 optimizer.step(closure=optimizer_closure)
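The warm-up logic this commit drops from the docstring is still a reasonable illustration of "custom logic before ``optimizer.step()``". A hedged sketch, again reusing ``TinyModule`` from the first example; the 500-step horizon and the base learning rate are illustrative:

    class WarmupModule(TinyModule):  # TinyModule defined in the first sketch
        def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
            # Before the step: manually warm up the learning rate over the first
            # 500 optimizer steps (0.01 is TinyModule's base LR).
            if self.trainer.global_step < 500:
                lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
                for pg in optimizer.param_groups:
                    pg["lr"] = lr_scale * 0.01

            optimizer.step(closure=optimizer_closure)

            # After the step: any post-step logic (e.g. inspecting gradients
            # or the current learning rate) could go here.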
