docs/source-pytorch/common/checkpointing_intermediate.rst
5 additions & 3 deletions
@@ -167,9 +167,11 @@ In distributed training cases where a model is running across many machines, Lig
 trainer = Trainer(strategy="ddp")
 model = MyLightningModule(hparams)
 trainer.fit(model)
+
 # Saves only on the main process
+# Handles strategy-specific saving logic like XLA, FSDP, DeepSpeed etc.
 trainer.save_checkpoint("example.ckpt")
 
-Not using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` can lead to unexpected behavior and potential deadlock. Using other saving functions will result in all devices attempting to save the checkpoint. As a result, we highly recommend using the Trainer's save functionality.
-If using custom saving functions cannot be avoided, we recommend using the :func:`~lightning.pytorch.utilities.rank_zero.rank_zero_only` decorator to ensure saving occurs only on the main process. Note that this will only work if all ranks hold the exact same state and won't work when using
-model parallel distributed strategies such as deepspeed or sharded training.
+
+By using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` instead of ``torch.save``, you make your code agnostic to the distributed training strategy being used.
+It will ensure that checkpoints are saved correctly in a multi-process setting, avoiding race conditions, deadlocks and other common issues that normally require boilerplate code to handle properly.
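Below is a minimal, self-contained sketch of the behavior the new wording describes: every rank calls ``trainer.save_checkpoint`` and the strategy writes the file only from the main process. It assumes PyTorch Lightning 2.x with the ``lightning.pytorch`` namespace used in the doc references; the toy module, dataset, and the helper ``save_raw_state_dict`` (illustrating the ``rank_zero_only`` fallback the removed paragraph mentioned) are illustrative stand-ins, not part of the docs.

# Minimal sketch (assumed setup, not part of the documented example):
# toy module and dataset, CPU DDP with 2 processes.
import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning.pytorch as pl
from lightning.pytorch.utilities.rank_zero import rank_zero_only


class MyLightningModule(pl.LightningModule):
    def __init__(self, hidden: int = 8):
        super().__init__()
        self.save_hyperparameters()
        self.layer = torch.nn.Linear(4, hidden)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).pow(2).mean()  # dummy loss, just to make fit() run

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


if __name__ == "__main__":
    data = DataLoader(TensorDataset(torch.randn(32, 4)), batch_size=8)
    trainer = pl.Trainer(
        strategy="ddp",
        accelerator="cpu",
        devices=2,
        max_epochs=1,
        logger=False,
        enable_checkpointing=False,
    )
    model = MyLightningModule()
    trainer.fit(model, data)

    # Recommended: every rank calls this, but the strategy handles the
    # strategy-specific saving logic and only the main process writes,
    # so there are no race conditions or deadlocks to handle by hand.
    trainer.save_checkpoint("example.ckpt")

    # Fallback described by the previous wording of this page: guard a custom
    # save with ``rank_zero_only``. This only works when all ranks hold the
    # exact same state, i.e. not with model-parallel strategies such as
    # DeepSpeed or sharded training.
    @rank_zero_only
    def save_raw_state_dict(path: str) -> None:  # hypothetical helper
        torch.save(model.state_dict(), path)

    save_raw_state_dict("example_raw.pt")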