Commit ca91286

Merge branch 'master' into ioannis@18861-CSVLogger-fails-on-remote-fs

2 parents 936dc8b + 9f757c0
File tree

28 files changed: +710 -148 lines changed

.github/workflows/_legacy-checkpoints.yml

Lines changed: 10 additions & 6 deletions
@@ -57,28 +57,32 @@ jobs:
     steps:
       - uses: actions/checkout@v5

-      - uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
-          # Python version here needs to be supported by all PL versions listed in back-compatible-versions.txt.
           python-version: "3.9"
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true

       - name: Install PL from source
         env:
           PACKAGE_NAME: pytorch
           FREEZE_REQUIREMENTS: 1
         timeout-minutes: 20
-        run: pip install . --extra-index-url="${TORCH_URL}"
+        run: uv pip install . --extra-index-url="${TORCH_URL}"
         if: inputs.pl_version == ''

       - name: Install PL version
         timeout-minutes: 20
-        run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
+        run: uv pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
         if: inputs.pl_version != ''

       - name: Adjust tests -> PL
         if: ${{ matrix.pkg-name != 'lightning' }}
         run: |
-          pip install -q -r .actions/requirements.txt
+          uv pip install -q -r .actions/requirements.txt
           python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
             --source_import="lightning.fabric,lightning.pytorch" \
             --target_import="lightning_fabric,pytorch_lightning"
@@ -115,7 +119,7 @@ jobs:
         # export to env bool if secrets.AWS_REGION is not empty
         run: echo "WITH_SECRETS=$([ -n '${{ secrets.AWS_REGION }}' ] && echo 1 || echo 0)" >> $GITHUB_ENV

-      - run: pip install -r requirements/ci.txt
+      - run: uv pip install -r requirements/ci.txt
       - name: Upload checkpoints to S3
         if: ${{ env.WITH_SECRETS == '1' }}
         working-directory: ${{ env.LEGACY_FOLDER }}

.github/workflows/ci-tests-fabric.yml

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ jobs:
         run: pip install -q -r .actions/requirements.txt

       - name: Set min. dependencies
-        if: ${{ matrix.requires == 'oldest' }}
+        if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/fabric
           pip install -U "lightning-utilities[cli]"
@@ -88,7 +88,7 @@ jobs:
           pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
-        if: ${{ matrix.requires != 'oldest' }}
+        if: ${{ matrix.config.requires != 'oldest' }}
         run: |
           pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py

.github/workflows/ci-tests-pytorch.yml

Lines changed: 33 additions & 27 deletions
@@ -69,48 +69,49 @@ jobs:
       TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/"
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/"
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
-      PYPI_CACHE_DIR: "_pip-wheels"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v5

-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.config.python-version || '3.9' }}
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true

-      - name: basic setup
-        run: pip install -q -r .actions/requirements.txt
+      - name: Basic setup
+        run: uv pip install -q -r .actions/requirements.txt
+
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+      - name: Append Env. vars for MacOS
+        if: ${{ runner.os == 'macOS' }}
+        run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV

       - name: Set min. dependencies
-        if: ${{ matrix.requires == 'oldest' }}
+        if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/pytorch
-          pip install -U "lightning-utilities[cli]"
+          uv pip install -U "lightning-utilities[cli]"
           python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt', 'test.txt']"
-          pip install "cython<3.0" wheel
-          pip install "pyyaml==5.4" --no-build-isolation
+          uv pip install "cython<3.0" wheel
+          uv pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
-        if: ${{ matrix.requires != 'oldest' }}
+        if: ${{ matrix.config.requires != 'oldest' }}
         run: |
-          pip install -q -r requirements/ci.txt
+          uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           for fpath in `ls requirements/**/*.txt`; do \
            python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \
          done
           cat requirements/pytorch/base.txt

-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-      - run: |
-          mkdir -p $PYPI_CACHE_DIR
-          ls -lh $PYPI_CACHE_DIR
-
       - name: Env. variables
         run: |
           # Switch PyTorch URL between stable and test/future
@@ -125,20 +126,22 @@ jobs:
       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
-            -U --upgrade-strategy=eager --prefer-binary \
+          uv pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
+            --upgrade \
             -r requirements/_integrations/accelerators.txt \
-            --extra-index-url="${TORCH_URL}" \
-            --find-links="${PYPI_CACHE_DIR}" \
+            --find-links="${TORCH_URL}" \
            --find-links="https://download.pytorch.org/whl/torch-tensorrt"
-          pip list
+          uv pip list
+
       - name: Drop LAI from extensions
         if: ${{ matrix.config.pkg-name != 'lightning' }}
         # Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it
-        run: pip uninstall -y lightning
+        run: uv pip uninstall lightning
+
       - name: Drop PL for LAI
         if: ${{ matrix.config.pkg-name == 'lightning' }}
-        run: pip uninstall -y pytorch-lightning
+        run: uv pip uninstall pytorch-lightning
+
       - name: Dump handy wheels
         if: github.event_name == 'push' && github.ref == 'refs/heads/master'
         continue-on-error: true
@@ -215,6 +218,9 @@ jobs:
           name: CPU-coverage
           fail_ci_if_error: false

+      - name: Minimize uv cache
+        run: uv cache prune --ci
+
   pl-cpu-guardian:
     runs-on: ubuntu-latest
     needs: pl-cpu

docs/source-pytorch/accelerators/gpu_faq.rst

Lines changed: 54 additions & 14 deletions
@@ -5,31 +5,71 @@
 GPU training (FAQ)
 ==================

-******************************************************************
-How should I adjust the learning rate when using multiple devices?
-******************************************************************
+***************************************************************
+How should I adjust the batch size when using multiple devices?
+***************************************************************

-When using distributed training make sure to modify your learning rate according to your effective
-batch size.
+Lightning automatically shards your data across multiple GPUs, meaning that each device only sees a unique subset of your
+data, but the `batch_size` in your DataLoader remains the same. This means that the effective batch size, i.e. the
+total number of samples processed in one forward/backward pass, is

-Let's say you have a batch size of 7 in your dataloader.
+.. math::

-.. testcode::
+    \text{Effective Batch Size} = \text{DataLoader Batch Size} \times \text{Number of Devices} \times \text{Number of Nodes}

-    class LitModel(LightningModule):
-        def train_dataloader(self):
-            return Dataset(..., batch_size=7)
-
-Whenever you use multiple devices and/or nodes, your effective batch size will be 7 * devices * num_nodes.
+A couple of examples to illustrate this:

 .. code-block:: python

-    # effective batch size = 7 * 8
+    dataloader = DataLoader(..., batch_size=7)
+
+    # Single GPU: effective batch size = 7
+    Trainer(accelerator="gpu", devices=1)
+
+    # Multi-GPU: effective batch size = 7 * 8 = 56
     Trainer(accelerator="gpu", devices=8, strategy=...)

-    # effective batch size = 7 * 8 * 10
+    # Multi-node: effective batch size = 7 * 8 * 10 = 560
     Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy=...)

+In general you should be able to use the same `batch_size` in your DataLoader regardless of the number of devices you are
+using.
+
+.. note::
+
+    If you want distributed training to work exactly the same as single GPU training, you need to set the `batch_size`
+    in your DataLoader to `original_batch_size / num_devices` to maintain the same effective batch size. However, this
+    can lead to poor GPU utilization.
+
+----
+
+******************************************************************
+How should I adjust the learning rate when using multiple devices?
+******************************************************************
+
+Because the effective batch size is larger when using multiple devices, you need to adjust your learning rate
+accordingly. Since the learning rate is a hyperparameter that controls how much to change the model in response to
+the estimated error each time the model weights are updated, it is important to scale it with the effective batch size.
+
+In general, there are two common scaling rules:
+
+1. **Linear scaling**: Increase the learning rate linearly with the number of devices.
+
+   .. code-block:: python
+
+       # Example: Linear scaling
+       base_lr = 1e-3
+       num_devices = 8
+       scaled_lr = base_lr * num_devices  # 8e-3
+
+2. **Square root scaling**: Increase the learning rate by the square root of the number of devices.
+
+   .. code-block:: python
+
+       # Example: Square root scaling
+       base_lr = 1e-3
+       num_devices = 8
+       scaled_lr = base_lr * (num_devices ** 0.5)  # 2.83e-3

 .. note:: Huge batch sizes are actually really bad for convergence. Check out:
     `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour <https://arxiv.org/abs/1706.02677>`_
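
Since the commit documents these scaling rules, a natural follow-up is where to apply them. Below is a minimal sketch (not part of this commit) of one way to wire the linear rule into a LightningModule; the `base_lr` hyperparameter name is hypothetical, while `trainer.world_size` (devices times nodes) is an existing Lightning property.

.. code-block:: python

    import torch
    from lightning.pytorch import LightningModule


    class LitModel(LightningModule):
        def __init__(self, base_lr: float = 1e-3):
            super().__init__()
            self.save_hyperparameters()

        def configure_optimizers(self):
            # world_size == devices * num_nodes, matching the effective
            # batch size formula above; illustrative sketch only, not
            # code from this commit.
            scaled_lr = self.hparams.base_lr * self.trainer.world_size
            return torch.optim.SGD(self.parameters(), lr=scaled_lr)

With `devices=8` on a single node, `world_size` is 8 and the optimizer starts at `8e-3`, matching the linear-scaling example in the diff.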

docs/source-pytorch/common/hooks.rst

Lines changed: 38 additions & 19 deletions
@@ -83,13 +83,30 @@ with the source of each hook indicated:
     trainer.fit()

     ├── setup(stage="fit")
-    │   └── [Callbacks only]
-
-    ├── on_fit_start()
+    │   ├── [LightningDataModule]
     │   ├── [Callbacks]
     │   ├── [LightningModule]
+    │   ├── [LightningModule.configure_shared_model()]
+    │   ├── [LightningModule.configure_model()]
+    │   ├── Strategy.restore_checkpoint_before_setup
+    │   │   ├── [LightningModule.on_load_checkpoint()]
+    │   │   ├── [LightningModule.load_state_dict()]
+    │   │   ├── [LightningDataModule.load_state_dict()]
+    │   │   ├── [Callbacks.on_load_checkpoint()]
+    │   │   └── [Callbacks.load_state_dict()]
     │   └── [Strategy]

+    ├── on_fit_start()
+    │   ├── [Callbacks]
+    │   └── [LightningModule]
+
+    ├── Strategy.restore_checkpoint_after_setup
+    │   ├── [LightningModule.on_load_checkpoint()]
+    │   ├── [LightningModule.load_state_dict()]
+    │   ├── [LightningDataModule.load_state_dict()]
+    │   ├── [Callbacks.on_load_checkpoint()]
+    │   └── [Callbacks.load_state_dict()]
+
     ├── on_sanity_check_start()
     │   ├── [Callbacks]
     │   ├── [LightningModule]
@@ -143,23 +160,24 @@ with the source of each hook indicated:
     │   │   │   ├── [LightningModule]
     │   │   │   └── [Strategy]
     │   │   │
-    │   │   ├── on_before_zero_grad()
-    │   │   │   ├── [Callbacks]
-    │   │   │   └── [LightningModule]
-    │   │   │
     │   │   ├── [Forward Pass - training_step()]
     │   │   │   └── [Strategy only]
     │   │   │
-    │   │   ├── on_before_backward()
+    │   │   ├── on_before_zero_grad()
     │   │   │   ├── [Callbacks]
     │   │   │   └── [LightningModule]
     │   │   │
-    │   │   ├── [Backward Pass]
-    │   │   │   └── [Strategy only]
+    │   │   ├── optimizer_zero_grad()
+    │   │   │   └── [LightningModule only - optimizer_zero_grad()]
     │   │   │
-    │   │   ├── on_after_backward()
-    │   │   │   ├── [Callbacks]
-    │   │   │   └── [LightningModule]
+    │   │   ├── [Backward Pass - Strategy.backward()]
+    │   │   │   ├── on_before_backward()
+    │   │   │   │   ├── [Callbacks]
+    │   │   │   │   └── [LightningModule]
+    │   │   │   ├── LightningModule.backward()
+    │   │   │   └── on_after_backward()
+    │   │   │       ├── [Callbacks]
+    │   │   │       └── [LightningModule]
     │   │   │
     │   │   ├── on_before_optimizer_step()
     │   │   │   ├── [Callbacks]
@@ -212,13 +230,14 @@ with the source of each hook indicated:
     │   ├── [LightningModule]
     │   └── [Strategy]

-    ├── on_fit_end()
-    │   ├── [Callbacks]
-    │   ├── [LightningModule]
-    │   └── [Strategy]
-
     └── teardown(stage="fit")
-        └── [Callbacks only]
+        ├── [Strategy]
+        ├── on_fit_end()
+        │   ├── [Callbacks]
+        │   └── [LightningModule]
+        ├── [LightningDataModule]
+        ├── [Callbacks]
+        └── [LightningModule]

 ***********************
 Testing Loop Hook Order
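
The reordering above is easiest to verify empirically. Here is a minimal sketch (not part of this commit) of a callback that prints a few of the fit() hooks so the documented order can be observed on a toy run; the hook signatures are Lightning's real Callback API, only the class name is invented.

.. code-block:: python

    from lightning.pytorch.callbacks import Callback


    class HookOrderLogger(Callback):
        """Print a handful of fit() hooks to observe their order."""

        def setup(self, trainer, pl_module, stage):
            print(f"setup(stage={stage!r})")

        def on_fit_start(self, trainer, pl_module):
            print("on_fit_start()")

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
            print(f"on_train_batch_start(batch_idx={batch_idx})")

        def on_fit_end(self, trainer, pl_module):
            print("on_fit_end()")

        def teardown(self, trainer, pl_module, stage):
            print(f"teardown(stage={stage!r})")

Passing `HookOrderLogger()` via `Trainer(callbacks=[...])` and running `trainer.fit(model)` prints these hooks in the order shown in the tree.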

docs/source-pytorch/common/trainer.rst

Lines changed: 3 additions & 1 deletion
@@ -510,6 +510,7 @@ limit_train_batches

 How much of training dataset to check.
 Useful when debugging or testing something that happens at the end of an epoch.
+Value is per device.

 .. testcode::

@@ -535,7 +536,7 @@ limit_test_batches
     :width: 400
     :muted:

-How much of test dataset to check.
+How much of test dataset to check. Value is per device.

 .. testcode::

@@ -560,6 +561,7 @@ limit_val_batches

 How much of validation dataset to check.
 Useful when debugging or testing something that happens at the end of an epoch.
+Value is per device.

 .. testcode::
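
To make the per-device wording concrete, here is a short illustrative sketch (not from this commit; the numbers are hypothetical):

.. code-block:: python

    from lightning.pytorch import Trainer

    # Integer limits count batches per device: with 8 devices, each one
    # processes up to 100 batches, i.e. up to 800 batches per epoch overall.
    trainer = Trainer(limit_train_batches=100, accelerator="gpu", devices=8)

    # Float limits are a fraction of the batches each device would see.
    trainer = Trainer(limit_val_batches=0.25)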

docs/source-pytorch/expertise_levels.rst

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ Learn to scale up your models and enable collaborative model development at acad
 .. Add callout items below this line

 .. displayitem::
-   :header: Level 7: Interactive cloud development
+   :header: Level 7: Hardware acceleration
    :description: Learn how to access GPUs and TPUs on the cloud.
    :button_link: levels/intermediate_level_7.html
    :col_css: col-md-6

docs/source-pytorch/levels/intermediate.rst

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Learn to scale up your models and enable collaborative model development at acad
 .. Add callout items below this line

 .. displayitem::
-   :header: Level 7: Interactive cloud development
+   :header: Level 7: Hardware acceleration
    :description: Learn how to access GPUs and TPUs on the cloud.
    :button_link: intermediate_level_7.html
    :col_css: col-md-6
