
Commit 5bff40a

Merge branch 'master' into master
2 parents 992de00 + 1fc077b commit 5bff40a


44 files changed: +1039 −224 lines changed

.github/workflows/_legacy-checkpoints.yml

Lines changed: 10 additions & 6 deletions
```diff
@@ -57,28 +57,32 @@ jobs:
     steps:
       - uses: actions/checkout@v5
 
-      - uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
-          # Python version here needs to be supported by all PL versions listed in back-compatible-versions.txt.
           python-version: "3.9"
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true
 
       - name: Install PL from source
         env:
           PACKAGE_NAME: pytorch
           FREEZE_REQUIREMENTS: 1
         timeout-minutes: 20
-        run: pip install . --extra-index-url="${TORCH_URL}"
+        run: uv pip install . --extra-index-url="${TORCH_URL}"
         if: inputs.pl_version == ''
 
       - name: Install PL version
         timeout-minutes: 20
-        run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
+        run: uv pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
         if: inputs.pl_version != ''
 
       - name: Adjust tests -> PL
         if: ${{ matrix.pkg-name != 'lightning' }}
         run: |
-          pip install -q -r .actions/requirements.txt
+          uv pip install -q -r .actions/requirements.txt
           python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
             --source_import="lightning.fabric,lightning.pytorch" \
             --target_import="lightning_fabric,pytorch_lightning"
@@ -115,7 +119,7 @@ jobs:
         # export to env bool if secrets.AWS_REGION is not empty
         run: echo "WITH_SECRETS=$([ -n '${{ secrets.AWS_REGION }}' ] && echo 1 || echo 0)" >> $GITHUB_ENV
 
-      - run: pip install -r requirements/ci.txt
+      - run: uv pip install -r requirements/ci.txt
       - name: Upload checkpoints to S3
         if: ${{ env.WITH_SECRETS == '1' }}
         working-directory: ${{ env.LEGACY_FOLDER }}
```
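The `WITH_SECRETS` export in the hunk above packs a small shell idiom: it writes `1` or `0` into the environment depending on whether the `AWS_REGION` secret is non-empty. A minimal sketch of that logic, with a hypothetical region value standing in for the `${{ secrets.AWS_REGION }}` expression (which GitHub substitutes before the shell ever runs):

```shell
# Sketch of the WITH_SECRETS step; AWS_REGION value is hypothetical.
AWS_REGION="us-east-1"
WITH_SECRETS=$([ -n "$AWS_REGION" ] && echo 1 || echo 0)
echo "WITH_SECRETS=$WITH_SECRETS"   # prints WITH_SECRETS=1

# An empty (missing) secret yields 0, so the S3 upload step is skipped.
AWS_REGION=""
WITH_SECRETS=$([ -n "$AWS_REGION" ] && echo 1 || echo 0)
echo "WITH_SECRETS=$WITH_SECRETS"   # prints WITH_SECRETS=0
```

In the workflow the result lands in `$GITHUB_ENV` instead of stdout, so later steps can gate on `env.WITH_SECRETS == '1'`.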

.github/workflows/ci-tests-fabric.yml

Lines changed: 34 additions & 33 deletions
```diff
@@ -62,49 +62,57 @@ jobs:
     env:
       PACKAGE_NAME: ${{ matrix.config.pkg-name }}
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
-      PYPI_CACHE_DIR: "_pip-wheels"
       TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/"
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v5
 
-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.config.python-version || '3.9' }}
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true
 
-      - name: basic setup
-        run: pip install -q -r .actions/requirements.txt
+      - name: Basic setup
+        run: uv pip install -q -r .actions/requirements.txt
+
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for MacOS
+        if: ${{ runner.os == 'macOS' }}
+        run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for Windows
+        if: ${{ runner.os == 'windows' }}
+        run: |
+          # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support"
+          echo "USE_LIBUV=0" >> $GITHUB_ENV
 
       - name: Set min. dependencies
         if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/fabric
-          pip install -U "lightning-utilities[cli]"
+          uv pip install -U "lightning-utilities[cli]"
           python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt', 'test.txt']"
-          pip install "cython<3.0" wheel
-          pip install "pyyaml==5.4" --no-build-isolation
+          uv pip install "cython<3.0" wheel
+          uv pip install "pyyaml==5.4" --no-build-isolation
 
       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.config.requires != 'oldest' }}
         run: |
-          pip install -q -r requirements/ci.txt
+          uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           for fpath in `ls requirements/**/*.txt`; do \
             python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \
           done
 
-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-      - run: |
-          mkdir -p $PYPI_CACHE_DIR
-          ls -lh $PYPI_CACHE_DIR
-
       - name: Expand Env. variables
         run: |
           # Switch PyTorch URL between stable and test/future
@@ -113,25 +121,15 @@ jobs:
           python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.config.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV
           # if you install mono-package set dependency only for this subpackage
           python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.config.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV
-      - name: Append Env. vars for MacOS
-        if: ${{ runner.os == 'macOS' }}
-        run: |
-          # trying to avoid "gloo" issue with SIGABRT
-          echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
-      - name: Append Env. vars for Windows
-        if: ${{ runner.os == 'windows' }}
-        run: |
-          # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support"
-          echo "USE_LIBUV=0" >> $GITHUB_ENV
 
       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
-            -U --upgrade-strategy=eager --prefer-binary \
-            --extra-index-url="${TORCH_URL}" \
-            --find-links="${PYPI_CACHE_DIR}"
-          pip list
+          uv pip install ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
+            --upgrade \
+            --find-links="${TORCH_URL}"
+          uv pip list
+
       - name: Dump handy wheels
         if: github.event_name == 'push' && github.ref == 'refs/heads/master'
         continue-on-error: true
@@ -179,6 +177,9 @@ jobs:
             name: CPU-coverage
             fail_ci_if_error: false
 
+      - name: Minimize uv cache
+        run: uv cache prune --ci
+
   fabric-cpu-guardian:
     runs-on: ubuntu-latest
     needs: fabric-cpu
```
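The three relocated "Append Env. vars" steps each map one runner OS to one environment tweak. Condensed into a single `case` for readability (RUNNER_OS is simulated here; on a real runner GitHub Actions sets it, and the workflow uses three separate `if:`-guarded steps rather than one script):

```shell
RUNNER_OS="Linux"   # simulated; GitHub provides this on real runners
case "$RUNNER_OS" in
  Linux)   echo "GLOO_SOCKET_IFNAME=eth0" ;;  # bind gloo to eth0
  macOS)   echo "GLOO_SOCKET_IFNAME=lo0"  ;;  # loopback, avoids the gloo SIGABRT issue
  Windows) echo "USE_LIBUV=0"             ;;  # PyTorch 2.4 wheels built without libuv
esac
```

In the workflow these `echo` lines are redirected to `$GITHUB_ENV` so the variables persist into later steps.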

.github/workflows/ci-tests-pytorch.yml

Lines changed: 32 additions & 25 deletions
```diff
@@ -69,48 +69,50 @@ jobs:
       TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/"
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/"
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
-      PYPI_CACHE_DIR: "_pip-wheels"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v5
 
-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.config.python-version || '3.9' }}
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true
 
-      - name: basic setup
-        run: pip install -q -r .actions/requirements.txt
+      - name: Basic setup
+        run: uv pip install -q -r .actions/requirements.txt
+
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for MacOS
+        if: ${{ runner.os == 'macOS' }}
+        run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
 
       - name: Set min. dependencies
         if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/pytorch
-          pip install -U "lightning-utilities[cli]"
+          uv pip install -U "lightning-utilities[cli]"
           python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt', 'test.txt']"
-          pip install "cython<3.0" wheel
-          pip install "pyyaml==5.4" --no-build-isolation
+          uv pip install "cython<3.0" wheel
+          uv pip install "pyyaml==5.4" --no-build-isolation
 
       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.config.requires != 'oldest' }}
         run: |
-          pip install -q -r requirements/ci.txt
+          uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           for fpath in `ls requirements/**/*.txt`; do \
             python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \
           done
           cat requirements/pytorch/base.txt
 
-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-      - run: |
-          mkdir -p $PYPI_CACHE_DIR
-          ls -lh $PYPI_CACHE_DIR
-
       - name: Env. variables
         run: |
           # Switch PyTorch URL between stable and test/future
@@ -125,20 +127,22 @@ jobs:
       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
-            -U --upgrade-strategy=eager --prefer-binary \
+          uv pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
+            --upgrade \
             -r requirements/_integrations/accelerators.txt \
-            --extra-index-url="${TORCH_URL}" \
-            --find-links="${PYPI_CACHE_DIR}" \
+            --find-links="${TORCH_URL}" \
             --find-links="https://download.pytorch.org/whl/torch-tensorrt"
-          pip list
+          uv pip list
+
       - name: Drop LAI from extensions
         if: ${{ matrix.config.pkg-name != 'lightning' }}
         # Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it
-        run: pip uninstall -y lightning
+        run: uv pip uninstall lightning
+
       - name: Drop PL for LAI
         if: ${{ matrix.config.pkg-name == 'lightning' }}
-        run: pip uninstall -y pytorch-lightning
+        run: uv pip uninstall pytorch-lightning
+
       - name: Dump handy wheels
         if: github.event_name == 'push' && github.ref == 'refs/heads/master'
         continue-on-error: true
@@ -215,6 +219,9 @@ jobs:
             name: CPU-coverage
             fail_ci_if_error: false
 
+      - name: Minimize uv cache
+        run: uv cache prune --ci
+
   pl-cpu-guardian:
     runs-on: ubuntu-latest
     needs: pl-cpu
```
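Both test workflows compute `FREEZE_REQUIREMENTS` from the same GitHub expression: requirements float (no freeze) only on `master` and `release/*` branches. A small Python sketch of that expression, to make the branch logic explicit (the function name is mine, not part of the workflow):

```python
def freeze_requirements(ref: str) -> bool:
    # Mirrors the workflow expression:
    #   ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/'))
    return not (ref == "refs/heads/master" or ref.startswith("refs/heads/release/"))


print(freeze_requirements("refs/heads/master"))         # False: master uses floating deps
print(freeze_requirements("refs/heads/release/2.4.x"))  # False: release branches too
print(freeze_requirements("refs/heads/my-feature"))     # True: other branches pin requirements
```

So PR builds run with pinned requirements, while the main and release branches pick up the latest compatible dependency versions.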

docs/source-fabric/guide/index.rst

Lines changed: 1 addition & 1 deletion
```diff
@@ -78,7 +78,7 @@ Build your own Trainer
    <div class="row">
 
 .. displayitem::
-   :header: Organize your model code with with LightningModule
+   :header: Organize your model code with LightningModule
    :description: Organize your code in a LightningModule and use it with Fabric
    :button_link: lightning_module.html
    :col_css: col-md-4
```

docs/source-fabric/levels/intermediate.rst

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ Intermediate skills
    <div class="row">
 
 .. displayitem::
-   :header: Organize your model code with with LightningModule
+   :header: Organize your model code with LightningModule
    :description: Organize your code in a LightningModule and use it with Fabric
    :button_link: ../guide/lightning_module.html
    :col_css: col-md-4
```

docs/source-pytorch/accelerators/gpu_faq.rst

Lines changed: 54 additions & 14 deletions
```diff
@@ -5,31 +5,71 @@
 GPU training (FAQ)
 ==================
 
-******************************************************************
-How should I adjust the learning rate when using multiple devices?
-******************************************************************
+***************************************************************
+How should I adjust the batch size when using multiple devices?
+***************************************************************
 
-When using distributed training make sure to modify your learning rate according to your effective
-batch size.
+Lightning automatically shards your data across multiple GPUs, meaning that each device only sees a unique subset of your
+data, but the `batch_size` in your DataLoader remains the same. This means that the effective batch size e.g. the
+total number of samples processed in one forward/backward pass is
 
-Let's say you have a batch size of 7 in your dataloader.
+.. math::
 
-.. testcode::
+    \text{Effective Batch Size} = \text{DataLoader Batch Size} \times \text{Number of Devices} \times \text{Number of Nodes}
 
-    class LitModel(LightningModule):
-        def train_dataloader(self):
-            return Dataset(..., batch_size=7)
-
-Whenever you use multiple devices and/or nodes, your effective batch size will be 7 * devices * num_nodes.
+A couple of examples to illustrate this:
 
 .. code-block:: python
 
-    # effective batch size = 7 * 8
+    dataloader = DataLoader(..., batch_size=7)
+
+    # Single GPU: effective batch size = 7
+    Trainer(accelerator="gpu", devices=1)
+
+    # Multi-GPU: effective batch size = 7 * 8 = 56
     Trainer(accelerator="gpu", devices=8, strategy=...)
 
-    # effective batch size = 7 * 8 * 10
+    # Multi-node: effective batch size = 7 * 8 * 10 = 560
     Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy=...)
 
+In general you should be able to use the same `batch_size` in your DataLoader regardless of the number of devices you are
+using.
+
+.. note::
+
+    If you want distributed training to work exactly the same as single GPU training, you need to set the `batch_size`
+    in your DataLoader to `original_batch_size / num_devices` to maintain the same effective batch size. However, this
+    can lead to poor GPU utilization.
+
+----
+
+******************************************************************
+How should I adjust the learning rate when using multiple devices?
+******************************************************************
+
+Because the effective batch size is larger when using multiple devices, you need to adjust your learning rate
+accordingly. Because the learning rate is a hyperparameter that controls how much to change the model in response to
+the estimated error each time the model weights are updated, it is important to scale it with the effective batch size.
+
+In general, there are two common scaling rules:
+
+1. **Linear scaling**: Increase the learning rate linearly with the number of devices.
+
+.. code-block:: python
+
+    # Example: Linear scaling
+    base_lr = 1e-3
+    num_devices = 8
+    scaled_lr = base_lr * num_devices  # 8e-3
+
+2. **Square root scaling**: Increase the learning rate by the square root of the number of devices.
+
+.. code-block:: python
+
+    # Example: Square root scaling
+    base_lr = 1e-3
+    num_devices = 8
+    scaled_lr = base_lr * (num_devices ** 0.5)  # 2.83e-3
 
 .. note:: Huge batch sizes are actually really bad for convergence. Check out:
     `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour <https://arxiv.org/abs/1706.02677>`_
```
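The rewritten FAQ boils down to two formulas: effective batch size is the DataLoader batch size times devices times nodes, and the learning rate is scaled either linearly or by the square root of the device count. A quick numeric check of the examples used in the diff (the helper function is mine, for illustration):

```python
import math


def effective_batch_size(loader_bs: int, devices: int, nodes: int = 1) -> int:
    # Effective Batch Size = DataLoader Batch Size x Number of Devices x Number of Nodes
    return loader_bs * devices * nodes


print(effective_batch_size(7, 1))      # 7   (single GPU)
print(effective_batch_size(7, 8))      # 56  (multi-GPU)
print(effective_batch_size(7, 8, 10))  # 560 (multi-node)

# Learning-rate scaling rules from the FAQ
base_lr, num_devices = 1e-3, 8
print(base_lr * num_devices)             # 0.008: linear scaling
print(base_lr * math.sqrt(num_devices))  # ~0.00283: square-root scaling
```

These match the figures the new docs quote (56, 560, 8e-3, and 2.83e-3 respectively).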
