
Commit 6f04f9c

Merge branch 'master' into fsdp-grad-clip-by-norm

2 parents 395c7fd + 663b6ce
File tree: 6 files changed, +100 -38 lines

.github/workflows/ci-tests-fabric.yml

Lines changed: 34 additions & 33 deletions
@@ -62,49 +62,57 @@ jobs:
     env:
       PACKAGE_NAME: ${{ matrix.config.pkg-name }}
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
-      PYPI_CACHE_DIR: "_pip-wheels"
       TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/"
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v5

-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.config.python-version || '3.9' }}
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true

-      - name: basic setup
-        run: pip install -q -r .actions/requirements.txt
+      - name: Basic setup
+        run: uv pip install -q -r .actions/requirements.txt
+
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for MacOS
+        if: ${{ runner.os == 'macOS' }}
+        run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for Windows
+        if: ${{ runner.os == 'windows' }}
+        run: |
+          # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support"
+          echo "USE_LIBUV=0" >> $GITHUB_ENV

       - name: Set min. dependencies
         if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/fabric
-          pip install -U "lightning-utilities[cli]"
+          uv pip install -U "lightning-utilities[cli]"
           python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt', 'test.txt']"
-          pip install "cython<3.0" wheel
-          pip install "pyyaml==5.4" --no-build-isolation
+          uv pip install "cython<3.0" wheel
+          uv pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.config.requires != 'oldest' }}
         run: |
-          pip install -q -r requirements/ci.txt
+          uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           for fpath in `ls requirements/**/*.txt`; do \
             python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \
           done

-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-      - run: |
-          mkdir -p $PYPI_CACHE_DIR
-          ls -lh $PYPI_CACHE_DIR
-
       - name: Expand Env. variables
         run: |
           # Switch PyTorch URL between stable and test/future
@@ -113,25 +121,15 @@ jobs:
           python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.config.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV
           # if you install mono-package set dependency only for this subpackage
           python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.config.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV
-      - name: Append Env. vars for MacOS
-        if: ${{ runner.os == 'macOS' }}
-        run: |
-          # trying to avoid "gloo" issue with SIGABRT
-          echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
-      - name: Append Env. vars for Windows
-        if: ${{ runner.os == 'windows' }}
-        run: |
-          # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support"
-          echo "USE_LIBUV=0" >> $GITHUB_ENV

       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
-            -U --upgrade-strategy=eager --prefer-binary \
-            --extra-index-url="${TORCH_URL}" \
-            --find-links="${PYPI_CACHE_DIR}"
-          pip list
+          uv pip install ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
+            --upgrade \
+            --find-links="${TORCH_URL}"
+          uv pip list
+
       - name: Dump handy wheels
         if: github.event_name == 'push' && github.ref == 'refs/heads/master'
         continue-on-error: true
@@ -179,6 +177,9 @@ jobs:
           name: CPU-coverage
           fail_ci_if_error: false

+      - name: Minimize uv cache
+        run: uv cache prune --ci
+
   fabric-cpu-guardian:
     runs-on: ubuntu-latest
     needs: fabric-cpu
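For reference, the mono-package logic encoded by the `python -c` one-liners in the "Expand Env. variables" step can be sketched in plain Python (the function names here are hypothetical; the expressions are copied from the workflow):

```python
def extra_prefix(pkg_name: str) -> str:
    # Mirrors the workflow expression: when installing the mono-package
    # "lightning", extras are namespaced as "fabric-test" / "fabric-strategies";
    # standalone subpackages use the bare extras names.
    return "" if pkg_name != "lightning" else "fabric-"


def coverage_scope(pkg_name: str) -> str:
    # Mirrors the COVERAGE_SCOPE expression from the same step.
    return "lightning" if pkg_name == "lightning" else "lightning_fabric"


print(extra_prefix("lightning"))   # fabric-
print(coverage_scope("fabric"))    # lightning_fabric
```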

.github/workflows/ci-tests-pytorch.yml

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ jobs:
       - name: Append Env. vars for Linux
         if: ${{ runner.os == 'Linux' }}
         run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
       - name: Append Env. vars for MacOS
         if: ${{ runner.os == 'macOS' }}
         run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV

docs/source-fabric/guide/index.rst

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ Build your own Trainer
    <div class="row">

 .. displayitem::
-   :header: Organize your model code with with LightningModule
+   :header: Organize your model code with LightningModule
    :description: Organize your code in a LightningModule and use it with Fabric
    :button_link: lightning_module.html
    :col_css: col-md-4

docs/source-fabric/levels/intermediate.rst

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ Intermediate skills
    <div class="row">

 .. displayitem::
-   :header: Organize your model code with with LightningModule
+   :header: Organize your model code with LightningModule
    :description: Organize your code in a LightningModule and use it with Fabric
    :button_link: ../guide/lightning_module.html
    :col_css: col-md-4

src/lightning/pytorch/callbacks/device_stats_monitor.py

Lines changed: 61 additions & 0 deletions
@@ -34,6 +34,67 @@ class DeviceStatsMonitor(Callback):
     r"""Automatically monitors and logs device stats during training, validation and testing stage.
     ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``.

+    **Logged Metrics**
+
+    Logs device statistics with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}``.
+    The actual metrics depend on the active accelerator and the ``cpu_stats`` flag. Below are an overview of the
+    possible available metrics and their meaning.
+
+    - CPU (via ``psutil``)
+
+      - ``cpu_percent`` — System-wide CPU utilization (%)
+      - ``cpu_vm_percent`` — System-wide virtual memory (RAM) utilization (%)
+      - ``cpu_swap_percent`` — System-wide swap memory utilization (%)
+
+    - CUDA GPU (via ``torch.cuda.memory_stats``)
+
+      Logs memory statistics from PyTorch caching allocator (all in bytes).
+      GPU compute utilization is not logged by default.
+
+      - General Memory Usage:
+
+        - ``allocated_bytes.all.current`` — Current allocated GPU memory
+        - ``allocated_bytes.all.peak`` — Peak allocated GPU memory
+        - ``reserved_bytes.all.current`` — Current reserved GPU memory (allocated + cached)
+        - ``reserved_bytes.all.peak`` — Peak reserved GPU memory
+        - ``active_bytes.all.current`` — Current GPU memory in active use
+        - ``active_bytes.all.peak`` — Peak GPU memory in active use
+        - ``inactive_split_bytes.all.current`` — Memory in inactive, splittable blocks
+
+      - *Allocator Pool Statistics* (for ``small_pool`` and ``large_pool``):
+
+        - ``allocated_bytes.{pool_type}.current`` / ``allocated_bytes.{pool_type}.peak``
+        - ``reserved_bytes.{pool_type}.current`` / ``reserved_bytes.{pool_type}.peak``
+        - ``active_bytes.{pool_type}.current`` / ``active_bytes.{pool_type}.peak``
+
+      - Allocator Events:
+
+        - ``num_ooms`` — Cumulative out-of-memory errors
+        - ``num_alloc_retries`` — Number of allocation retries
+        - ``num_device_alloc`` — Number of device allocations
+        - ``num_device_free`` — Number of device deallocations
+
+      For a full list of CUDA memory stats, see the
+      `PyTorch documentation <https://docs.pytorch.org/docs/stable//generated/torch.cuda.device_memory_used.html>`_.
+
+    - TPU (via ``torch_xla``)
+
+      - *Memory Metrics* (per device, e.g., ``xla:0``):
+
+        - ``memory.free.xla:0`` — Free HBM memory (MB)
+        - ``memory.used.xla:0`` — Used HBM memory (MB)
+        - ``memory.percent.xla:0`` — Percentage of HBM memory used (%)
+
+      - *XLA Operation Counters*:
+
+        - ``CachedCompile.xla``
+        - ``CreateXlaTensor.xla``
+        - ``DeviceDataCacheMiss.xla``
+        - ``UncachedCompile.xla``
+        - ``xla::add.xla``, ``xla::addmm.xla``, etc.
+
+      These counters can be retrieved using: ``torch_xla.debug.metrics.counter_names()``
+
     Args:
         cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU.
             If ``True``, it will log CPU stats regardless of the accelerator.
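The documented key scheme can be illustrated with a small stand-alone sketch (the helper below is hypothetical and not the callback's internal code; only the ``DeviceStatsMonitor.{hook_name}/{base_metric_name}`` format is taken from the docstring above):

```python
def prefixed_stats(device_stats: dict, hook_name: str) -> dict:
    # Apply the documented key format:
    #   DeviceStatsMonitor.{hook_name}/{base_metric_name}
    return {
        f"DeviceStatsMonitor.{hook_name}/{name}": value
        for name, value in device_stats.items()
    }


stats = {"cpu_percent": 57.0, "cpu_vm_percent": 62.5}
logged = prefixed_stats(stats, "on_train_batch_start")
# logged["DeviceStatsMonitor.on_train_batch_start/cpu_percent"] == 57.0
```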

src/lightning/pytorch/strategies/deepspeed.py

Lines changed: 2 additions & 3 deletions
@@ -125,14 +125,13 @@ def __init__(
         exclude_frozen_parameters: bool = False,
     ) -> None:
         """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
-        billion parameter models. `For more information: https://pytorch-
-        lightning.readthedocs.io/en/stable/advanced/model_parallel.html#deepspeed`.
+        billion parameter models. *For more information:* :ref:`deepspeed_advanced`.

         .. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature.

         Defaults have been set to enable ZeRO-Offload and some have been taken from the link below.
         These defaults have been set generally, but may require tuning for optimum performance based on your model size.
-        `For more information: https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training`.
+        *For more information:* https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training.

         Arguments:
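For orientation, a minimal ZeRO-Offload setup of the kind these defaults enable could be expressed in a standalone DeepSpeed config roughly as below (a sketch using keys from the public DeepSpeed config schema linked above; the strategy's actual generated config may differ):

```json
{
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu"
    }
  }
}
```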

0 commit comments