Commit d76b284
Merge branch 'master' into dependabot-pip-requirements-psutil-lt-7.1.3
2 parents: 2315a6c + 10675b4

File tree: 9 files changed, +74 −156 lines

.github/actions/pip-wheels/action.yml

Lines changed: 0 additions & 82 deletions
This file was deleted.

.github/workflows/ci-tests-fabric.yml

Lines changed: 0 additions & 9 deletions
@@ -134,15 +134,6 @@ jobs:
           --find-links="${TORCH_URL}"
           uv pip list
 
-      - name: Dump handy wheels
-        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
-        continue-on-error: true
-        uses: ./.github/actions/pip-wheels
-        with:
-          wheel-dir: ${{ env.PYPI_CACHE_DIR }}
-          torch-url: ${{ env.TORCH_URL }}
-          cache-key: "pypi_wheels"
-
       - name: Adjust tests
         if: ${{ matrix.config.pkg-name != 'lightning' }}
         run: |

.github/workflows/ci-tests-pytorch.yml

Lines changed: 0 additions & 9 deletions
@@ -147,15 +147,6 @@ jobs:
         if: ${{ matrix.config.pkg-name == 'lightning' }}
         run: uv pip uninstall pytorch-lightning
 
-      - name: Dump handy wheels
-        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
-        continue-on-error: true
-        uses: ./.github/actions/pip-wheels
-        with:
-          wheel-dir: ${{ env.PYPI_CACHE_DIR }}
-          torch-url: ${{ env.TORCH_URL }}
-          cache-key: "pypi_wheels"
-
       - name: Cache datasets
         uses: actions/cache@v4
         with:

.github/workflows/docs-build.yml

Lines changed: 1 addition & 19 deletions
@@ -47,7 +47,6 @@ defaults:
 env:
   FREEZE_REQUIREMENTS: "1"
   TORCH_URL: "https://download.pytorch.org/whl/cpu/"
-  PYPI_CACHE_DIR: "_pip-wheels"
   PYPI_LOCAL_DIR: "pypi_pkgs/"
 
 jobs:
@@ -95,12 +94,6 @@ jobs:
           aws s3 sync --no-sign-request s3://sphinx-packages/ ${PYPI_LOCAL_DIR}
           uv pip install lai-sphinx-theme -U -f ${PYPI_LOCAL_DIR}
 
-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-
       - name: Install pandoc & texlive
         if: ${{ matrix.pkg-name == 'pytorch' }}
         timeout-minutes: 5
@@ -111,10 +104,8 @@ jobs:
       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          mkdir -p ${PYPI_CACHE_DIR}  # in case cache was not hit
-          ls -lh ${PYPI_CACHE_DIR}
           uv pip install .[all] -U -r requirements/${{ matrix.pkg-name }}/docs.txt \
-            -f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} --extra-index-url="${TORCH_URL}"
+            -f ${PYPI_LOCAL_DIR} --extra-index-url="${TORCH_URL}"
          uv pip list
 
       - name: Install req. for Notebooks/tutorials
@@ -149,15 +140,6 @@ jobs:
         retention-days: ${{ env.ARTIFACT_DAYS }}
         include-hidden-files: true
 
-      #- name: Dump handy wheels
-      #  if: github.event_name == 'push' && github.ref == 'refs/heads/master'
-      #  continue-on-error: true
-      #  uses: ./.github/actions/pip-wheels
-      #  with:
-      #    wheel-dir: ${{ env.PYPI_CACHE_DIR }}
-      #    torch-url: ${{ env.TORCH_URL }}
-      #    cache-key: "pypi_wheels"
-
   deploy-docs:
     needs: docs-make
     if: github.repository_owner == 'Lightning-AI' && github.event_name != 'pull_request'

docs/source-pytorch/community/ecosystem-ci.rst

Lines changed: 0 additions & 29 deletions
This file was deleted.

docs/source-pytorch/community/index.rst

Lines changed: 0 additions & 8 deletions
@@ -7,7 +7,6 @@
    ../generated/CONTRIBUTING.md
    ../generated/BECOMING_A_CORE_CONTRIBUTOR.md
    governance
-   ecosystem-ci
    ../versioning
    ../past_versions
    ../generated/CHANGELOG.md
@@ -70,13 +69,6 @@ Community
    :button_link: ../generated/CHANGELOG.html
    :height: 100
 
-.. displayitem::
-   :header: Ecosystem CI
-   :description: Automate issue discovery for your projects against Lightning nightly and releases
-   :col_css: col-md-12
-   :button_link: ecosystem-ci.html
-   :height: 100
-
 .. raw:: html
 
    </div>

src/lightning/pytorch/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -61,6 +61,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed how `ThroughputMonitor` calculated training time ([#21291](https://github.com/Lightning-AI/pytorch-lightning/pull/21291))
 
 
+- Fixed synchronization of gradients in manual optimization with `DDPStrategy(static_graph=True)` ([#21251](https://github.com/Lightning-AI/pytorch-lightning/pull/21251))
+
+
+
 ---
 
 ## [2.5.5] - 2025-09-05

src/lightning/pytorch/strategies/ddp.py

Lines changed: 22 additions & 0 deletions
@@ -103,6 +103,7 @@ def __init__(
         self._process_group_backend: Optional[str] = process_group_backend
         self._timeout: Optional[timedelta] = timeout
         self._start_method = start_method
+        self._pl_static_graph_delay_done = False
 
     @property
     def is_distributed(self) -> bool:  # pragma: no-cover
@@ -319,6 +320,27 @@ def pre_backward(self, closure_loss: Tensor) -> None:
         if not self.lightning_module.automatic_optimization:
             prepare_for_backward(self.model, closure_loss)
 
+    @override
+    def post_backward(self, closure_loss: Tensor) -> None:
+        # Only for first static-graph iteration with manual optimization
+        model = self.model
+        lm = self.lightning_module
+        if not isinstance(model, DistributedDataParallel):
+            return
+        if lm is None or lm.automatic_optimization:
+            return
+        if not getattr(model, "static_graph", False):
+            return
+        if self._pl_static_graph_delay_done:
+            return
+
+        # Call DDP's own first-iter static-graph flush.
+        # This is what actually launches the bucket all-reduces.
+        reducer = model.reducer
+        reducer._delay_all_reduce()
+
+        self._pl_static_graph_delay_done = True
+
     @override
     def model_to_device(self) -> None:
         log.debug(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
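
Context for the `ddp.py` change above: with `static_graph=True`, DDP defers the first iteration's bucket all-reduces and flushes them through its reducer; in Lightning's manual-optimization path that flush was never triggered, leaving gradients unsynchronized across ranks (#21251). Below is a minimal user-side sketch of the pattern the new `post_backward` hook fixes. The model, data, and hyperparameters are illustrative, assuming a machine with two CUDA GPUs.

import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning.pytorch as pl
from lightning.pytorch.strategies import DDPStrategy


class ManualOptModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # manual optimization: user drives backward/step
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        opt.zero_grad()
        loss = self.layer(batch[0]).sum()
        # With static_graph=True, the first manual_backward previously left the
        # delayed bucket all-reduce pending; the post_backward hook now flushes it.
        self.manual_backward(loss)
        opt.step()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":
    data = DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=16)
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DDPStrategy(static_graph=True),  # the scenario fixed by #21251
        max_steps=2,
        enable_progress_bar=False,
    )
    trainer.fit(ManualOptModel(), data)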

tests/tests_pytorch/strategies/test_ddp_integration.py

Lines changed: 47 additions & 0 deletions
@@ -448,3 +448,50 @@ def creates_processes_externally(self):
         RuntimeError, match="Lightning attempted to launch new distributed processes with `local_rank > 0`."
     ):
         trainer.fit(model)
+
+
+@RunIf(min_cuda_gpus=2, standalone=True)
+@pytest.mark.parametrize("automatic_optimization", [True, False])
+@pytest.mark.parametrize("static_graph", [True, False])
+def test_ddp_gradients_synced(tmp_path, automatic_optimization, static_graph):
+    """Ensure gradients are synchronized across ranks for both optimization modes and static_graph settings."""
+
+    class TestModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.automatic_optimization = automatic_optimization
+
+        def training_step(self, batch, batch_idx):
+            if self.automatic_optimization:
+                return super().training_step(batch, batch_idx)
+
+            # manual optimization path
+            opt = self.optimizers()
+            opt.zero_grad()
+            out = super().training_step(batch, batch_idx)
+            loss = out["loss"]
+            self.manual_backward(loss)
+            opt.step()
+            return out
+
+        def on_train_batch_end(self, *args, **kwargs):
+            # record grad sum for sync check
+            grad_sum = self.layer.bias.grad.detach().sum()
+            self.log("grad_sum_min", grad_sum, sync_dist=True, reduce_fx="min")
+            self.log("grad_sum_max", grad_sum, sync_dist=True, reduce_fx="max")
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        accelerator="gpu",
+        devices=2,
+        strategy=DDPStrategy(static_graph=static_graph),
+        max_steps=1,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+    )
+    trainer.fit(TestModel(), datamodule=BoringDataModule())
+
+    # assert all ranks saw identical grads
+    gmin = trainer.callback_metrics["grad_sum_min"]
+    gmax = trainer.callback_metrics["grad_sum_max"]
+    assert torch.allclose(gmin, gmax)
