Commit d76b284
Merge branch 'master' into dependabot-pip-requirements-psutil-lt-7.1.3
2 parents: 2315a6c + 10675b4

File tree: 9 files changed, +74 −156 lines

.github/actions/pip-wheels/action.yml

Lines changed: 0 additions & 82 deletions
This file was deleted.

.github/workflows/ci-tests-fabric.yml

Lines changed: 0 additions & 9 deletions
@@ -134,15 +134,6 @@ jobs:
           --find-links="${TORCH_URL}"
           uv pip list
 
-      - name: Dump handy wheels
-        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
-        continue-on-error: true
-        uses: ./.github/actions/pip-wheels
-        with:
-          wheel-dir: ${{ env.PYPI_CACHE_DIR }}
-          torch-url: ${{ env.TORCH_URL }}
-          cache-key: "pypi_wheels"
-
       - name: Adjust tests
         if: ${{ matrix.config.pkg-name != 'lightning' }}
         run: |

.github/workflows/ci-tests-pytorch.yml

Lines changed: 0 additions & 9 deletions
@@ -147,15 +147,6 @@ jobs:
         if: ${{ matrix.config.pkg-name == 'lightning' }}
         run: uv pip uninstall pytorch-lightning
 
-      - name: Dump handy wheels
-        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
-        continue-on-error: true
-        uses: ./.github/actions/pip-wheels
-        with:
-          wheel-dir: ${{ env.PYPI_CACHE_DIR }}
-          torch-url: ${{ env.TORCH_URL }}
-          cache-key: "pypi_wheels"
-
       - name: Cache datasets
         uses: actions/cache@v4
         with:

.github/workflows/docs-build.yml

Lines changed: 1 addition & 19 deletions
@@ -47,7 +47,6 @@ defaults:
 env:
   FREEZE_REQUIREMENTS: "1"
   TORCH_URL: "https://download.pytorch.org/whl/cpu/"
-  PYPI_CACHE_DIR: "_pip-wheels"
   PYPI_LOCAL_DIR: "pypi_pkgs/"
 
 jobs:
@@ -95,12 +94,6 @@ jobs:
           aws s3 sync --no-sign-request s3://sphinx-packages/ ${PYPI_LOCAL_DIR}
           uv pip install lai-sphinx-theme -U -f ${PYPI_LOCAL_DIR}
 
-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-
       - name: Install pandoc & texlive
         if: ${{ matrix.pkg-name == 'pytorch' }}
         timeout-minutes: 5
@@ -111,10 +104,8 @@ jobs:
       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          mkdir -p ${PYPI_CACHE_DIR}  # in case cache was not hit
-          ls -lh ${PYPI_CACHE_DIR}
           uv pip install .[all] -U -r requirements/${{ matrix.pkg-name }}/docs.txt \
-            -f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} --extra-index-url="${TORCH_URL}"
+            -f ${PYPI_LOCAL_DIR} --extra-index-url="${TORCH_URL}"
          uv pip list
 
       - name: Install req. for Notebooks/tutorials
@@ -149,15 +140,6 @@ jobs:
         retention-days: ${{ env.ARTIFACT_DAYS }}
         include-hidden-files: true
 
-      #- name: Dump handy wheels
-      #  if: github.event_name == 'push' && github.ref == 'refs/heads/master'
-      #  continue-on-error: true
-      #  uses: ./.github/actions/pip-wheels
-      #  with:
-      #    wheel-dir: ${{ env.PYPI_CACHE_DIR }}
-      #    torch-url: ${{ env.TORCH_URL }}
-      #    cache-key: "pypi_wheels"
-
   deploy-docs:
     needs: docs-make
     if: github.repository_owner == 'Lightning-AI' && github.event_name != 'pull_request'

docs/source-pytorch/community/ecosystem-ci.rst

Lines changed: 0 additions & 29 deletions
This file was deleted.

docs/source-pytorch/community/index.rst

Lines changed: 0 additions & 8 deletions
@@ -7,7 +7,6 @@
    ../generated/CONTRIBUTING.md
    ../generated/BECOMING_A_CORE_CONTRIBUTOR.md
    governance
-   ecosystem-ci
    ../versioning
    ../past_versions
    ../generated/CHANGELOG.md
@@ -70,13 +69,6 @@ Community
    :button_link: ../generated/CHANGELOG.html
    :height: 100
 
-.. displayitem::
-   :header: Ecosystem CI
-   :description: Automate issue discovery for your projects against Lightning nightly and releases
-   :col_css: col-md-12
-   :button_link: ecosystem-ci.html
-   :height: 100
-
 .. raw:: html
 
    </div>

src/lightning/pytorch/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -61,6 +61,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed how `ThroughputMonitor` calculated training time ([#21291](https://github.com/Lightning-AI/pytorch-lightning/pull/21291))
 
 
+- Fixed synchronization of gradients in manual optimization with `DDPStrategy(static_graph=True)` ([#21251](https://github.com/Lightning-AI/pytorch-lightning/pull/21251))
+
+
+
 ---
 
 ## [2.5.5] - 2025-09-05

src/lightning/pytorch/strategies/ddp.py

Lines changed: 22 additions & 0 deletions
@@ -103,6 +103,7 @@ def __init__(
         self._process_group_backend: Optional[str] = process_group_backend
         self._timeout: Optional[timedelta] = timeout
         self._start_method = start_method
+        self._pl_static_graph_delay_done = False
 
     @property
     def is_distributed(self) -> bool:  # pragma: no-cover
@@ -319,6 +320,27 @@ def pre_backward(self, closure_loss: Tensor) -> None:
         if not self.lightning_module.automatic_optimization:
             prepare_for_backward(self.model, closure_loss)
 
+    @override
+    def post_backward(self, closure_loss: Tensor) -> None:
+        # Only for first static-graph iteration with manual optimization
+        model = self.model
+        lm = self.lightning_module
+        if not isinstance(model, DistributedDataParallel):
+            return
+        if lm is None or lm.automatic_optimization:
+            return
+        if not getattr(model, "static_graph", False):
+            return
+        if self._pl_static_graph_delay_done:
+            return
+
+        # Call DDP's own first-iter static-graph flush.
+        # This is what actually launches the bucket all-reduces.
+        reducer = model.reducer
+        reducer._delay_all_reduce()
+
+        self._pl_static_graph_delay_done = True
+
     @override
     def model_to_device(self) -> None:
         log.debug(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
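
Context for the `ddp.py` change above: with `static_graph=True`, DDP defers the first iteration's bucket all-reduces and flushes them through its reducer; in Lightning's manual-optimization path that flush was never triggered, leaving gradients unsynchronized across ranks (#21251). Below is a minimal user-side sketch of the pattern the new `post_backward` hook fixes. The model, data, and hyperparameters are illustrative, assuming a machine with two CUDA GPUs.

import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning.pytorch as pl
from lightning.pytorch.strategies import DDPStrategy


class ManualOptModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # manual optimization: user drives backward/step
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        opt.zero_grad()
        loss = self.layer(batch[0]).sum()
        # With static_graph=True, the first manual_backward previously left the
        # delayed bucket all-reduce pending; the post_backward hook now flushes it.
        self.manual_backward(loss)
        opt.step()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":
    data = DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=16)
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DDPStrategy(static_graph=True),  # the scenario fixed by #21251
        max_steps=2,
        enable_progress_bar=False,
    )
    trainer.fit(ManualOptModel(), data)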

tests/tests_pytorch/strategies/test_ddp_integration.py

Lines changed: 47 additions & 0 deletions
@@ -448,3 +448,50 @@ def creates_processes_externally(self):
         RuntimeError, match="Lightning attempted to launch new distributed processes with `local_rank > 0`."
     ):
         trainer.fit(model)
+
+
+@RunIf(min_cuda_gpus=2, standalone=True)
+@pytest.mark.parametrize("automatic_optimization", [True, False])
+@pytest.mark.parametrize("static_graph", [True, False])
+def test_ddp_gradients_synced(tmp_path, automatic_optimization, static_graph):
+    """Ensure gradients are synchronized across ranks for both optimization modes and static_graph settings."""
+
+    class TestModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.automatic_optimization = automatic_optimization
+
+        def training_step(self, batch, batch_idx):
+            if self.automatic_optimization:
+                return super().training_step(batch, batch_idx)
+
+            # manual optimization path
+            opt = self.optimizers()
+            opt.zero_grad()
+            out = super().training_step(batch, batch_idx)
+            loss = out["loss"]
+            self.manual_backward(loss)
+            opt.step()
+            return out
+
+        def on_train_batch_end(self, *args, **kwargs):
+            # record grad sum for sync check
+            grad_sum = self.layer.bias.grad.detach().sum()
+            self.log("grad_sum_min", grad_sum, sync_dist=True, reduce_fx="min")
+            self.log("grad_sum_max", grad_sum, sync_dist=True, reduce_fx="max")
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        accelerator="gpu",
+        devices=2,
+        strategy=DDPStrategy(static_graph=static_graph),
+        max_steps=1,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+    )
+    trainer.fit(TestModel(), datamodule=BoringDataModule())
+
+    # assert all ranks saw identical grads
+    gmin = trainer.callback_metrics["grad_sum_min"]
+    gmax = trainer.callback_metrics["grad_sum_max"]
+    assert torch.allclose(gmin, gmax)
