From d83abd4ad8f8db506cb764fe8bec23107f5afe00 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 3 Sep 2025 10:56:15 +0200 Subject: [PATCH 01/40] chlog: new section --- src/lightning/fabric/CHANGELOG.md | 12 ++++++++++++ src/lightning/pytorch/CHANGELOG.md | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index b1102cdce06b7..8fe4e4e2f6d17 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -6,6 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). --- +## [unReleased] - 2025-09-DD + +### Changed + +- + + +### Fixed + +- + + ## [2.5.4] - 2025-08-29 ### Changed diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 01b64c38051b3..5ad62005c6d40 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -6,6 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). --- +## [unReleased] - 2025-09-DD + +### Changed + +- + + +### Fixed + +- + + ## [2.5.4] - 2025-08-29 ### Fixed From 58ff367657bed1f07489c29e5b3b095d3facab82 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:11:00 +0200 Subject: [PATCH 02/40] docs: simplify version table (#21123) * docs: simplify version table * Apply suggestions from code review * update (cherry picked from commit 691f084fd2dd88bdc55cafab694748d6ae1b4c22) --- docs/source-pytorch/versioning.rst | 71 ++++++++++++++++-------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/docs/source-pytorch/versioning.rst b/docs/source-pytorch/versioning.rst index 4a04bd1534de9..296f8052e863d 100644 --- a/docs/source-pytorch/versioning.rst +++ b/docs/source-pytorch/versioning.rst @@ -67,6 +67,13 @@ PyTorch Lightning follows `NEP 29 Date: Fri, 29 Aug 2025 14:23:00 +0200 Subject: [PATCH 03/40] Adding test for legacy checkpoint created with 2.5.4 (#21128) [create-pull-request] automated change Co-authored-by: Borda Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 0ebf1dd0172de821a17b3cb964fc3d5a9e8e223a) --- tests/legacy/back-compatible-versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index 9032e5c13cc54..885dd6f85b36a 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -107,3 +107,4 @@ 2.5.1 2.5.2 2.5.3 +2.5.4 From 1df18fff44504060c7ce98e7a90b8f2644142694 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Sat, 30 Aug 2025 00:20:25 +0200 Subject: [PATCH 04/40] resolve failing tests with pt-2.1 (#21130) * pip uninstall -y deepspeed * TORCH_VER * Apply suggestions from code review * A100 & L4 * machine (cherry picked from commit d85c474eb4a3f5e0b270ef058a32a3474595a84d) --- .lightning/workflows/fabric.yml | 19 +++++++++++++------ .lightning/workflows/pytorch.yml | 21 ++++++++++++++------- dockers/base-cuda/Dockerfile | 2 +- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml index 438f56ef7fe94..e6d97ca1b1887 100644 --- a/.lightning/workflows/fabric.yml +++ b/.lightning/workflows/fabric.yml @@ -4,20 +4,22 @@ trigger: pull_request: branches: ["master"] -timeout: "75" # minutes -machine: "L4_X_2" +timeout: "55" # minutes parametrize: matrix: {} include: - # note that this is setting also all oldest requirements which is linked to Torch == 2.0 + # note that this is setting also all oldest requirements which is linked to Torch == 2.1 - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" PACKAGE_NAME: "fabric" - - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" + machine: "A100_X_2" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "fabric" + machine: "L4_X_2" # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" # PACKAGE_NAME: "fabric" - - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "lightning" + machine: "L4_X_2" exclude: [] env: @@ -30,6 +32,7 @@ run: | python --version pip --version pip install -q fire wget packaging + pip list set -ex CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda" @@ -40,12 +43,15 @@ run: | echo "Torch URL: ${TORCH_URL}" COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') echo "collecting coverage for: ${COVERAGE_SOURCE}" + TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])") if [ "${TORCH_VER}" == "2.1" ]; then echo "Set oldest versions" - cd requirements/fabric + pip uninstall -y deepspeed pip install -U "lightning-utilities[cli]" + cd requirements/fabric python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']" + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt cd ../.. pip install "cython<3.0" wheel # for compatibility fi @@ -92,6 +98,7 @@ run: | export PL_RUN_STANDALONE_TESTS=1 wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh bash ./run_standalone_tests.sh "tests_fabric" + export PL_RUN_STANDALONE_TESTS=0 # echo "Reporting coverage" # todo # python -m coverage report diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index 5c92bf881d969..21551565b1fed 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -4,20 +4,22 @@ trigger: pull_request: branches: ["master"] -timeout: "75" # minutes -machine: "L4_X_2" +timeout: "55" # minutes parametrize: matrix: {} include: - # note that this is setting also all oldest requirements which is linked to Torch == 2.0 + # note that this is setting also all oldest requirements which is linked to Torch == 2.1 - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" PACKAGE_NAME: "pytorch" - - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" + machine: "A100_X_2" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "pytorch" + machine: "L4_X_2" # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" # PACKAGE_NAME: "pytorch" - - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "lightning" + machine: "L4_X_2" exclude: [] env: @@ -30,6 +32,7 @@ run: | python --version pip --version pip install -q fire wget packaging + pip list set -ex CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda" @@ -40,12 +43,15 @@ run: | echo "Torch URL: ${TORCH_URL}" COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))') echo "collecting coverage for: ${COVERAGE_SOURCE}" + TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])") if [ "${TORCH_VER}" == "2.1" ]; then - recho "Set oldest versions" - cd requirements/pytorch + echo "Set oldest versions" + pip uninstall -y deepspeed pip install -U "lightning-utilities[cli]" + cd requirements/pytorch python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']" + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt cd ../.. pip install "cython<3.0" wheel # for compatibility fi @@ -108,6 +114,7 @@ run: | export PL_RUN_STANDALONE_TESTS=1 wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh bash ./run_standalone_tests.sh "tests_pytorch" + export PL_RUN_STANDALONE_TESTS=0 echo "Testing: PyTorch standalone tasks" cd tests_pytorch/ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 2b6f48771c7f7..41faf0ca55113 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG UBUNTU_VERSION=22.04 -ARG CUDA_VERSION=11.7.1 +ARG CUDA_VERSION=12.1.1 FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} From 94f8084d809d22f7929283b3eed0e98c223f7cfb Mon Sep 17 00:00:00 2001 From: littlebullGit Date: Fri, 29 Aug 2025 18:24:07 -0400 Subject: [PATCH 05/40] fix(callbacks): Defer step/time-triggered ModelCheckpoint saves until validation metrics are available (#21106) * fix(callbacks): defer step/time-triggered ModelCheckpoint saves until validation metrics are available Root cause: - With `every_n_train_steps` (or `train_time_interval`), checkpoints could save at train batch end before validation ran, so the monitored val metric was missing/stale and `best_model_score` was incorrect. (Refs #20919) Fix: - In [src/lightning/pytorch/callbacks/model_checkpoint.py:ModelCheckpoint.on_train_batch_end]: - Defer saves when the monitored key is missing from [trainer.callback_metrics] - If on the last train batch and not saving at train-epoch-end, defer only when validation will run next: - `trainer.enable_validation` is True - `trainer.num_val_batches` > 0 - `trainer.check_val_every_n_epoch` schedule matches the upcoming epoch - Perform deferred saves in [on_validation_end], ensuring fresh validation metrics are used. - Allow zero `timedelta` for `train_time_interval` and broadcast the time-trigger decision across ranks. - Do not defer when monitoring a train metric or when no validation is scheduled. Tests: - Repro (previously failing, now passing): - [tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py] - Additional validations: - [tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py] - [tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py] Outcome: - `best_model_score` matches the validation metric after the epoch. - Step/time-interval checkpointing behaves correctly without premature or skipped saves. * test: disable logger in model checkpoint tests to avoid side effects * chlog --------- Co-authored-by: Jirka B (cherry picked from commit b1cc925d941c0a829a804c8286fa06d7e4dcc2ce) --- src/lightning/pytorch/CHANGELOG.md | 3 +- .../pytorch/callbacks/model_checkpoint.py | 56 ++++- .../test_model_checkpoint_additional_cases.py | 208 ++++++++++++++++++ .../test_model_checkpoint_edge_cases.py | 174 +++++++++++++++ ...del_checkpoint_step_interval_val_metric.py | 106 +++++++++ 5 files changed, 543 insertions(+), 4 deletions(-) create mode 100644 tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py create mode 100644 tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py create mode 100644 tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 5ad62005c6d40..cdaea3e72ecbb 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -15,7 +15,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106)) + ## [2.5.4] - 2025-08-29 diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index 68fed2ff82d31..452e8bdecbba3 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -260,6 +260,9 @@ def __init__( self.best_model_path = "" self.last_model_path = "" self._last_checkpoint_saved = "" + # When using step/time-based checkpointing with a validation-only monitored metric, + # defer the save until validation has produced the metric + self._defer_save_until_validation: bool = False self.kth_value: Tensor self.dirpath: Optional[_PATH] @@ -306,14 +309,17 @@ def on_train_batch_end( batch_idx: int, ) -> None: """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`""" - if self._should_skip_saving_checkpoint(trainer): - return + # Do not return early here because we may need to set deferral flags even + # if a save already happened at this global step. We'll enforce the skip + # just before actually saving below. + skip_due_to_state = self._should_skip_saving_checkpoint(trainer) skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0) train_time_interval = self._train_time_interval skip_time = True now = time.monotonic() - if train_time_interval: + # Important: allow zero timedelta as a valid interval + if train_time_interval is not None: prev_time_check = self._last_time_checked skip_time = prev_time_check is None or (now - prev_time_check) < train_time_interval.total_seconds() # in case we have time differences across ranks @@ -326,6 +332,42 @@ def on_train_batch_end( self._last_time_checked = now monitor_candidates = self._monitor_candidates(trainer) + # If monitoring a metric that is not yet available (e.g., validation-only), + # defer saving until validation end so the metric is present. + if self.monitor is not None and self.monitor not in monitor_candidates: + # Defer both top-k and last to avoid blocking with `_last_global_step_saved` + self._defer_save_until_validation = True + return + + # Even if the monitored key exists, it could be stale from a previous validation. + # If validation is scheduled to run right after this batch (e.g., last batch of epoch) + # and we are not saving at train epoch end, defer to `on_validation_end` to use fresh metrics. + if ( + self.monitor is not None + and not self._should_save_on_train_epoch_end(trainer) + and getattr(trainer.fit_loop.epoch_loop.batch_progress, "is_last_batch", False) + ): + # Only defer if a validation loop is expected to run after this batch. + will_run_val = False + if getattr(trainer, "enable_validation", False): + num_val_batches = ( + sum(trainer.num_val_batches) + if isinstance(trainer.num_val_batches, list) + else trainer.num_val_batches + ) + if num_val_batches and num_val_batches > 0: + cve = trainer.check_val_every_n_epoch + if cve is None or ((trainer.current_epoch + 1) % cve == 0): + will_run_val = True + + if will_run_val: + self._defer_save_until_validation = True + return + + # Only proceed to save if not skipping due to trainer/callback state + if skip_due_to_state: + return + self._save_topk_checkpoint(trainer, monitor_candidates) self._save_last_checkpoint(trainer, monitor_candidates) @@ -343,6 +385,14 @@ def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModul """Save a checkpoint at the end of the validation stage.""" if not self._should_skip_saving_checkpoint(trainer) and not self._should_save_on_train_epoch_end(trainer): monitor_candidates = self._monitor_candidates(trainer) + # If a step/time-triggered save was deferred due to a missing monitored metric, + # perform the save now that validation metrics are available. + if self._defer_save_until_validation: + self._save_topk_checkpoint(trainer, monitor_candidates) + self._save_last_checkpoint(trainer, monitor_candidates) + self._defer_save_until_validation = False + return + if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0: self._save_topk_checkpoint(trainer, monitor_candidates) self._save_last_checkpoint(trainer, monitor_candidates) diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py new file mode 100644 index 0000000000000..fba9e865debfd --- /dev/null +++ b/tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py @@ -0,0 +1,208 @@ +import math +from datetime import timedelta + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, Dataset + +from lightning.pytorch import LightningModule, Trainer, seed_everything +from lightning.pytorch.callbacks import ModelCheckpoint + + +class TinyDataset(Dataset): + def __init__(self, n: int = 4): + self.x = torch.arange(n, dtype=torch.float32).view(-1, 1) + self.y = self.x.clone() + + def __len__(self): + return len(self.x) + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + +class TrainMetricModule(LightningModule): + def __init__(self): + super().__init__() + self.layer = nn.Linear(1, 1) + self._counter = 0.0 + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.layer(x) + loss = F.mse_loss(y_hat, y) + # strictly increasing train metric per step + self._counter += 1.0 + self.log("train_score", torch.tensor(self._counter), on_step=True, on_epoch=False, prog_bar=False, logger=True) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + pass + + def configure_optimizers(self): + return torch.optim.SGD(self.parameters(), lr=0.01) + + +def _make_loaders(n=4): + ds = TinyDataset(n=n) + train_loader = DataLoader(ds, batch_size=2, shuffle=False) + val_loader = DataLoader(ds, batch_size=2, shuffle=False) + return train_loader, val_loader + + +def test_model_checkpoint_every_n_train_steps_with_train_metric_saves_at_step(tmp_path): + """When monitoring a train-step metric, step-interval checkpointing should save at the step boundary (no deferral) + and best_model_score should match the last train metric value.""" + seed_everything(123) + + train_loader, val_loader = _make_loaders(n=4) + model = TrainMetricModule() + + ckpt = ModelCheckpoint( + dirpath=tmp_path, + monitor="train_score", + mode="max", + save_top_k=1, + every_n_train_steps=1, + train_time_interval=None, + every_n_epochs=0, + save_on_train_epoch_end=False, + save_weights_only=True, + ) + + # 2 batches/epoch, run 2 epochs to have multiple step saves + trainer = Trainer( + max_epochs=2, + accelerator="cpu", + devices=1, + callbacks=[ckpt], + num_sanity_val_steps=0, + log_every_n_steps=1, + limit_train_batches=2, + limit_val_batches=0, # no validation needed for this test + enable_checkpointing=True, + enable_model_summary=False, + logger=False, + ) + + trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader) + + assert ckpt.best_model_score is not None + # 2 epochs * 2 steps/epoch = 4 steps total; metric increments by 1 each step + expected = 4.0 + actual = float(ckpt.best_model_score) + assert math.isclose(actual, expected, rel_tol=0, abs_tol=1e-6) + + +@pytest.mark.parametrize("val_scores", [[0.2, 0.4, 0.9]]) +def test_model_checkpoint_time_interval_with_val_metric_defers_until_validation(tmp_path, val_scores): + """With time-interval-based checkpointing, and a validation-only metric, ensure we don't save using stale metrics + at step boundaries; saving should occur at validation end.""" + seed_everything(123) + + train_loader, val_loader = _make_loaders(n=4) + + model = ValMetricModule(val_scores=val_scores) + + ckpt = ModelCheckpoint( + dirpath=tmp_path, + monitor="auroc", + mode="max", + save_top_k=1, + every_n_train_steps=0, # disable step-based + train_time_interval=timedelta(seconds=0), # trigger as often as possible + every_n_epochs=0, + save_on_train_epoch_end=False, + save_weights_only=True, + ) + + trainer = Trainer( + max_epochs=len(val_scores), + accelerator="cpu", + devices=1, + callbacks=[ckpt], + num_sanity_val_steps=0, + log_every_n_steps=1, + limit_train_batches=2, + limit_val_batches=1, + enable_checkpointing=True, + enable_model_summary=False, + logger=False, + ) + + trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader) + + assert ckpt.best_model_score is not None + expected = max(val_scores) + actual = float(ckpt.best_model_score) + assert math.isclose(actual, expected, rel_tol=0, abs_tol=1e-6) + + +class ValMetricModule(LightningModule): + def __init__(self, val_scores: list[float]): + super().__init__() + self.layer = nn.Linear(1, 1) + self._val_scores = [float(s) for s in val_scores] + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.layer(x) + loss = F.mse_loss(y_hat, y) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + pass + + def on_validation_epoch_end(self): + score = self._val_scores[self.current_epoch] + self.log("auroc", torch.tensor(score, dtype=torch.float32), prog_bar=False, logger=True) + + def configure_optimizers(self): + return torch.optim.SGD(self.parameters(), lr=0.01) + + +@pytest.mark.parametrize("val_scores", [[0.1, 0.5, 1.0, 3.0]]) +def test_model_checkpoint_defer_until_next_validation_when_val_every_2_epochs(tmp_path, val_scores): + """With validation running every 2 epochs, step-triggered saves at the end of non-validation epochs should be + deferred and then performed at the next validation end when the metric is available.""" + seed_everything(123) + + train_loader, val_loader = _make_loaders(n=4) + + model = ValMetricModule(val_scores=val_scores) + + ckpt = ModelCheckpoint( + dirpath=tmp_path, + monitor="auroc", + mode="max", + save_top_k=1, + every_n_train_steps=2, # end of each epoch + train_time_interval=None, + every_n_epochs=0, + save_on_train_epoch_end=False, + save_weights_only=True, + ) + + trainer = Trainer( + max_epochs=len(val_scores), + accelerator="cpu", + devices=1, + callbacks=[ckpt], + num_sanity_val_steps=0, + log_every_n_steps=1, + limit_train_batches=2, + limit_val_batches=1, + enable_checkpointing=True, + enable_model_summary=False, + logger=False, + check_val_every_n_epoch=2, # only validate every 2 epochs + ) + + trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader) + + assert ckpt.best_model_score is not None + expected = max(val_scores) # last/maximum value occurs at final validation epoch + actual = float(ckpt.best_model_score) + assert math.isclose(actual, expected, rel_tol=0, abs_tol=1e-6) diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py new file mode 100644 index 0000000000000..a265f8bc5f194 --- /dev/null +++ b/tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py @@ -0,0 +1,174 @@ +import math +from datetime import timedelta + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, Dataset + +from lightning.pytorch import LightningModule, Trainer, seed_everything +from lightning.pytorch.callbacks import ModelCheckpoint + + +class TinyDataset(Dataset): + def __init__(self, n: int = 8): + self.x = torch.arange(n, dtype=torch.float32).view(-1, 1) + self.y = self.x.clone() + + def __len__(self): + return len(self.x) + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + +def _make_loaders(n=8, batch_size=2): + ds = TinyDataset(n=n) + train_loader = DataLoader(ds, batch_size=batch_size, shuffle=False) + val_loader = DataLoader(ds, batch_size=batch_size, shuffle=False) + return train_loader, val_loader + + +class MultiValPerEpochModule(LightningModule): + """Logs a validation metric on every validation run, even if validation is run multiple times per epoch.""" + + def __init__(self, val_scores: list[float]): + super().__init__() + self.layer = nn.Linear(1, 1) + self._val_scores = [float(s) for s in val_scores] + self._val_call_idx = 0 + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.layer(x) + loss = F.mse_loss(y_hat, y) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + pass + + def on_validation_epoch_end(self): + score = self._val_scores[self._val_call_idx] + self._val_call_idx += 1 + self.log("auroc", torch.tensor(score, dtype=torch.float32), prog_bar=False, logger=True) + + def configure_optimizers(self): + return torch.optim.SGD(self.parameters(), lr=0.01) + + +class ValOnceEveryTwoEpochsModule(LightningModule): + """Logs a validation metric only when validation runs (e.g., every 2 epochs), indexed by current_epoch.""" + + def __init__(self, val_scores: list[float]): + super().__init__() + self.layer = nn.Linear(1, 1) + self._val_scores = [float(s) for s in val_scores] + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.layer(x) + loss = F.mse_loss(y_hat, y) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + pass + + def on_validation_epoch_end(self): + # current_epoch indexes into provided scores; only called when validation runs + score = self._val_scores[self.current_epoch] + self.log("auroc", torch.tensor(score, dtype=torch.float32), prog_bar=False, logger=True) + + def configure_optimizers(self): + return torch.optim.SGD(self.parameters(), lr=0.01) + + +@pytest.mark.parametrize("val_scores", [[0.1, 0.9]]) +def test_checkpoint_defers_with_mid_epoch_validation(tmp_path, val_scores): + """With val_check_interval=0.5 (validation mid-epoch and at epoch end), and step-based checkpointing, saves must be + deferred until each validation end so monitored validation metrics are fresh.""" + seed_everything(123) + + # 4 train batches per epoch (batch_size=2 over n=8), so two validations: after 2 batches and after 4 batches + train_loader, val_loader = _make_loaders(n=8, batch_size=2) + + model = MultiValPerEpochModule(val_scores=val_scores) + + ckpt = ModelCheckpoint( + dirpath=tmp_path, + monitor="auroc", + mode="max", + save_top_k=1, + every_n_train_steps=1, # would trigger every step, but must defer to validation + train_time_interval=None, + every_n_epochs=0, + save_on_train_epoch_end=False, + save_weights_only=True, + ) + + trainer = Trainer( + max_epochs=1, + accelerator="cpu", + devices=1, + callbacks=[ckpt], + num_sanity_val_steps=0, + log_every_n_steps=1, + limit_train_batches=4, # ensure exactly 4 steps => two validations at 0.5 and 1.0 + limit_val_batches=1, + enable_checkpointing=True, + enable_model_summary=False, + logger=False, + val_check_interval=0.5, + ) + + trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader) + + assert ckpt.best_model_score is not None + expected = max(val_scores) + actual = float(ckpt.best_model_score) + assert math.isclose(actual, expected, rel_tol=0, abs_tol=1e-6) + + +@pytest.mark.parametrize("val_scores", [[0.2, 0.6]]) +def test_time_interval_defers_across_epoch_until_first_validation(tmp_path, val_scores): + """With time-interval saving and validation only every 2 epochs, ensure no save uses stale/missing validation + metrics; the first save should happen at the first validation end (epoch 2).""" + seed_everything(123) + + train_loader, val_loader = _make_loaders(n=4, batch_size=2) + + model = ValOnceEveryTwoEpochsModule(val_scores=val_scores) + + ckpt = ModelCheckpoint( + dirpath=tmp_path, + monitor="auroc", + mode="max", + save_top_k=1, + every_n_train_steps=0, # disable step-based + train_time_interval=timedelta(seconds=0), # trigger frequently + every_n_epochs=0, + save_on_train_epoch_end=False, + save_weights_only=True, + ) + + trainer = Trainer( + max_epochs=2, + accelerator="cpu", + devices=1, + callbacks=[ckpt], + num_sanity_val_steps=0, + log_every_n_steps=1, + limit_train_batches=2, + limit_val_batches=1, + enable_checkpointing=True, + enable_model_summary=False, + logger=False, + check_val_every_n_epoch=2, # first validation only after 2nd epoch + ) + + trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader) + + assert ckpt.best_model_score is not None + expected = val_scores[1] # validation runs only once at epoch 2, logging index 1 + actual = float(ckpt.best_model_score) + assert math.isclose(actual, expected, rel_tol=0, abs_tol=1e-6) diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py new file mode 100644 index 0000000000000..c3fa0bfcd2e38 --- /dev/null +++ b/tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py @@ -0,0 +1,106 @@ +import math + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, Dataset + +from lightning.pytorch import LightningModule, Trainer, seed_everything +from lightning.pytorch.callbacks import ModelCheckpoint + + +class TinyDataset(Dataset): + def __init__(self, n: int = 4): + self.x = torch.arange(n, dtype=torch.float32).view(-1, 1) + self.y = self.x.clone() + + def __len__(self): + return len(self.x) + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + +class ValMetricModule(LightningModule): + def __init__(self, val_scores: list[float]): + super().__init__() + self.layer = nn.Linear(1, 1) + self._val_scores = [float(s) for s in val_scores] + + # LightningModule API (minimal) + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.layer(x) + loss = F.mse_loss(y_hat, y) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + # do nothing per-step; we log at epoch end + pass + + def on_validation_epoch_end(self): + # Log a validation metric only at validation epoch end + # Values increase across epochs; best should be the last epoch + score = self._val_scores[self.current_epoch] + # use logger=True so it lands in trainer.callback_metrics + self.log("auroc", torch.tensor(score, dtype=torch.float32), prog_bar=False, logger=True) + + def configure_optimizers(self): + return torch.optim.SGD(self.parameters(), lr=0.01) + + +@pytest.mark.parametrize("val_scores", [[0.1, 0.5, 1.0]]) +def test_model_checkpoint_every_n_train_steps_with_val_metric_saves_after_val(tmp_path, val_scores): + """Reproduces #20919: Using every_n_train_steps with a validation-only metric should save the best checkpoint only + after the metric is computed at validation, not earlier at the train-step boundary. + + Expectation: best_model_score equals the last (max) val score. + + """ + seed_everything(123) + + # 2 train batches per epoch (so checkpoint triggers at the epoch boundary) + ds = TinyDataset(n=4) + train_loader = DataLoader(ds, batch_size=2, shuffle=False) + val_loader = DataLoader(ds, batch_size=2, shuffle=False) + + model = ValMetricModule(val_scores=val_scores) + + ckpt = ModelCheckpoint( + dirpath=tmp_path, + monitor="auroc", + mode="max", + save_top_k=1, + # critical: trigger on train steps, not on epoch end + every_n_train_steps=2, # equal to number of train batches per epoch + train_time_interval=None, + every_n_epochs=0, + save_on_train_epoch_end=False, + save_weights_only=True, + ) + + trainer = Trainer( + max_epochs=len(val_scores), + accelerator="cpu", + devices=1, + callbacks=[ckpt], + num_sanity_val_steps=0, + log_every_n_steps=1, + limit_train_batches=2, + limit_val_batches=1, + enable_checkpointing=True, + enable_model_summary=False, + logger=False, + ) + + trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader) + + assert ckpt.best_model_score is not None + # Should equal the last (max) validation score + expected = max(val_scores) + actual = float(ckpt.best_model_score) + assert math.isclose(actual, expected, rel_tol=0, abs_tol=1e-6), ( + f"best_model_score should be {expected} (last/maximum val score), got {actual}.\n" + f"This indicates the checkpoint was saved before the validation metric was computed." + ) From e5371287f1ddbcdf0115b892fae52643e7d69562 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 09:34:31 +0200 Subject: [PATCH 06/40] build(deps): bump google-github-actions/auth from 2 to 3 (#21139) Bumps [google-github-actions/auth](https://github.com/google-github-actions/auth) from 2 to 3. - [Release notes](https://github.com/google-github-actions/auth/releases) - [Changelog](https://github.com/google-github-actions/auth/blob/main/CHANGELOG.md) - [Commits](https://github.com/google-github-actions/auth/compare/v2...v3) --- updated-dependencies: - dependency-name: google-github-actions/auth dependency-version: '3' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit c504088b033e8d1fa9df2a73870a58fe12cceeef) --- .github/workflows/docs-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index 69d35e605db5b..925e7ca45ccec 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -165,7 +165,7 @@ jobs: path: docs/build/html/ - name: Authenticate to Google Cloud - uses: google-github-actions/auth@v2 + uses: google-github-actions/auth@v3 with: credentials_json: ${{ secrets.GCS_SA_KEY }} From 98a4d0a6d542402e432499368e66a60e4f130336 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 09:34:49 +0200 Subject: [PATCH 07/40] build(deps): bump google-github-actions/setup-gcloud from 2 to 3 (#21140) Bumps [google-github-actions/setup-gcloud](https://github.com/google-github-actions/setup-gcloud) from 2 to 3. - [Release notes](https://github.com/google-github-actions/setup-gcloud/releases) - [Changelog](https://github.com/google-github-actions/setup-gcloud/blob/main/CHANGELOG.md) - [Commits](https://github.com/google-github-actions/setup-gcloud/compare/v2...v3) --- updated-dependencies: - dependency-name: google-github-actions/setup-gcloud dependency-version: '3' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit 9a41f1f68e3c2dc322c0bec66893822111426a57) --- .github/workflows/docs-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index 925e7ca45ccec..5fb961798ad22 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -170,7 +170,7 @@ jobs: credentials_json: ${{ secrets.GCS_SA_KEY }} - name: Setup gcloud - uses: google-github-actions/setup-gcloud@v2 + uses: google-github-actions/setup-gcloud@v3 with: project_id: ${{ secrets.GCS_PROJECT }} From 84018281609758d4506e12d55d77e3e70046f0b0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 09:36:34 +0200 Subject: [PATCH 08/40] build(deps): update ipython[notebook] requirement from <9.5.0 to <9.6.0 in /requirements (#21134) build(deps): update ipython[notebook] requirement in /requirements Updates the requirements on [ipython[notebook]](https://github.com/ipython/ipython) to permit the latest version. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/rel-0.8.4...9.5.0) --- updated-dependencies: - dependency-name: ipython[notebook] dependency-version: 9.5.0 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit 2e4c99c2a0a2425c053e01c92065ae927bb53a4f) --- requirements/pytorch/docs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index a3e2e88967f75..7ee6c8bb309cb 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,7 +1,7 @@ -r ../docs.txt nbformat # used for generate empty notebook -ipython[notebook] <9.5.0 +ipython[notebook] <9.6.0 setuptools<81.0 # workaround for `error in ipython setup command: use_2to3 is invalid.` onnxscript >= 0.2.2, < 0.5.0 From b49838b2daa857d6f32e96daaf2770670f11b26f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 11:45:18 +0200 Subject: [PATCH 09/40] build(deps): bump coverage from 7.10.5 to 7.10.6 in /requirements (#21135) Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.10.5 to 7.10.6. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.10.5...7.10.6) --- updated-dependencies: - dependency-name: coverage dependency-version: 7.10.6 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit a93a5bbc0bf0058c06a562edf8f5c322bfd0ac5c) --- requirements/fabric/test.txt | 2 +- requirements/pytorch/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/test.txt b/requirements/fabric/test.txt index a9b4271cac2c3..62a1333d44286 100644 --- a/requirements/fabric/test.txt +++ b/requirements/fabric/test.txt @@ -1,4 +1,4 @@ -coverage ==7.10.5 +coverage ==7.10.6 numpy >=1.21.0, <1.27.0 pytest ==8.4.1 pytest-cov ==6.2.1 diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 1fd3ec790055f..440d1a4814294 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -1,4 +1,4 @@ -coverage ==7.10.5 +coverage ==7.10.6 pytest ==8.4.1 pytest-cov ==6.2.1 pytest-timeout ==2.4.0 From 9de35b1b5ec5c4b098c79500cc352429713434fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 11:45:26 +0200 Subject: [PATCH 10/40] build(deps): bump pytest-rerunfailures from 15.1 to 16.0 in /requirements (#21137) build(deps): bump pytest-rerunfailures in /requirements Bumps [pytest-rerunfailures](https://github.com/pytest-dev/pytest-rerunfailures) from 15.1 to 16.0. - [Changelog](https://github.com/pytest-dev/pytest-rerunfailures/blob/master/CHANGES.rst) - [Commits](https://github.com/pytest-dev/pytest-rerunfailures/compare/15.1...16.0) --- updated-dependencies: - dependency-name: pytest-rerunfailures dependency-version: '16.0' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 824dfb1e72965a0cb9feac91c730f99c383ee059) --- requirements/fabric/test.txt | 2 +- requirements/pytorch/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/test.txt b/requirements/fabric/test.txt index 62a1333d44286..222e9cf412b3f 100644 --- a/requirements/fabric/test.txt +++ b/requirements/fabric/test.txt @@ -3,7 +3,7 @@ numpy >=1.21.0, <1.27.0 pytest ==8.4.1 pytest-cov ==6.2.1 pytest-timeout ==2.4.0 -pytest-rerunfailures ==15.1 +pytest-rerunfailures ==16.0 pytest-random-order ==1.2.0 click ==8.1.8; python_version < "3.11" click ==8.2.1; python_version > "3.10" diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 440d1a4814294..444d814782c38 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -2,7 +2,7 @@ coverage ==7.10.6 pytest ==8.4.1 pytest-cov ==6.2.1 pytest-timeout ==2.4.0 -pytest-rerunfailures ==15.1 +pytest-rerunfailures ==16.0 pytest-random-order ==1.2.0 # needed in tests From 12a75bbb42750e23d6b77d59f260cc0c4c39438d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 12:10:25 +0200 Subject: [PATCH 11/40] build(deps): update onnx requirement from <1.19.0,>1.12.0 to >1.12.0,<1.20.0 in /requirements (#21136) build(deps): update onnx requirement in /requirements Updates the requirements on [onnx](https://github.com/onnx/onnx) to permit the latest version. - [Release notes](https://github.com/onnx/onnx/releases) - [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog-ml.md) - [Commits](https://github.com/onnx/onnx/compare/v1.13.0...v1.19.0) --- updated-dependencies: - dependency-name: onnx dependency-version: 1.19.0 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit 9ace8b7b7eb671f68ef7b92ee0f759e94046faad) --- requirements/pytorch/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 444d814782c38..86b765c37237f 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -9,7 +9,7 @@ pytest-random-order ==1.2.0 cloudpickle >=1.3, <3.2.0 scikit-learn >0.22.1, <1.8.0 numpy >1.20.0, <1.27.0 -onnx >1.12.0, <1.19.0 +onnx >1.12.0, <1.20.0 onnxruntime >=1.12.0, <1.23.0 onnxscript >= 0.1.0, < 0.5.0 psutil <7.0.1 # for `DeviceStatsMonitor` From 7160273a41d83d3375d6eddd239ee56e2b67a9be Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 12:25:52 +0200 Subject: [PATCH 12/40] build(deps): update typing-extensions requirement from <4.15.0,>4.5.0 to >4.5.0,<4.16.0 in /requirements (#21133) build(deps): update typing-extensions requirement in /requirements Updates the requirements on [typing-extensions](https://github.com/python/typing_extensions) to permit the latest version. - [Release notes](https://github.com/python/typing_extensions/releases) - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.6.0...4.15.0) --- updated-dependencies: - dependency-name: typing-extensions dependency-version: 4.15.0 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 46445cb49f0dc187176d2cb2f5d7b6377c32a34b) --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index b740cf2d7cdd6..00d2b9bdaf427 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -4,5 +4,5 @@ torch >=2.1.0, <2.9.0 fsspec[http] >=2022.5.0, <2025.8.0 packaging >=20.0, <=25.0 -typing-extensions >4.5.0, <4.15.0 +typing-extensions >4.5.0, <4.16.0 lightning-utilities >=0.10.0, <0.16.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index ef798883c12ef..6e684d995c1b7 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -7,5 +7,5 @@ PyYAML >5.4, <6.1.0 fsspec[http] >=2022.5.0, <2025.8.0 torchmetrics >0.7.0, <1.9.0 packaging >=20.0, <=25.0 -typing-extensions >4.5.0, <4.15.0 +typing-extensions >4.5.0, <4.16.0 lightning-utilities >=0.10.0, <0.16.0 From f8d700bc02dd8d5deff236bb601d5f244de20c2c Mon Sep 17 00:00:00 2001 From: Shion Matsumoto Date: Mon, 1 Sep 2025 06:45:37 -0400 Subject: [PATCH 13/40] Simplify workflow matrix (#21132) * simplify * add config field (cherry picked from commit db77fa7a44a0c3c49a6d5860fec075de8785dc7d) --- .github/workflows/ci-tests-pytorch.yml | 71 +++++++++++--------------- 1 file changed, 29 insertions(+), 42 deletions(-) diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index e5e9ddd23c06e..011cd0b6a9cbc 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -42,42 +42,29 @@ jobs: strategy: fail-fast: false matrix: - include: + os: [macOS-14, ubuntu-22.04, windows-2022] + config: # only run PyTorch latest - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } + - { pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } + - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } + - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } + - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } + - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } + # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } - # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } - - { os: "windows-2022", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } + - { pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } + # "pytorch" installs the standalone package - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.7" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.7" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.7" } + - { pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.7" } + # adding recently cut Torch 2.7 - FUTURE - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.8" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.8" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.8" } + - { pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.8" } + + # "oldest" versions tests, only on minimum Python + - { pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } timeout-minutes: 50 env: - PACKAGE_NAME: ${{ matrix.pkg-name }} + PACKAGE_NAME: ${{ matrix.config.pkg-name }} TORCH_URL: "https://download.pytorch.org/whl/cpu/" TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" @@ -88,10 +75,10 @@ jobs: steps: - uses: actions/checkout@v5 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.config.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version || '3.9' }} + python-version: ${{ matrix.config.python-version || '3.9' }} - name: basic setup run: pip install -q -r .actions/requirements.txt @@ -111,7 +98,7 @@ jobs: pip install -q -r requirements/ci.txt python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py for fpath in `ls requirements/**/*.txt`; do \ - python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \ + python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \ done cat requirements/pytorch/base.txt @@ -127,13 +114,13 @@ jobs: - name: Env. variables run: | # Switch PyTorch URL between stable and test/future - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.config.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope - python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV + python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.config.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage - python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.pkg-name}}' != 'lightning' else 'pytorch-'))" >> $GITHUB_ENV + python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.config.pkg-name}}' != 'lightning' else 'pytorch-'))" >> $GITHUB_ENV # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support" - python -c "print('USE_LIBUV=0' if '${{matrix.os}}' == 'windows-2022' and '${{matrix.pytorch-version}}' == '2.4' else '')" >> $GITHUB_ENV + python -c "print('USE_LIBUV=0' if '${{matrix.os}}' == 'windows-2022' and '${{matrix.config.pytorch-version}}' == '2.4' else '')" >> $GITHUB_ENV - name: Install package & dependencies timeout-minutes: 20 @@ -145,11 +132,11 @@ jobs: --find-links="${PYPI_CACHE_DIR}" pip list - name: Drop LAI from extensions - if: ${{ matrix.pkg-name != 'lightning' }} + if: ${{ matrix.config.pkg-name != 'lightning' }} # Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it run: pip uninstall -y lightning - name: Drop PL for LAI - if: ${{ matrix.pkg-name == 'lightning' }} + if: ${{ matrix.config.pkg-name == 'lightning' }} run: pip uninstall -y pytorch-lightning - name: Dump handy wheels if: github.event_name == 'push' && github.ref == 'refs/heads/master' @@ -170,10 +157,10 @@ jobs: run: | set -e python requirements/pytorch/check-avail-extras.py - python -c "from torch import __version__ as ver; assert ver.startswith('${{ matrix.pytorch-version }}'), ver" + python -c "from torch import __version__ as ver; assert ver.startswith('${{ matrix.config.pytorch-version }}'), ver" - name: Adjust tests / env. -> PL - if: ${{ matrix.pkg-name != 'lightning' }} + if: ${{ matrix.config.pkg-name != 'lightning' }} run: | python .actions/assistant.py copy_replace_imports --source_dir="./tests" \ --source_import="lightning.fabric,lightning.pytorch" \ @@ -223,7 +210,7 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} file: tests/tests_pytorch/coverage.xml - flags: ${{ env.COVERAGE_SCOPE }},cpu,pytest-full,python${{ matrix.python-version }},pytorch${{ matrix.pytorch-version }} + flags: ${{ env.COVERAGE_SCOPE }},cpu,pytest-full,python${{ matrix.config.python-version }},pytorch${{ matrix.config.pytorch-version }} name: CPU-coverage fail_ci_if_error: false From 4dbacdccb836d94b2fd11dd3c90f38afe0fd4e74 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Mon, 1 Sep 2025 14:19:07 +0200 Subject: [PATCH 14/40] Docs on hook call order (#21120) * add hook order * add to index * Apply suggestions from code review * BoringModel * testoutput --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Jirka B (cherry picked from commit 3d56296978f3bd4d1f0957dbf361b9a0a53731fd) --- docs/source-pytorch/common/hooks.rst | 313 +++++++++++++++++++++++++ docs/source-pytorch/glossary/index.rst | 8 + 2 files changed, 321 insertions(+) create mode 100644 docs/source-pytorch/common/hooks.rst diff --git a/docs/source-pytorch/common/hooks.rst b/docs/source-pytorch/common/hooks.rst new file mode 100644 index 0000000000000..79b1118a0dcf4 --- /dev/null +++ b/docs/source-pytorch/common/hooks.rst @@ -0,0 +1,313 @@ +########################## +Hooks in PyTorch Lightning +########################## + +Hooks in Pytorch Lightning allow you to customize the training, validation, and testing logic of your models. They +provide a way to insert custom behavior at specific points during the training process without modifying the core +training loop. There are several categories of hooks available in PyTorch Lightning: + +1. **Setup/Teardown Hooks**: Called at the beginning and end of training phases +2. **Training Hooks**: Called during the training loop +3. **Validation Hooks**: Called during validation +4. **Test Hooks**: Called during testing +5. **Prediction Hooks**: Called during prediction +6. **Optimizer Hooks**: Called around optimizer operations +7. **Checkpoint Hooks**: Called during checkpoint save/load operations +8. **Exception Hooks**: Called when exceptions occur + +Nearly all hooks can be implemented in three places within your code: + +- **LightningModule**: The main module where you define your model and training logic. +- **Callbacks**: Custom classes that can be passed to the Trainer to handle specific events. +- **Strategy**: Custom strategies for distributed training. + +Importantly, because logic can be place in the same hook but in different places the call order of hooks is in +important to understand. The following order is always used: + +1. Callbacks, called in the order they are passed to the Trainer. +2. ``LightningModule`` +3. Strategy + +.. testcode:: + + from lightning.pytorch import Trainer + from lightning.pytorch.callbacks import Callback + from lightning.pytorch.demos import BoringModel + + class MyModel(BoringModel): + def on_train_start(self): + print("Model: Training is starting!") + + class MyCallback(Callback): + def on_train_start(self, trainer, pl_module): + print("Callback: Training is starting!") + + model = MyModel() + callback = MyCallback() + trainer = Trainer(callbacks=[callback], logger=False, max_epochs=1) + trainer.fit(model) + +.. testoutput:: + :hide: + :options: +ELLIPSIS, +NORMALIZE_WHITESPACE + + ┏━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┓ + ┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃ FLOPs ┃ + ┡━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━┩ + │ 0 │ layer │ Linear │ 66 │ train │ 0 │ + └───┴───────┴────────┴────────┴───────┴───────┘ + ... + Callback: Training is starting! + Model: Training is starting! + Epoch 0/0 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64/64 ... + + +.. note:: + There are a few exceptions to this pattern: + + - **on_train_epoch_end**: Non-monitoring callbacks are called first, then ``LightningModule``, then monitoring callbacks + - **Optimizer hooks** (on_before_backward, on_after_backward, on_before_optimizer_step): Only callbacks and ``LightningModule`` are called + - Some internal hooks may only call ``LightningModule`` or Strategy + +************************ +Training Loop Hook Order +************************ + +The following diagram shows the execution order of hooks during a typical training loop e.g. calling `trainer.fit()`, +with the source of each hook indicated: + +.. code-block:: text + + Training Process Flow: + + trainer.fit() + │ + ├── setup(stage="fit") + │ └── [Callbacks only] + │ + ├── on_fit_start() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── on_sanity_check_start() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ ├── on_validation_start() + │ │ ├── [Callbacks] + │ │ ├── [LightningModule] + │ │ └── [Strategy] + │ ├── on_validation_epoch_start() + │ │ ├── [Callbacks] + │ │ ├── [LightningModule] + │ │ └── [Strategy] + │ │ ├── [for each validation batch] + │ │ │ ├── on_validation_batch_start() + │ │ │ │ ├── [Callbacks] + │ │ │ │ ├── [LightningModule] + │ │ │ │ └── [Strategy] + │ │ │ └── on_validation_batch_end() + │ │ │ ├── [Callbacks] + │ │ │ ├── [LightningModule] + │ │ │ └── [Strategy] + │ │ └── [end validation batches] + │ ├── on_validation_epoch_end() + │ │ ├── [Callbacks] + │ │ ├── [LightningModule] + │ │ └── [Strategy] + │ └── on_validation_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + ├── on_sanity_check_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── on_train_start() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── [Training Epochs Loop] + │ │ + │ ├── on_train_epoch_start() + │ │ ├── [Callbacks] + │ │ └── [LightningModule] + │ │ + │ ├── [Training Batches Loop] + │ │ │ + │ │ ├── on_train_batch_start() + │ │ │ ├── [Callbacks] + │ │ │ ├── [LightningModule] + │ │ │ └── [Strategy] + │ │ │ + │ │ ├── on_before_zero_grad() + │ │ │ ├── [Callbacks] + │ │ │ └── [LightningModule] + │ │ │ + │ │ ├── [Forward Pass - training_step()] + │ │ │ └── [Strategy only] + │ │ │ + │ │ ├── on_before_backward() + │ │ │ ├── [Callbacks] + │ │ │ └── [LightningModule] + │ │ │ + │ │ ├── [Backward Pass] + │ │ │ └── [Strategy only] + │ │ │ + │ │ ├── on_after_backward() + │ │ │ ├── [Callbacks] + │ │ │ └── [LightningModule] + │ │ │ + │ │ ├── on_before_optimizer_step() + │ │ │ ├── [Callbacks] + │ │ │ └── [LightningModule] + │ │ │ + │ │ ├── [Optimizer Step] + │ │ │ └── [LightningModule only - optimizer_step()] + │ │ │ + │ │ └── on_train_batch_end() + │ │ ├── [Callbacks] + │ │ └── [LightningModule] + │ │ + │ │ [Optional: Validation during training] + │ │ ├── on_validation_start() + │ │ │ ├── [Callbacks] + │ │ │ ├── [LightningModule] + │ │ │ └── [Strategy] + │ │ ├── on_validation_epoch_start() + │ │ │ ├── [Callbacks] + │ │ │ ├── [LightningModule] + │ │ │ └── [Strategy] + │ │ │ ├── [for each validation batch] + │ │ │ │ ├── on_validation_batch_start() + │ │ │ │ │ ├── [Callbacks] + │ │ │ │ │ ├── [LightningModule] + │ │ │ │ │ └── [Strategy] + │ │ │ │ └── on_validation_batch_end() + │ │ │ │ ├── [Callbacks] + │ │ │ │ ├── [LightningModule] + │ │ │ │ └── [Strategy] + │ │ │ └── [end validation batches] + │ │ ├── on_validation_epoch_end() + │ │ │ ├── [Callbacks] + │ │ │ ├── [LightningModule] + │ │ │ └── [Strategy] + │ │ └── on_validation_end() + │ │ ├── [Callbacks] + │ │ ├── [LightningModule] + │ │ └── [Strategy] + │ │ + │ └── on_train_epoch_end() **SPECIAL CASE** + │ ├── [Callbacks - Non-monitoring only] + │ ├── [LightningModule] + │ └── [Callbacks - Monitoring only] + │ + ├── [End Training Epochs] + │ + ├── on_train_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── on_fit_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + └── teardown(stage="fit") + └── [Callbacks only] + +*********************** +Testing Loop Hook Order +*********************** + +When running tests with ``trainer.test()``: + +.. code-block:: text + + trainer.test() + │ + ├── setup(stage="test") + │ └── [Callbacks only] + ├── on_test_start() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── [Test Epochs Loop] + │ │ + │ ├── on_test_epoch_start() + │ │ ├── [Callbacks] + │ │ ├── [LightningModule] + │ │ └── [Strategy] + │ │ + │ ├── [Test Batches Loop] + │ │ │ + │ │ ├── on_test_batch_start() + │ │ │ ├── [Callbacks] + │ │ │ ├── [LightningModule] + │ │ │ └── [Strategy] + │ │ │ + │ │ └── on_test_batch_end() + │ │ ├── [Callbacks] + │ │ ├── [LightningModule] + │ │ └── [Strategy] + │ │ + │ └── on_test_epoch_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── on_test_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + └── teardown(stage="test") + └── [Callbacks only] + +************************** +Prediction Loop Hook Order +************************** + +When running predictions with ``trainer.predict()``: + +.. code-block:: text + + trainer.predict() + │ + ├── setup(stage="predict") + │ └── [Callbacks only] + ├── on_predict_start() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + │ + ├── [Prediction Epochs Loop] + │ │ + │ ├── on_predict_epoch_start() + │ │ ├── [Callbacks] + │ │ └── [LightningModule] + │ │ + │ ├── [Prediction Batches Loop] + │ │ │ + │ │ ├── on_predict_batch_start() + │ │ │ ├── [Callbacks] + │ │ │ └── [LightningModule] + │ │ │ + │ │ └── on_predict_batch_end() + │ │ ├── [Callbacks] + │ │ └── [LightningModule] + │ │ + │ └── on_predict_epoch_end() + │ ├── [Callbacks] + │ └── [LightningModule] + │ + ├── on_predict_end() + │ ├── [Callbacks] + │ ├── [LightningModule] + │ └── [Strategy] + └── teardown(stage="predict") + └── [Callbacks only] diff --git a/docs/source-pytorch/glossary/index.rst b/docs/source-pytorch/glossary/index.rst index 45683c67c1708..c904932737b02 100644 --- a/docs/source-pytorch/glossary/index.rst +++ b/docs/source-pytorch/glossary/index.rst @@ -20,6 +20,7 @@ FSDP <../advanced/model_parallel/fsdp> GPU <../accelerators/gpu> Half precision <../common/precision> + Hooks <../common/hooks> HPU <../integrations/hpu/index> Inference <../deploy/production_intermediate> Lightning CLI <../cli/lightning_cli> @@ -179,6 +180,13 @@ Glossary :button_link: ../common/precision.html :height: 100 +.. displayitem:: + :header: Hooks + :description: How to customize the training, validation, and testing logic + :col_css: col-md-12 + :button_link: ../common/hooks.html + :height: 100 + .. displayitem:: :header: HPU :description: Habana Gaudi AI Processor Unit for faster training From 8f8ed372801c2fff6ff23d4af4614052d9192196 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Mon, 1 Sep 2025 15:16:17 +0200 Subject: [PATCH 15/40] Add missing device id for pytorch 2.8 (#21105) * add missing device id for pytorch 2.8 * skip device id for older pytorch versions * add testing * fix mypy without touching submodule * fix failing tests * Apply suggestions from code review --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Deependu Co-authored-by: Jirka B Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 6fc44c9254e9ab2c4e49643f34f1c18d05c36e91) --- src/lightning/fabric/CHANGELOG.md | 4 +- src/lightning/fabric/strategies/ddp.py | 6 ++- src/lightning/fabric/strategies/fsdp.py | 5 +- .../fabric/strategies/model_parallel.py | 5 +- src/lightning/pytorch/CHANGELOG.md | 2 + src/lightning/pytorch/strategies/ddp.py | 7 ++- src/lightning/pytorch/strategies/fsdp.py | 7 ++- .../pytorch/strategies/model_parallel.py | 7 ++- tests/tests_fabric/strategies/test_ddp.py | 49 ++++++++++++++++++- tests/tests_fabric/strategies/test_fsdp.py | 7 ++- .../strategies/test_model_parallel.py | 6 ++- tests/tests_pytorch/strategies/test_ddp.py | 36 +++++++++++++- .../strategies/test_ddp_integration.py | 2 +- tests/tests_pytorch/strategies/test_fsdp.py | 7 ++- .../strategies/test_model_parallel.py | 6 ++- 15 files changed, 136 insertions(+), 20 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 8fe4e4e2f6d17..54acb25eaf682 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -15,14 +15,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) ## [2.5.4] - 2025-08-29 ### Changed -- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#20913](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) +- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#21119](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) ## [2.5.3] - 2025-08-13 diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index ce47e4e403c34..e826b910c16d3 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -41,6 +41,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.rank_zero import rank_zero_only _DDP_FORK_ALIASES = ( @@ -212,7 +213,10 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + kwargs: dict[str, Any] = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 74af1dd0e8f43..25b50d6a67332 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -662,7 +662,10 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + kwargs: dict[str, Any] = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index ace23a9c7a2c5..0d49ddf91a0bc 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -302,7 +302,10 @@ def _setup_distributed(self) -> None: self._set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + kwargs: dict[str, Any] = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index cdaea3e72ecbb..81c34c1ee2a91 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106)) +- Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) + ## [2.5.4] - 2025-08-29 diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index fd3f66ef42471..92206e1accc31 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -36,7 +36,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _IS_WINDOWS +from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed from lightning.fabric.utilities.types import ReduceOp @@ -200,7 +200,10 @@ def setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + kwargs: dict[str, Any] = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 55ea354a5cb60..3fbd0f9cd5f0a 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -61,7 +61,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -260,7 +260,10 @@ def setup_environment(self) -> None: self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + kwargs: dict[str, Any] = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here if isinstance(self.kwargs.get("device_mesh"), tuple): diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index 82fec205af731..e0286dbe2e0e6 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -39,7 +39,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3, _TORCH_GREATER_EQUAL_2_4 from lightning.fabric.utilities.init import _materialize_distributed_module from lightning.fabric.utilities.load import _METADATA_FILENAME from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -350,7 +350,10 @@ def _setup_distributed(self) -> None: self.set_world_ranks() self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None - _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + kwargs: dict[str, Any] = {"timeout": self._timeout} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None + _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs) def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index fa5c975228a5e..f302da5d1bc4f 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -25,6 +25,7 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies import DDPStrategy from lightning.fabric.strategies.ddp import _DDPBackwardSyncControl +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from tests_fabric.helpers.runif import RunIf @@ -168,6 +169,52 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs + ) + + +@mock.patch("torch.distributed.init_process_group") +def test_device_id_passed_for_cuda_devices(init_process_group_mock): + """Test that device_id is passed to init_process_group for CUDA devices but not for CPU.""" + # Test with CPU device - device_id should be None + cpu_strategy = DDPStrategy(parallel_devices=[torch.device("cpu")]) + cpu_strategy.cluster_environment = LightningEnvironment() + cpu_strategy.accelerator = Mock() + cpu_strategy.setup_environment() + + process_group_backend = cpu_strategy._get_process_group_backend() + global_rank = cpu_strategy.cluster_environment.global_rank() + world_size = cpu_strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = cpu_strategy.root_device if cpu_strategy.root_device.type != "cpu" else None + init_process_group_mock.assert_called_with( + process_group_backend, rank=global_rank, world_size=world_size, timeout=cpu_strategy._timeout, **kwargs + ) + + init_process_group_mock.reset_mock() + + # Test with CUDA device - device_id should be the device + cuda_device = torch.device("cuda", 0) + cuda_strategy = DDPStrategy(parallel_devices=[cuda_device]) + cuda_strategy.cluster_environment = LightningEnvironment() + cuda_strategy.accelerator = Mock() + cuda_strategy.setup_environment() + + process_group_backend = cuda_strategy._get_process_group_backend() + global_rank = cuda_strategy.cluster_environment.global_rank() + world_size = cuda_strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = cuda_strategy.root_device if cuda_strategy.root_device.type != "cpu" else None + init_process_group_mock.assert_called_with( + process_group_backend, + rank=global_rank, + world_size=world_size, + timeout=cuda_strategy._timeout, + **kwargs, ) diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index d5f82752a9176..6be379d36582c 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -31,7 +31,7 @@ _get_full_state_dict_context, _is_sharded_checkpoint, ) -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 def test_custom_mixed_precision(): @@ -381,8 +381,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index 78622adf66fa6..4b6cb6f8fad85 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -25,6 +25,7 @@ from lightning.fabric.strategies import ModelParallelStrategy from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint from lightning.fabric.strategies.model_parallel import _ParallelBackwardSyncControl +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from tests_fabric.helpers.runif import RunIf @@ -316,8 +317,11 @@ def test_set_timeout(init_process_group_mock, _): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 915e57440b40f..823d77d0d5848 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -20,6 +20,7 @@ from torch.nn.parallel import DistributedDataParallel from lightning.fabric.plugins.environments import LightningEnvironment +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision @@ -132,8 +133,41 @@ def test_set_timeout(mock_init_process_group): process_group_backend = trainer.strategy._get_process_group_backend() global_rank = trainer.strategy.cluster_environment.global_rank() world_size = trainer.strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = trainer.strategy.root_device if trainer.strategy.root_device.type != "cpu" else None mock_init_process_group.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs + ) + + +@mock.patch("torch.distributed.init_process_group") +def test_device_id_passed_for_cuda_devices_pytorch(mock_init_process_group): + """Test that device_id is passed to init_process_group for CUDA devices but not for CPU.""" + # Test with CPU device - device_id should be None + model = BoringModel() + ddp_strategy = DDPStrategy() + trainer = Trainer( + max_epochs=1, + accelerator="cpu", + strategy=ddp_strategy, + ) + trainer.strategy.connect(model) + trainer.lightning_module.trainer = trainer + trainer.strategy.setup_environment() + + process_group_backend = trainer.strategy._get_process_group_backend() + global_rank = trainer.strategy.cluster_environment.global_rank() + world_size = trainer.strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = trainer.strategy.root_device if trainer.strategy.root_device.type != "cpu" else None + mock_init_process_group.assert_called_with( + process_group_backend, + rank=global_rank, + world_size=world_size, + timeout=trainer.strategy._timeout, + **kwargs, ) diff --git a/tests/tests_pytorch/strategies/test_ddp_integration.py b/tests/tests_pytorch/strategies/test_ddp_integration.py index 048403366ebc7..fc3a8cfebbac0 100644 --- a/tests/tests_pytorch/strategies/test_ddp_integration.py +++ b/tests/tests_pytorch/strategies/test_ddp_integration.py @@ -66,7 +66,7 @@ def test_multi_gpu_model_ddp_fit_test(tmp_path): assert out["test_acc"] > 0.7 -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, max_torch="2.7") @mock.patch("torch.cuda.set_device") @mock.patch("lightning.pytorch.accelerators.cuda._check_cuda_matmul_precision") @mock.patch("lightning.pytorch.accelerators.cuda._clear_cuda_memory") diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 560ab19f823ca..f7c15b5930be8 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -18,7 +18,7 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.pytorch import Trainer from lightning.pytorch.callbacks import ModelCheckpoint @@ -532,8 +532,11 @@ def test_set_timeout(init_process_group_mock): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 86a95944ac20d..c803c10afa4b4 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -22,6 +22,7 @@ import torch.nn as nn from lightning.fabric.strategies.model_parallel import _is_sharded_checkpoint +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule from lightning.pytorch.plugins.environments import LightningEnvironment from lightning.pytorch.strategies import ModelParallelStrategy @@ -202,8 +203,11 @@ def test_set_timeout(init_process_group_mock, _): process_group_backend = strategy._get_process_group_backend() global_rank = strategy.cluster_environment.global_rank() world_size = strategy.cluster_environment.world_size() + kwargs = {} + if _TORCH_GREATER_EQUAL_2_3: + kwargs["device_id"] = strategy.root_device if strategy.root_device.type != "cpu" else None init_process_group_mock.assert_called_with( - process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta, **kwargs ) From 82ee1a13155b71832783b1cdbd7abed43d97efd5 Mon Sep 17 00:00:00 2001 From: Shion Matsumoto Date: Mon, 1 Sep 2025 11:47:03 -0400 Subject: [PATCH 16/40] Simplify Fabric tests workflow matrix (#21142) (cherry picked from commit e760ad5643de30a3ac01672420b8c9599bd47ca4) --- .github/workflows/ci-tests-fabric.yml | 63 +++++++++++---------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index f4c66f425cc71..939f986cc187a 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -38,42 +38,29 @@ jobs: strategy: fail-fast: false matrix: - include: + os: [macOS-14, ubuntu-22.04, windows-2022] + config: # only run PyTorch latest - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } + - { pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } + - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } + - { pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } + - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" } + - { pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } + # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } - # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } - - { os: "ubuntu-22.04", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } - - { os: "windows-2022", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } + - { pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } + # "fabric" installs the standalone package - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.7" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.7" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.7" } + - { pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.7" } + # adding recently cut Torch 2.7 - FUTURE - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.8" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.8" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.8" } + - { pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.8" } + + # "oldest" versions tests, only on minimum Python + - { pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } timeout-minutes: 25 # because of building grpcio on Mac env: - PACKAGE_NAME: ${{ matrix.pkg-name }} + PACKAGE_NAME: ${{ matrix.config.pkg-name }} FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" @@ -83,10 +70,10 @@ jobs: steps: - uses: actions/checkout@v5 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.config.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version || '3.9' }} + python-version: ${{ matrix.config.python-version || '3.9' }} - name: basic setup run: pip install -q -r .actions/requirements.txt @@ -106,7 +93,7 @@ jobs: pip install -q -r requirements/ci.txt python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py for fpath in `ls requirements/**/*.txt`; do \ - python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \ + python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \ done - name: pip wheels cache @@ -121,11 +108,11 @@ jobs: - name: Expand Env. variables run: | # Switch PyTorch URL between stable and test/future - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.config.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope - python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV + python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.config.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage - python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV + python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.config.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV - name: Append Env. vars for MacOS if: ${{ runner.os == 'macOS' }} run: | @@ -155,7 +142,7 @@ jobs: cache-key: "pypi_wheels" - name: Adjust tests - if: ${{ matrix.pkg-name != 'lightning' }} + if: ${{ matrix.config.pkg-name != 'lightning' }} run: | python .actions/assistant.py copy_replace_imports --source_dir="./tests" \ --source_import="lightning.fabric" --target_import="lightning_fabric" @@ -188,7 +175,7 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} file: tests/tests_fabric/coverage.xml - flags: ${{ env.COVERAGE_SCOPE }},cpu,pytest,python${{ matrix.python-version }} + flags: ${{ env.COVERAGE_SCOPE }},cpu,pytest,python${{ matrix.config.python-version }} name: CPU-coverage fail_ci_if_error: false From 3df3fcb1b01d71a425de8cdb596089b07a426fb7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:47:16 +0200 Subject: [PATCH 17/40] build(deps): update ipython[all] requirement from <8.19.0 to <9.0.0 in /requirements (#21138) * build(deps): update ipython[all] requirement in /requirements Updates the requirements on [ipython[all]](https://github.com/ipython/ipython) to permit the latest version. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/rel-0.8.4...9.5.0) --- updated-dependencies: - dependency-name: ipython[all] dependency-version: 9.5.0 dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Apply suggestions from code review --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit da7f2f9a9ff116b2316d4475a58d310834b376c6) --- requirements/pytorch/examples.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 84ea80df6ff0c..c34309a0234f2 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -3,5 +3,5 @@ requests <2.33.0 torchvision >=0.16.0, <0.24.0 -ipython[all] <8.19.0 +ipython[all] >=8.0.0, <9.0.0 torchmetrics >=0.10.0, <1.9.0 From ff13bea694cadc6ba48cfa5c4304d9a9eed24a29 Mon Sep 17 00:00:00 2001 From: GdoongMathew Date: Tue, 2 Sep 2025 15:57:23 +0800 Subject: [PATCH 18/40] docs: update `optimizer_zero_grad` order, and the backward pass. (#21144) (cherry picked from commit 06bed20190c2e428c00ef73c6aa70ab423b2a47a) --- docs/source-pytorch/common/hooks.rst | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/source-pytorch/common/hooks.rst b/docs/source-pytorch/common/hooks.rst index 79b1118a0dcf4..31a65430649f2 100644 --- a/docs/source-pytorch/common/hooks.rst +++ b/docs/source-pytorch/common/hooks.rst @@ -143,23 +143,24 @@ with the source of each hook indicated: │ │ │ ├── [LightningModule] │ │ │ └── [Strategy] │ │ │ - │ │ ├── on_before_zero_grad() - │ │ │ ├── [Callbacks] - │ │ │ └── [LightningModule] - │ │ │ │ │ ├── [Forward Pass - training_step()] │ │ │ └── [Strategy only] │ │ │ - │ │ ├── on_before_backward() + │ │ ├── on_before_zero_grad() │ │ │ ├── [Callbacks] │ │ │ └── [LightningModule] │ │ │ - │ │ ├── [Backward Pass] - │ │ │ └── [Strategy only] + │ │ ├── optimizer_zero_grad() + │ │ │ └── [LightningModule only - optimizer_zero_grad()] │ │ │ - │ │ ├── on_after_backward() - │ │ │ ├── [Callbacks] - │ │ │ └── [LightningModule] + │ │ ├── [Backward Pass - Strategy.backward()] + │ │ │ ├── on_before_backward() + │ │ │ │ ├── [Callbacks] + │ │ │ │ └── [LightningModule] + │ │ │ ├── LightningModule.backward() + │ │ │ └── on_after_backward() + │ │ │ ├── [Callbacks] + │ │ │ └── [LightningModule] │ │ │ │ │ ├── on_before_optimizer_step() │ │ │ ├── [Callbacks] From 8bd363c2f8085793f05a320592136ebad00e4043 Mon Sep 17 00:00:00 2001 From: GdoongMathew Date: Tue, 2 Sep 2025 15:58:03 +0800 Subject: [PATCH 19/40] docs: move `on_fit_end` hook to `teardown` stage. (#21143) * docs: move `on_fit_end` hook to `teardown` stage. * Apply suggestion from @bhimrazy --------- Co-authored-by: Bhimraj Yadav (cherry picked from commit 630db82da619ec94e5c3f417cfb248e3fb495c1b) --- docs/source-pytorch/common/hooks.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/source-pytorch/common/hooks.rst b/docs/source-pytorch/common/hooks.rst index 31a65430649f2..4c20ad1b857f0 100644 --- a/docs/source-pytorch/common/hooks.rst +++ b/docs/source-pytorch/common/hooks.rst @@ -213,13 +213,14 @@ with the source of each hook indicated: │ ├── [LightningModule] │ └── [Strategy] │ - ├── on_fit_end() - │ ├── [Callbacks] - │ ├── [LightningModule] - │ └── [Strategy] - │ └── teardown(stage="fit") - └── [Callbacks only] + ├── [Strategy] + ├── on_fit_end() + │ ├── [Callbacks] + │ └── [LightningModule] + ├── [LightningDataModule] + ├── [Callbacks] + └── [LightningModule] *********************** Testing Loop Hook Order From 1607a336973ac975e2ab84f7a2fa6b2b8a529f4b Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Tue, 2 Sep 2025 10:05:22 +0200 Subject: [PATCH 20/40] Fix TQDM progress bar showing the wrong total when using a finite and iterable dataloader (#21147) * fix implementation reset * add testing * changelog (cherry picked from commit 14a57c76bd4fe9b7fd8c1a313441a514b41563e4) --- src/lightning/pytorch/CHANGELOG.md | 3 ++ .../callbacks/progress/tqdm_progress.py | 16 +++++-- .../progress/test_tqdm_progress_bar.py | 47 +++++++++++++++++++ 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 81c34c1ee2a91..b9bd312fb3438 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) +- Fixed `TQDMProgressBar` not resetting correctly when using both a finite and iterable dataloader ([#21147](https://github.com/Lightning-AI/pytorch-lightning/pull/21147)) + + ## [2.5.4] - 2025-08-29 ### Fixed diff --git a/src/lightning/pytorch/callbacks/progress/tqdm_progress.py b/src/lightning/pytorch/callbacks/progress/tqdm_progress.py index 4ef260f00006d..4a0b0d67041ba 100644 --- a/src/lightning/pytorch/callbacks/progress/tqdm_progress.py +++ b/src/lightning/pytorch/callbacks/progress/tqdm_progress.py @@ -265,7 +265,9 @@ def on_train_start(self, *_: Any) -> None: def on_train_epoch_start(self, trainer: "pl.Trainer", *_: Any) -> None: if self._leave: self.train_progress_bar = self.init_train_tqdm() - self.train_progress_bar.reset(convert_inf(self.total_train_batches)) + total = convert_inf(self.total_train_batches) + self.train_progress_bar.reset() + self.train_progress_bar.total = total self.train_progress_bar.initial = 0 self.train_progress_bar.set_description(f"Epoch {trainer.current_epoch}") @@ -306,7 +308,9 @@ def on_validation_batch_start( if not self.has_dataloader_changed(dataloader_idx): return - self.val_progress_bar.reset(convert_inf(self.total_val_batches_current_dataloader)) + total = convert_inf(self.total_val_batches_current_dataloader) + self.val_progress_bar.reset() + self.val_progress_bar.total = total self.val_progress_bar.initial = 0 desc = self.sanity_check_description if trainer.sanity_checking else self.validation_description self.val_progress_bar.set_description(f"{desc} DataLoader {dataloader_idx}") @@ -348,7 +352,9 @@ def on_test_batch_start( if not self.has_dataloader_changed(dataloader_idx): return - self.test_progress_bar.reset(convert_inf(self.total_test_batches_current_dataloader)) + total = convert_inf(self.total_test_batches_current_dataloader) + self.test_progress_bar.reset() + self.test_progress_bar.total = total self.test_progress_bar.initial = 0 self.test_progress_bar.set_description(f"{self.test_description} DataLoader {dataloader_idx}") @@ -387,7 +393,9 @@ def on_predict_batch_start( if not self.has_dataloader_changed(dataloader_idx): return - self.predict_progress_bar.reset(convert_inf(self.total_predict_batches_current_dataloader)) + total = convert_inf(self.total_predict_batches_current_dataloader) + self.predict_progress_bar.reset() + self.predict_progress_bar.total = total self.predict_progress_bar.initial = 0 self.predict_progress_bar.set_description(f"{self.predict_description} DataLoader {dataloader_idx}") diff --git a/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py index d93bf1cf60e9c..aafc2d577416d 100644 --- a/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py @@ -801,3 +801,50 @@ def test_tqdm_leave(leave, tmp_path): ) trainer.fit(model) assert pbar.init_train_tqdm.call_count == (4 if leave else 1) + + +@patch("lightning.pytorch.trainer.connectors.callback_connector._RICH_AVAILABLE", False) +def test_tqdm_progress_bar_reset_behavior(tmp_path): + """Test that progress bars call reset() without parameters and set total separately.""" + model = BoringModel() + + class ResetTrackingTqdm(MockTqdm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.reset_calls_with_params = [] + + def reset(self, total=None): + self.reset_calls_with_params.append(total) + super().reset(total) + + trainer = Trainer( + default_root_dir=tmp_path, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + logger=False, + enable_checkpointing=False, + ) + + pbar = trainer.progress_bar_callback + + with mock.patch("lightning.pytorch.callbacks.progress.tqdm_progress.Tqdm", ResetTrackingTqdm): + trainer.fit(model) + + train_bar = pbar.train_progress_bar + assert None in train_bar.reset_calls_with_params, ( + f"train reset() should be called without parameters, got calls: {train_bar.reset_calls_with_params}" + ) + # Verify that total was set separately to the expected value + assert 2 in train_bar.total_values, ( + f"train total should be set to 2 after reset(), got total_values: {train_bar.total_values}" + ) + # Verify that validation progress bar reset() was called without parameters + val_bar = pbar.val_progress_bar + assert None in val_bar.reset_calls_with_params, ( + f"validation reset() should be called without parameters, got calls: {val_bar.reset_calls_with_params}" + ) + # Verify that total was set separately to the expected value + assert 2 in val_bar.total_values, ( + f"validation total should be set to 2 after reset(), got total_values: {val_bar.total_values}" + ) From 581d5762a4b4977099f0da5ed9e983c1f6be524d Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Tue, 2 Sep 2025 11:07:32 +0200 Subject: [PATCH 21/40] Warning on eval module (#21146) * add warning * tests * changelog * Apply suggestions from code review --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Deependu (cherry picked from commit 90eba3ffdbbc1bab2b4ff811cb2d2573e0b43ef1) --- src/lightning/pytorch/CHANGELOG.md | 2 ++ src/lightning/pytorch/loops/fit_loop.py | 16 +++++++++++ .../tests_pytorch/loops/test_training_loop.py | 28 +++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index b9bd312fb3438..4cc8dbcf51d3d 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [unReleased] - 2025-09-DD +- Added `PossibleUserWarning` that is raised if modules are in eval mode when training starts ([#21146](https://github.com/Lightning-AI/pytorch-lightning/pull/21146)) + ### Changed - diff --git a/src/lightning/pytorch/loops/fit_loop.py b/src/lightning/pytorch/loops/fit_loop.py index 31d6724a043a3..f25c33359a78a 100644 --- a/src/lightning/pytorch/loops/fit_loop.py +++ b/src/lightning/pytorch/loops/fit_loop.py @@ -414,6 +414,9 @@ def on_run_start(self) -> None: self.epoch_loop.val_loop.setup_data() trainer.training = True + # Check for modules in eval mode at training start + self._warn_if_modules_in_eval_mode() + call._call_callback_hooks(trainer, "on_train_start") call._call_lightning_module_hook(trainer, "on_train_start") call._call_strategy_hook(trainer, "on_train_start") @@ -515,6 +518,19 @@ def on_load_checkpoint(self, state_dict: dict) -> None: self._combined_loader_states_to_load = state_dict.get("combined_loader", []) super().on_load_checkpoint(state_dict) + def _warn_if_modules_in_eval_mode(self) -> None: + """Warn if any modules are in eval mode at the start of training.""" + model = self.trainer.lightning_module + eval_modules = [name for name, module in model.named_modules() if not module.training] + + if eval_modules: + rank_zero_warn( + f"Found {len(eval_modules)} module(s) in eval mode at the start of training." + " This may lead to unexpected behavior during training. If this is intentional," + " you can ignore this warning.", + category=PossibleUserWarning, + ) + def _should_accumulate(self) -> bool: """Whether the gradients should be accumulated.""" return self.epoch_loop._should_accumulate() diff --git a/tests/tests_pytorch/loops/test_training_loop.py b/tests/tests_pytorch/loops/test_training_loop.py index e3a4c37f6a284..f5aaa18095fc5 100644 --- a/tests/tests_pytorch/loops/test_training_loop.py +++ b/tests/tests_pytorch/loops/test_training_loop.py @@ -13,12 +13,14 @@ # limitations under the License. import itertools import logging +import warnings from unittest.mock import Mock import pytest import torch from torch.utils.data import DataLoader +from lightning.fabric.utilities.warnings import PossibleUserWarning from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.loops import _FitLoop @@ -277,3 +279,29 @@ def __iter__(self): # assert progress bar callback uses correct total steps assert pbar.train_progress_bar.total == max_steps + + +@pytest.mark.parametrize("warn", [True, False]) +def test_eval_mode_warning(tmp_path, warn): + """Test that a warning is raised if any module is in eval mode at the start of training.""" + model = BoringModel() + if warn: + model.some_eval_module = torch.nn.Linear(32, 16) + model.some_eval_module.eval() + + trainer = Trainer( + default_root_dir=tmp_path, + max_epochs=1, + ) + + if warn: + with pytest.warns(PossibleUserWarning): + trainer.fit(model) + else: + with warnings.catch_warnings(record=True) as warning_list: + warnings.simplefilter("always") + trainer.fit(model) + eval_warnings = [ + w for w in warning_list if issubclass(w.category, PossibleUserWarning) and "eval mode" in str(w.message) + ] + assert len(eval_warnings) == 0, "Expected no eval mode warnings" From a2005ebea7790d966d979894062c8fb60eb1f4f3 Mon Sep 17 00:00:00 2001 From: Shion Matsumoto Date: Tue, 2 Sep 2025 05:15:42 -0400 Subject: [PATCH 22/40] Fix workflow matrix reference (#21145) Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit c3ca8a516ff4467ae1ed31a6bcbf1d69ac474635) --- .github/workflows/ci-tests-fabric.yml | 4 ++-- .github/workflows/ci-tests-pytorch.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 939f986cc187a..abfe0e4e8881a 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -79,7 +79,7 @@ jobs: run: pip install -q -r .actions/requirements.txt - name: Set min. dependencies - if: ${{ matrix.requires == 'oldest' }} + if: ${{ matrix.config.requires == 'oldest' }} run: | cd requirements/fabric pip install -U "lightning-utilities[cli]" @@ -88,7 +88,7 @@ jobs: pip install "pyyaml==5.4" --no-build-isolation - name: Adjust PyTorch versions in requirements files - if: ${{ matrix.requires != 'oldest' }} + if: ${{ matrix.config.requires != 'oldest' }} run: | pip install -q -r requirements/ci.txt python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 011cd0b6a9cbc..424cb3375f51a 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -84,7 +84,7 @@ jobs: run: pip install -q -r .actions/requirements.txt - name: Set min. dependencies - if: ${{ matrix.requires == 'oldest' }} + if: ${{ matrix.config.requires == 'oldest' }} run: | cd requirements/pytorch pip install -U "lightning-utilities[cli]" @@ -93,7 +93,7 @@ jobs: pip install "pyyaml==5.4" --no-build-isolation - name: Adjust PyTorch versions in requirements files - if: ${{ matrix.requires != 'oldest' }} + if: ${{ matrix.config.requires != 'oldest' }} run: | pip install -q -r requirements/ci.txt python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py From 289f0c4b7e330f9b3018cc278494ed1c63a2bb9c Mon Sep 17 00:00:00 2001 From: Mauricio Villegas <5780272+mauvilsa@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:18:04 +0200 Subject: [PATCH 23/40] Fix `LightningCLI` not using `ckpt_path` hyperparameters to instantiate classes (#21116) * Fix LightningCLI not using ckpt_path hyperparameters to instantiate classes * Add message about ckpt_path hyperparameters when parsing fails * Apply suggestions from code review --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Nicki Skafte Detlefsen Co-authored-by: Jirka B (cherry picked from commit 4824cc15d3aba166704f318414a46a1418c3be88) --- src/lightning/pytorch/CHANGELOG.md | 3 +++ src/lightning/pytorch/cli.py | 20 +++++++++++++++ tests/tests_pytorch/test_cli.py | 41 +++++++++++++++++++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 4cc8dbcf51d3d..41462f9769c46 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -17,6 +17,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed `LightningCLI` not using `ckpt_path` hyperparameters to instantiate classes ([#21116](https://github.com/Lightning-AI/pytorch-lightning/pull/21116)) + + - Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106)) diff --git a/src/lightning/pytorch/cli.py b/src/lightning/pytorch/cli.py index 225296240674a..91247127f6c87 100644 --- a/src/lightning/pytorch/cli.py +++ b/src/lightning/pytorch/cli.py @@ -16,6 +16,7 @@ import sys from collections.abc import Iterable from functools import partial, update_wrapper +from pathlib import Path from types import MethodType from typing import Any, Callable, Optional, TypeVar, Union @@ -397,6 +398,7 @@ def __init__( main_kwargs, subparser_kwargs = self._setup_parser_kwargs(self.parser_kwargs) self.setup_parser(run, main_kwargs, subparser_kwargs) self.parse_arguments(self.parser, args) + self._parse_ckpt_path() self.subcommand = self.config["subcommand"] if run else None @@ -551,6 +553,24 @@ def parse_arguments(self, parser: LightningArgumentParser, args: ArgsType) -> No else: self.config = parser.parse_args(args) + def _parse_ckpt_path(self) -> None: + """If a checkpoint path is given, parse the hyperparameters from the checkpoint and update the config.""" + if not self.config.get("subcommand"): + return + ckpt_path = self.config[self.config.subcommand].get("ckpt_path") + if ckpt_path and Path(ckpt_path).is_file(): + ckpt = torch.load(ckpt_path, weights_only=True, map_location="cpu") + hparams = ckpt.get("hyper_parameters", {}) + hparams.pop("_instantiator", None) + if not hparams: + return + hparams = {self.config.subcommand: {"model": hparams}} + try: + self.config = self.parser.parse_object(hparams, self.config) + except SystemExit: + sys.stderr.write("Parsing of ckpt_path hyperparameters failed!\n") + raise + def _dump_config(self) -> None: if hasattr(self, "config_dump"): return diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 1b883dda0282a..50e6a6f721ea1 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -17,7 +17,7 @@ import operator import os import sys -from contextlib import ExitStack, contextmanager, redirect_stdout +from contextlib import ExitStack, contextmanager, redirect_stderr, redirect_stdout from io import StringIO from pathlib import Path from typing import Callable, Optional, Union @@ -487,6 +487,45 @@ def test_lightning_cli_print_config(): assert outval["ckpt_path"] is None +class BoringCkptPathModel(BoringModel): + def __init__(self, out_dim: int = 2, hidden_dim: int = 2) -> None: + super().__init__() + self.save_hyperparameters() + self.layer = torch.nn.Linear(32, out_dim) + + +def test_lightning_cli_ckpt_path_argument_hparams(cleandir): + class CkptPathCLI(LightningCLI): + def add_arguments_to_parser(self, parser): + parser.link_arguments("model.out_dim", "model.hidden_dim", compute_fn=lambda x: x * 2) + + cli_args = ["fit", "--model.out_dim=3", "--trainer.max_epochs=1"] + with mock.patch("sys.argv", ["any.py"] + cli_args): + cli = CkptPathCLI(BoringCkptPathModel) + + assert cli.config.fit.model.out_dim == 3 + assert cli.config.fit.model.hidden_dim == 6 + hparams_path = Path(cli.trainer.log_dir) / "hparams.yaml" + assert hparams_path.is_file() + hparams = yaml.safe_load(hparams_path.read_text()) + assert hparams["out_dim"] == 3 + assert hparams["hidden_dim"] == 6 + + checkpoint_path = next(Path(cli.trainer.log_dir, "checkpoints").glob("*.ckpt")) + cli_args = ["predict", f"--ckpt_path={checkpoint_path}"] + with mock.patch("sys.argv", ["any.py"] + cli_args): + cli = CkptPathCLI(BoringCkptPathModel) + + assert cli.config.predict.model.out_dim == 3 + assert cli.config.predict.model.hidden_dim == 6 + assert cli.config_init.predict.model.layer.out_features == 3 + + err = StringIO() + with mock.patch("sys.argv", ["any.py"] + cli_args), redirect_stderr(err), pytest.raises(SystemExit): + cli = LightningCLI(BoringModel) + assert "Parsing of ckpt_path hyperparameters failed" in err.getvalue() + + def test_lightning_cli_submodules(cleandir): class MainModule(BoringModel): def __init__(self, submodule1: LightningModule, submodule2: LightningModule, main_param: int = 1): From 74e94877ef598b2b289c89ae8e8aa2cb4069add0 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Tue, 2 Sep 2025 16:15:38 +0200 Subject: [PATCH 24/40] Add support for deepspeeds `exclude_frozen_parameters` (#21060) * add to deepspeed strategies * add testing * changelog * GLOO_SOCKET_IFNAME --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Jirka B (cherry picked from commit 5071a04ae5e06a52e438e5be0b40cf188bf87b57) --- .github/workflows/ci-tests-pytorch.yml | 4 ++ src/lightning/fabric/CHANGELOG.md | 4 +- src/lightning/fabric/strategies/deepspeed.py | 8 +++- src/lightning/pytorch/CHANGELOG.md | 4 ++ src/lightning/pytorch/strategies/deepspeed.py | 11 ++++- .../tests_fabric/strategies/test_deepspeed.py | 29 +++++++++++++- .../strategies/test_deepspeed.py | 40 +++++++++++++++++++ 7 files changed, 94 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 424cb3375f51a..c92e4cd9d0a12 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -83,6 +83,10 @@ jobs: - name: basic setup run: pip install -q -r .actions/requirements.txt + - name: Append Env. vars for Linux + if: ${{ runner.os == 'Linux' }} + run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV + - name: Set min. dependencies if: ${{ matrix.config.requires == 'oldest' }} run: | diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 54acb25eaf682..2020d560d1760 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -10,7 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- Include `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060)) ### Fixed @@ -57,7 +57,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed -- Removed legacy support for `lightning run model`. Use `fabric run` instead ([#20588](https://github.com/Lightning-AI/pytorch-lightning/pull/20588)) +- Removed legacy support for `lightning run model`; use `fabric run` instead ([#20588](https://github.com/Lightning-AI/pytorch-lightning/pull/20588)) ## [2.5.0] - 2024-12-19 diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 322fa1899b0ee..d21182b525f66 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -99,6 +99,7 @@ def __init__( precision: Optional[Precision] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, + exclude_frozen_parameters: bool = False, ) -> None: """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. `For more information: https://pytorch- @@ -228,6 +229,8 @@ def __init__( when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards per worker. + exclude_frozen_parameters: Exclude frozen parameters when saving checkpoints. + """ if not _DEEPSPEED_AVAILABLE: raise ImportError( @@ -288,6 +291,7 @@ def __init__( self.remote_device = remote_device self.load_full_weights = load_full_weights + self.exclude_frozen_parameters = exclude_frozen_parameters # default FP16 parameters. self.loss_scale = loss_scale @@ -444,7 +448,9 @@ def save_checkpoint( # there might be other stateful objects unrelated to the deepspeed engine - convert them to a state_dict state = self._convert_stateful_objects_in_state(state, filter={}) # use deepspeed's internal checkpointing function to handle partitioned weights across processes - engine.save_checkpoint(path, client_state=state, tag="checkpoint") + engine.save_checkpoint( + path, client_state=state, tag="checkpoint", exclude_frozen_parameters=self.exclude_frozen_parameters + ) @override def load_checkpoint( diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 41462f9769c46..97e4e13a31a16 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -8,8 +8,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [unReleased] - 2025-09-DD +- Added `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060)) + + - Added `PossibleUserWarning` that is raised if modules are in eval mode when training starts ([#21146](https://github.com/Lightning-AI/pytorch-lightning/pull/21146)) + ### Changed - diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index dabfde70242b9..261b24545314f 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -122,6 +122,7 @@ def __init__( precision_plugin: Optional[Precision] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, + exclude_frozen_parameters: bool = False, ) -> None: """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. `For more information: https://pytorch- @@ -253,6 +254,8 @@ def __init__( when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards per worker. + exclude_frozen_parameters: Exclude frozen parameters when saving checkpoints. + """ if not _DEEPSPEED_AVAILABLE: raise MisconfigurationException( @@ -311,6 +314,7 @@ def __init__( self.remote_device = remote_device self.load_full_weights = load_full_weights + self.exclude_frozen_parameters = exclude_frozen_parameters # default FP16 parameters. self.loss_scale = loss_scale @@ -648,7 +652,12 @@ def save_checkpoint(self, checkpoint: dict, filepath: _PATH, storage_options: Op # dump states as a checkpoint dictionary object _exclude_keys = ["state_dict", "optimizer_states"] checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys} - self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint") + self.deepspeed_engine.save_checkpoint( + filepath, + client_state=checkpoint, + tag="checkpoint", + exclude_frozen_parameters=self.exclude_frozen_parameters, + ) @override def load_checkpoint(self, checkpoint_path: _PATH) -> dict[str, Any]: diff --git a/tests/tests_fabric/strategies/test_deepspeed.py b/tests/tests_fabric/strategies/test_deepspeed.py index 032ee63cd4721..817121c9e2f59 100644 --- a/tests/tests_fabric/strategies/test_deepspeed.py +++ b/tests/tests_fabric/strategies/test_deepspeed.py @@ -193,7 +193,9 @@ def test_deepspeed_save_checkpoint_client_state_separation(tmp_path): model.modules.return_value = [model] strategy.save_checkpoint(path=tmp_path, state={"model": model, "test": "data"}) # the client_state should not contain any deepspeed engine or deepspeed optimizer - model.save_checkpoint.assert_called_with(tmp_path, client_state={"test": "data"}, tag="checkpoint") + model.save_checkpoint.assert_called_with( + tmp_path, client_state={"test": "data"}, tag="checkpoint", exclude_frozen_parameters=False + ) # Model and optimizer optimizer = Mock() @@ -201,7 +203,9 @@ def test_deepspeed_save_checkpoint_client_state_separation(tmp_path): model.modules.return_value = [model] strategy.save_checkpoint(path=tmp_path, state={"model": model, "optimizer": optimizer, "test": "data"}) # the client_state should not contain any deepspeed engine or deepspeed optimizer - model.save_checkpoint.assert_called_with(tmp_path, client_state={"test": "data"}, tag="checkpoint") + model.save_checkpoint.assert_called_with( + tmp_path, client_state={"test": "data"}, tag="checkpoint", exclude_frozen_parameters=False + ) @RunIf(deepspeed=True) @@ -218,6 +222,27 @@ def test_deepspeed_save_checkpoint_warn_colliding_keys(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model": model, "optimizer": optimizer, "mp_world_size": 2}) +@RunIf(deepspeed=True) +@pytest.mark.parametrize("exclude_frozen_parameters", [True, False]) +def test_deepspeed_save_checkpoint_exclude_frozen_parameters(exclude_frozen_parameters): + """Test that the DeepSpeed strategy can save checkpoints with the `exclude_frozen_parameters` argument.""" + from deepspeed import DeepSpeedEngine + + strategy = DeepSpeedStrategy(exclude_frozen_parameters=exclude_frozen_parameters) + assert strategy.exclude_frozen_parameters is exclude_frozen_parameters + + model = Mock(spec=DeepSpeedEngine, optimizer=None) + model.modules.return_value = [model] + strategy.save_checkpoint(path="test_path", state={"model": model, "extra": "data"}) + + model.save_checkpoint.assert_called_with( + "test_path", + client_state={"extra": "data"}, + tag="checkpoint", + exclude_frozen_parameters=exclude_frozen_parameters, + ) + + @RunIf(deepspeed=True) def test_deepspeed_load_checkpoint_validate_path(tmp_path): """Test that we validate the checkpoint path for a DeepSpeed checkpoint and give suggestions for user error.""" diff --git a/tests/tests_pytorch/strategies/test_deepspeed.py b/tests/tests_pytorch/strategies/test_deepspeed.py index 7e7d2eacd0617..503d1ea0e630b 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed.py +++ b/tests/tests_pytorch/strategies/test_deepspeed.py @@ -562,6 +562,46 @@ def test_deepspeed_multigpu_single_file(tmp_path): trainer.test(model, ckpt_path=checkpoint_path) +@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) +def test_deepspeed_strategy_exclude_frozen_parameters_integration(tmp_path): + """Test end-to-end integration of exclude_frozen_parameters with actual model training and checkpointing.""" + + class TestModelWithFrozenParams(BoringModel): + def __init__(self): + super().__init__() + self.frozen_layer = torch.nn.Linear(32, 32) + + def configure_model(self) -> None: + super().configure_model() + # Freeze the additional layer parameters + for param in self.frozen_layer.parameters(): + param.requires_grad = False + + def forward(self, x): + x = self.frozen_layer(x) + return super().forward(x) + + model = TestModelWithFrozenParams() + + trainer = Trainer( + default_root_dir=tmp_path, + strategy=DeepSpeedStrategy(exclude_frozen_parameters=True), + accelerator="gpu", + devices=1, + fast_dev_run=True, + precision="16-mixed", + enable_progress_bar=False, + enable_model_summary=False, + ) + + trainer.fit(model) + checkpoint_path = os.path.join(tmp_path, "checkpoint_exclude_frozen.ckpt") + trainer.save_checkpoint(checkpoint_path) + + # Verify checkpoint was created + assert os.path.exists(checkpoint_path) + + class ModelParallelClassificationModel(LightningModule): def __init__(self, lr: float = 0.01, num_blocks: int = 5): super().__init__() From 257db2bd609e8ad808ea83e02b226df09320a98c Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Tue, 2 Sep 2025 18:20:52 +0200 Subject: [PATCH 25/40] Impr docs on batch sizes and limits in distributed (#21070) * improve gpu_faq * add per device text * header * Apply suggestions from code review --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit d3996ad32e636a9e45316dbfe716354fb7f37685) --- docs/source-pytorch/accelerators/gpu_faq.rst | 68 +++++++++++++++---- docs/source-pytorch/common/trainer.rst | 4 +- docs/source-pytorch/expertise_levels.rst | 2 +- docs/source-pytorch/levels/intermediate.rst | 2 +- .../levels/intermediate_level_7.rst | 6 +- src/lightning/pytorch/trainer/trainer.py | 8 +-- 6 files changed, 66 insertions(+), 24 deletions(-) diff --git a/docs/source-pytorch/accelerators/gpu_faq.rst b/docs/source-pytorch/accelerators/gpu_faq.rst index 4cc05555bb559..346843e316bce 100644 --- a/docs/source-pytorch/accelerators/gpu_faq.rst +++ b/docs/source-pytorch/accelerators/gpu_faq.rst @@ -5,31 +5,71 @@ GPU training (FAQ) ================== -****************************************************************** -How should I adjust the learning rate when using multiple devices? -****************************************************************** +*************************************************************** +How should I adjust the batch size when using multiple devices? +*************************************************************** -When using distributed training make sure to modify your learning rate according to your effective -batch size. +Lightning automatically shards your data across multiple GPUs, meaning that each device only sees a unique subset of your +data, but the `batch_size` in your DataLoader remains the same. This means that the effective batch size e.g. the +total number of samples processed in one forward/backward pass is -Let's say you have a batch size of 7 in your dataloader. +.. math:: -.. testcode:: + \text{Effective Batch Size} = \text{DataLoader Batch Size} \times \text{Number of Devices} \times \text{Number of Nodes} - class LitModel(LightningModule): - def train_dataloader(self): - return Dataset(..., batch_size=7) - -Whenever you use multiple devices and/or nodes, your effective batch size will be 7 * devices * num_nodes. +A couple of examples to illustrate this: .. code-block:: python - # effective batch size = 7 * 8 + dataloader = DataLoader(..., batch_size=7) + + # Single GPU: effective batch size = 7 + Trainer(accelerator="gpu", devices=1) + + # Multi-GPU: effective batch size = 7 * 8 = 56 Trainer(accelerator="gpu", devices=8, strategy=...) - # effective batch size = 7 * 8 * 10 + # Multi-node: effective batch size = 7 * 8 * 10 = 560 Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy=...) +In general you should be able to use the same `batch_size` in your DataLoader regardless of the number of devices you are +using. + +.. note:: + + If you want distributed training to work exactly the same as single GPU training, you need to set the `batch_size` + in your DataLoader to `original_batch_size / num_devices` to maintain the same effective batch size. However, this + can lead to poor GPU utilization. + +---- + +****************************************************************** +How should I adjust the learning rate when using multiple devices? +****************************************************************** + +Because the effective batch size is larger when using multiple devices, you need to adjust your learning rate +accordingly. Because the learning rate is a hyperparameter that controls how much to change the model in response to +the estimated error each time the model weights are updated, it is important to scale it with the effective batch size. + +In general, there are two common scaling rules: + +1. **Linear scaling**: Increase the learning rate linearly with the number of devices. + + .. code-block:: python + + # Example: Linear scaling + base_lr = 1e-3 + num_devices = 8 + scaled_lr = base_lr * num_devices # 8e-3 + +2. **Square root scaling**: Increase the learning rate by the square root of the number of devices. + + .. code-block:: python + + # Example: Square root scaling + base_lr = 1e-3 + num_devices = 8 + scaled_lr = base_lr * (num_devices ** 0.5) # 2.83e-3 .. note:: Huge batch sizes are actually really bad for convergence. Check out: `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour `_ diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 86ee52f41f0c9..a81934c104b81 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -510,6 +510,7 @@ limit_train_batches How much of training dataset to check. Useful when debugging or testing something that happens at the end of an epoch. +Value is per device. .. testcode:: @@ -535,7 +536,7 @@ limit_test_batches :width: 400 :muted: -How much of test dataset to check. +How much of test dataset to check. Value is per device. .. testcode:: @@ -560,6 +561,7 @@ limit_val_batches How much of validation dataset to check. Useful when debugging or testing something that happens at the end of an epoch. +Value is per device. .. testcode:: diff --git a/docs/source-pytorch/expertise_levels.rst b/docs/source-pytorch/expertise_levels.rst index 7c34123f5b3f7..672ccf48ab7eb 100644 --- a/docs/source-pytorch/expertise_levels.rst +++ b/docs/source-pytorch/expertise_levels.rst @@ -84,7 +84,7 @@ Learn to scale up your models and enable collaborative model development at acad .. Add callout items below this line .. displayitem:: - :header: Level 7: Interactive cloud development + :header: Level 7: Hardware acceleration :description: Learn how to access GPUs and TPUs on the cloud. :button_link: levels/intermediate_level_7.html :col_css: col-md-6 diff --git a/docs/source-pytorch/levels/intermediate.rst b/docs/source-pytorch/levels/intermediate.rst index 282ac7bc98c90..797cfbdf0c2e2 100644 --- a/docs/source-pytorch/levels/intermediate.rst +++ b/docs/source-pytorch/levels/intermediate.rst @@ -16,7 +16,7 @@ Learn to scale up your models and enable collaborative model development at acad .. Add callout items below this line .. displayitem:: - :header: Level 7: Interactive cloud development + :header: Level 7: Hardware acceleration :description: Learn how to access GPUs and TPUs on the cloud. :button_link: intermediate_level_7.html :col_css: col-md-6 diff --git a/docs/source-pytorch/levels/intermediate_level_7.rst b/docs/source-pytorch/levels/intermediate_level_7.rst index ef4122d0d3150..660fd9962c8d2 100644 --- a/docs/source-pytorch/levels/intermediate_level_7.rst +++ b/docs/source-pytorch/levels/intermediate_level_7.rst @@ -1,8 +1,8 @@ :orphan: -###################################### -Level 7: Interactive cloud development -###################################### +############################## +Level 7: Hardware acceleration +############################## Learn to develop models on cloud GPUs and TPUs. diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index b78843990af30..f3d45a17d4a3b 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -184,16 +184,16 @@ def __init__( :class:`datetime.timedelta`. limit_train_batches: How much of training dataset to check (float = fraction, int = num_batches). - Default: ``1.0``. + Value is per device. Default: ``1.0``. limit_val_batches: How much of validation dataset to check (float = fraction, int = num_batches). - Default: ``1.0``. + Value is per device. Default: ``1.0``. limit_test_batches: How much of test dataset to check (float = fraction, int = num_batches). - Default: ``1.0``. + Value is per device. Default: ``1.0``. limit_predict_batches: How much of prediction dataset to check (float = fraction, int = num_batches). - Default: ``1.0``. + Value is per device. Default: ``1.0``. overfit_batches: Overfit a fraction of training/validation data (float) or a set number of batches (int). Default: ``0.0``. From b2ef2e3cc43b0480ade16f5aee1fbbd579c257dc Mon Sep 17 00:00:00 2001 From: Bas Krahmer Date: Tue, 2 Sep 2025 23:34:18 +0200 Subject: [PATCH 26/40] Fix: `no_grad` with AMP bug (#20921) * Disable cache for torch.autocast in amp * Add a test * Only test for bf16-mixed * Implement test to reproduce the issue --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 216f9ec90c5bf3554f7cf484accee325f2a15440) --- src/lightning/pytorch/plugins/precision/amp.py | 3 ++- .../plugins/precision/test_amp.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/plugins/precision/amp.py b/src/lightning/pytorch/plugins/precision/amp.py index 75e792af46b90..5ea62233e1f69 100644 --- a/src/lightning/pytorch/plugins/precision/amp.py +++ b/src/lightning/pytorch/plugins/precision/amp.py @@ -112,7 +112,8 @@ def clip_gradients( super().clip_gradients(optimizer=optimizer, clip_val=clip_val, gradient_clip_algorithm=gradient_clip_algorithm) def autocast_context_manager(self) -> torch.autocast: - return torch.autocast(self.device, dtype=(torch.bfloat16 if self.precision == "bf16-mixed" else torch.half)) + dtype = torch.bfloat16 if self.precision == "bf16-mixed" else torch.half + return torch.autocast(self.device, dtype=dtype, cache_enabled=False) @override @contextmanager diff --git a/tests/tests_pytorch/plugins/precision/test_amp.py b/tests/tests_pytorch/plugins/precision/test_amp.py index cb061c540b2be..3894c4256e0b8 100644 --- a/tests/tests_pytorch/plugins/precision/test_amp.py +++ b/tests/tests_pytorch/plugins/precision/test_amp.py @@ -14,6 +14,8 @@ from unittest.mock import Mock import pytest +import torch +from torch import nn from torch.optim import Optimizer from lightning.pytorch.plugins import MixedPrecision @@ -51,3 +53,19 @@ def test_optimizer_amp_scaling_support_in_step_method(): with pytest.raises(RuntimeError, match="The current optimizer.*does not allow for gradient clipping"): precision.clip_gradients(optimizer, clip_val=1.0) + + +def test_amp_with_no_grad(): + """Test that asserts using `no_grad` context wrapper with a persistent AMP context wrapper does not break gradient + tracking.""" + layer = nn.Linear(2, 1) + x = torch.randn(1, 2) + amp = MixedPrecision(precision="bf16-mixed", device="cpu") + + with amp.autocast_context_manager(): + with torch.no_grad(): + _ = layer(x) + + loss = layer(x).mean() + loss.backward() + assert loss.grad_fn is not None From 052836a0f4c0d43a476271fc558dbe6865a22034 Mon Sep 17 00:00:00 2001 From: GdoongMathew Date: Wed, 3 Sep 2025 14:05:23 +0800 Subject: [PATCH 27/40] docs: update `setup` and `restore`. (#21149) (cherry picked from commit 8ea61650298b2b3fa37f70ed3f4a92d4b030db5c) --- docs/source-pytorch/common/hooks.rst | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/docs/source-pytorch/common/hooks.rst b/docs/source-pytorch/common/hooks.rst index 4c20ad1b857f0..89c1c15d0413f 100644 --- a/docs/source-pytorch/common/hooks.rst +++ b/docs/source-pytorch/common/hooks.rst @@ -83,13 +83,30 @@ with the source of each hook indicated: trainer.fit() │ ├── setup(stage="fit") - │ └── [Callbacks only] - │ - ├── on_fit_start() + │ ├── [LightningDataModule] │ ├── [Callbacks] │ ├── [LightningModule] + │ ├── [LightningModule.configure_shared_model()] + │ ├── [LightningModule.configure_model()] + │ ├── Strategy.restore_checkpoint_before_setup + │ │ ├── [LightningModule.on_load_checkpoint()] + │ │ ├── [LightningModule.load_state_dict()] + │ │ ├── [LightningDataModule.load_state_dict()] + │ │ ├── [Callbacks.on_load_checkpoint()] + │ │ └── [Callbacks.load_state_dict()] │ └── [Strategy] │ + ├── on_fit_start() + │ ├── [Callbacks] + │ └── [LightningModule] + │ + ├── Strategy.restore_checkpoint_after_setup + │ ├── [LightningModule.on_load_checkpoint()] + │ ├── [LightningModule.load_state_dict()] + │ ├── [LightningDataModule.load_state_dict()] + │ ├── [Callbacks.on_load_checkpoint()] + │ └── [Callbacks.load_state_dict()] + │ ├── on_sanity_check_start() │ ├── [Callbacks] │ ├── [LightningModule] From b28f2c824697e94ad4914fc59b77793dcae40756 Mon Sep 17 00:00:00 2001 From: Kavyansh Tyagi <142140238+KAVYANSHTYAGI@users.noreply.github.com> Date: Wed, 3 Sep 2025 12:22:45 +0530 Subject: [PATCH 28/40] =?UTF-8?q?docs:=20warning=20don=E2=80=99t=20use=20`?= =?UTF-8?q?torch.profiler.profile`=20context=20manager=20around=20`Trainer?= =?UTF-8?q?`=20methods=20(#20864)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update profiler.rst * move documentation to basic --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nicki Skafte Detlefsen Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 04e103be545d813320d0ff7a9fdf6034a0e57782) --- docs/source-pytorch/tuning/profiler_basic.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source-pytorch/tuning/profiler_basic.rst b/docs/source-pytorch/tuning/profiler_basic.rst index 880381268fa78..01f4d8a51daaf 100644 --- a/docs/source-pytorch/tuning/profiler_basic.rst +++ b/docs/source-pytorch/tuning/profiler_basic.rst @@ -121,3 +121,22 @@ This can be measured with the :class:`~lightning.pytorch.callbacks.device_stats_ CPU metrics will be tracked by default on the CPU accelerator. To enable it for other accelerators set ``DeviceStatsMonitor(cpu_stats=True)``. To disable logging CPU metrics, you can specify ``DeviceStatsMonitor(cpu_stats=False)``. + +.. warning:: + + **Do not wrap** ``Trainer.fit()``, ``Trainer.validate()``, or other Trainer methods inside a manual + ``torch.profiler.profile`` context manager. This will cause unexpected crashes and cryptic errors due to + incompatibility between PyTorch Profiler's context management and Lightning's internal training loop. + Instead, always use the ``profiler`` argument in the ``Trainer`` constructor or the + :class:`~lightning.pytorch.profilers.pytorch.PyTorchProfiler` profiler class if you want to customize the profiling. + + Example: + + .. code-block:: python + + from lightning.pytorch import Trainer + from lightning.pytorch.profilers import PytorchProfiler + + trainer = Trainer(profiler="pytorch") + # or + trainer = Trainer(profiler=PytorchProfiler(dirpath=".", filename="perf_logs")) From 795930a8e2fd08f06de70063015ed472c1160e98 Mon Sep 17 00:00:00 2001 From: Shion Matsumoto Date: Wed, 3 Sep 2025 10:02:35 +0200 Subject: [PATCH 29/40] `uv` for pytorch tests (#21148) * swap to uv * no pip cache * enable caching. use venv. * activate environment * fix uninstall * replace pip with uv * activate venv * --find-links=... * GLOO_SOCKET_IFNAME --------- Co-authored-by: Nicki Skafte Detlefsen Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Jirka B (cherry picked from commit 29e8ce4b8ab44e6c72f37634a5a29eb166f866eb) --- .github/workflows/_legacy-checkpoints.yml | 16 ++++--- .github/workflows/ci-tests-pytorch.yml | 52 ++++++++++++----------- tests/legacy/generate_checkpoints.sh | 19 ++++----- 3 files changed, 46 insertions(+), 41 deletions(-) diff --git a/.github/workflows/_legacy-checkpoints.yml b/.github/workflows/_legacy-checkpoints.yml index 8cceb5870af8d..6c37281fc04ec 100644 --- a/.github/workflows/_legacy-checkpoints.yml +++ b/.github/workflows/_legacy-checkpoints.yml @@ -57,28 +57,32 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: actions/setup-python@v5 + - name: Install uv and set Python version + uses: astral-sh/setup-uv@v6 with: - # Python version here needs to be supported by all PL versions listed in back-compatible-versions.txt. python-version: "3.9" + # TODO: Avoid activating environment like this + # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment + activate-environment: true + enable-cache: true - name: Install PL from source env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 timeout-minutes: 20 - run: pip install . --extra-index-url="${TORCH_URL}" + run: uv pip install . --extra-index-url="${TORCH_URL}" if: inputs.pl_version == '' - name: Install PL version timeout-minutes: 20 - run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}" + run: uv pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}" if: inputs.pl_version != '' - name: Adjust tests -> PL if: ${{ matrix.pkg-name != 'lightning' }} run: | - pip install -q -r .actions/requirements.txt + uv pip install -q -r .actions/requirements.txt python .actions/assistant.py copy_replace_imports --source_dir="./tests" \ --source_import="lightning.fabric,lightning.pytorch" \ --target_import="lightning_fabric,pytorch_lightning" @@ -115,7 +119,7 @@ jobs: # export to env bool if secrets.AWS_REGION is not empty run: echo "WITH_SECRETS=$([ -n '${{ secrets.AWS_REGION }}' ] && echo 1 || echo 0)" >> $GITHUB_ENV - - run: pip install -r requirements/ci.txt + - run: uv pip install -r requirements/ci.txt - name: Upload checkpoints to S3 if: ${{ env.WITH_SECRETS == '1' }} working-directory: ${{ env.LEGACY_FOLDER }} diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index c92e4cd9d0a12..e4ec99849cb41 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -69,52 +69,49 @@ jobs: TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - PYPI_CACHE_DIR: "_pip-wheels" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - uses: actions/checkout@v5 - - name: Set up Python ${{ matrix.config.python-version }} - uses: actions/setup-python@v5 + - name: Install uv and set Python version + uses: astral-sh/setup-uv@v6 with: python-version: ${{ matrix.config.python-version || '3.9' }} + # TODO: Avoid activating environment like this + # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment + activate-environment: true + enable-cache: true - - name: basic setup - run: pip install -q -r .actions/requirements.txt + - name: Basic setup + run: uv pip install -q -r .actions/requirements.txt - name: Append Env. vars for Linux if: ${{ runner.os == 'Linux' }} run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV + - name: Append Env. vars for MacOS + if: ${{ runner.os == 'macOS' }} + run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV - name: Set min. dependencies if: ${{ matrix.config.requires == 'oldest' }} run: | cd requirements/pytorch - pip install -U "lightning-utilities[cli]" + uv pip install -U "lightning-utilities[cli]" python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt', 'test.txt']" - pip install "cython<3.0" wheel - pip install "pyyaml==5.4" --no-build-isolation + uv pip install "cython<3.0" wheel + uv pip install "pyyaml==5.4" --no-build-isolation - name: Adjust PyTorch versions in requirements files if: ${{ matrix.config.requires != 'oldest' }} run: | - pip install -q -r requirements/ci.txt + uv pip install -q -r requirements/ci.txt python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py for fpath in `ls requirements/**/*.txt`; do \ python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \ done cat requirements/pytorch/base.txt - - name: pip wheels cache - uses: actions/cache/restore@v4 - with: - path: ${{ env.PYPI_CACHE_DIR }} - key: pypi_wheels - - run: | - mkdir -p $PYPI_CACHE_DIR - ls -lh $PYPI_CACHE_DIR - - name: Env. variables run: | # Switch PyTorch URL between stable and test/future @@ -129,19 +126,21 @@ jobs: - name: Install package & dependencies timeout-minutes: 20 run: | - pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \ - -U --upgrade-strategy=eager --prefer-binary \ + uv pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \ + --upgrade \ -r requirements/_integrations/accelerators.txt \ - --extra-index-url="${TORCH_URL}" \ - --find-links="${PYPI_CACHE_DIR}" - pip list + --find-links="${TORCH_URL}" + uv pip list + - name: Drop LAI from extensions if: ${{ matrix.config.pkg-name != 'lightning' }} # Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it - run: pip uninstall -y lightning + run: uv pip uninstall lightning + - name: Drop PL for LAI if: ${{ matrix.config.pkg-name == 'lightning' }} - run: pip uninstall -y pytorch-lightning + run: uv pip uninstall pytorch-lightning + - name: Dump handy wheels if: github.event_name == 'push' && github.ref == 'refs/heads/master' continue-on-error: true @@ -218,6 +217,9 @@ jobs: name: CPU-coverage fail_ci_if_error: false + - name: Minimize uv cache + run: uv cache prune --ci + pl-cpu-guardian: runs-on: ubuntu-latest needs: pl-cpu diff --git a/tests/legacy/generate_checkpoints.sh b/tests/legacy/generate_checkpoints.sh index 1d083a2a8e052..d3cfa693c2906 100644 --- a/tests/legacy/generate_checkpoints.sh +++ b/tests/legacy/generate_checkpoints.sh @@ -7,18 +7,17 @@ set -e LEGACY_FOLDER=$(cd $(dirname $0); pwd -P) -printf "LEGACY_FOLDER: $LEGACY_FOLDER" +printf "LEGACY_FOLDER: $LEGACY_FOLDER\n" TESTS_FOLDER=$(dirname $LEGACY_FOLDER) -ENV_PATH=$LEGACY_FOLDER/vEnv -printf "ENV_PATH: $ENV_PATH" +ENV_PATH=$LEGACY_FOLDER/.venv +printf "ENV_PATH: $ENV_PATH\n" export PYTHONPATH=$TESTS_FOLDER # for `import tests_pytorch` -printf "PYTHONPATH: $PYTHONPATH" +printf "PYTHONPATH: $PYTHONPATH\n" rm -rf $ENV_PATH function create_and_save_checkpoint { - python --version - python -m pip --version - python -m pip list + uv --version + uv pip list python $LEGACY_FOLDER/simple_classif_training.py $pl_ver @@ -33,10 +32,10 @@ do printf "\n\n processing version: $pl_ver\n" # Don't install/update anything before activating venv to avoid breaking any existing environment. - python -m venv $ENV_PATH + uv venv $ENV_PATH source $ENV_PATH/bin/activate - python -m pip install "pytorch_lightning==$pl_ver" \ + uv pip install "pytorch_lightning==$pl_ver" \ -r $LEGACY_FOLDER/requirements.txt \ -r "$(dirname $TESTS_FOLDER)/requirements/pytorch/test.txt" \ -f https://download.pytorch.org/whl/cpu/torch_stable.html @@ -52,7 +51,7 @@ done if [[ -z "$@" ]]; then printf "\n\n processing local version\n" - python -m pip install \ + uv pip install \ -r $LEGACY_FOLDER/requirements.txt \ -r "$(dirname $TESTS_FOLDER)/requirements/pytorch/test.txt" \ -f https://download.pytorch.org/whl/cpu/torch_stable.html From ae3658723b6854b4aa8114de1b97412bdf5b1b10 Mon Sep 17 00:00:00 2001 From: Anay Dongre <56868253+MrAnayDongre@users.noreply.github.com> Date: Wed, 3 Sep 2025 01:32:13 -0700 Subject: [PATCH 30/40] docs: clarify `DeviceStatsMonitor` logged metrics (#20895) * DOC: Clarify DeviceStatsMonitor logged metrics (#20807) * update * use nested list * Apply suggestions from code review --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka B Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Nicki Skafte (cherry picked from commit 9f757c0155463d5b08b6677b8b637df22e772a80) --- .../pytorch/callbacks/device_stats_monitor.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py index 6279dd13be4af..873c4c05f5aed 100644 --- a/src/lightning/pytorch/callbacks/device_stats_monitor.py +++ b/src/lightning/pytorch/callbacks/device_stats_monitor.py @@ -34,6 +34,67 @@ class DeviceStatsMonitor(Callback): r"""Automatically monitors and logs device stats during training, validation and testing stage. ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. + **Logged Metrics** + + Logs device statistics with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}``. + The actual metrics depend on the active accelerator and the ``cpu_stats`` flag. Below are an overview of the + possible available metrics and their meaning. + + - CPU (via ``psutil``) + + - ``cpu_percent`` — System-wide CPU utilization (%) + - ``cpu_vm_percent`` — System-wide virtual memory (RAM) utilization (%) + - ``cpu_swap_percent`` — System-wide swap memory utilization (%) + + - CUDA GPU (via ``torch.cuda.memory_stats``) + + Logs memory statistics from PyTorch caching allocator (all in bytes). + GPU compute utilization is not logged by default. + + - General Memory Usage: + + - ``allocated_bytes.all.current`` — Current allocated GPU memory + - ``allocated_bytes.all.peak`` — Peak allocated GPU memory + - ``reserved_bytes.all.current`` — Current reserved GPU memory (allocated + cached) + - ``reserved_bytes.all.peak`` — Peak reserved GPU memory + - ``active_bytes.all.current`` — Current GPU memory in active use + - ``active_bytes.all.peak`` — Peak GPU memory in active use + - ``inactive_split_bytes.all.current`` — Memory in inactive, splittable blocks + + - Allocator Pool Statistics* (for ``small_pool`` and ``large_pool``): + + - ``allocated_bytes.{pool_type}.current`` / ``allocated_bytes.{pool_type}.peak`` + - ``reserved_bytes.{pool_type}.current`` / ``reserved_bytes.{pool_type}.peak`` + - ``active_bytes.{pool_type}.current`` / ``active_bytes.{pool_type}.peak`` + + - Allocator Events: + + - ``num_ooms`` — Cumulative out-of-memory errors + - ``num_alloc_retries`` — Number of allocation retries + - ``num_device_alloc`` — Number of device allocations + - ``num_device_free`` — Number of device deallocations + + For a full list of CUDA memory stats, see the + `PyTorch documentation `_. + + - TPU (via ``torch_xla``) + + - *Memory Metrics* (per device, e.g., ``xla:0``): + + - ``memory.free.xla:0`` — Free HBM memory (MB) + - ``memory.used.xla:0`` — Used HBM memory (MB) + - ``memory.percent.xla:0`` — Percentage of HBM memory used (%) + + - *XLA Operation Counters*: + + - ``CachedCompile.xla`` + - ``CreateXlaTensor.xla`` + - ``DeviceDataCacheMiss.xla`` + - ``UncachedCompile.xla`` + - ``xla::add.xla``, ``xla::addmm.xla``, etc. + + These counters can be retrieved using: ``torch_xla.debug.metrics.counter_names()`` + Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. If ``True``, it will log CPU stats regardless of the accelerator. From cb4f7292423e0f7242d19f19fe2ab710dd21806a Mon Sep 17 00:00:00 2001 From: slicesequal Date: Wed, 3 Sep 2025 17:08:27 +0800 Subject: [PATCH 31/40] chore: remove redundant words (#21151) Signed-off-by: slicesequal (cherry picked from commit 10f10b6151c52f004a185df5af541d63c60ac95b) --- docs/source-fabric/guide/index.rst | 2 +- docs/source-fabric/levels/intermediate.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source-fabric/guide/index.rst b/docs/source-fabric/guide/index.rst index 669501ec09864..6444a6bbdd17a 100644 --- a/docs/source-fabric/guide/index.rst +++ b/docs/source-fabric/guide/index.rst @@ -78,7 +78,7 @@ Build your own Trainer
.. displayitem:: - :header: Organize your model code with with LightningModule + :header: Organize your model code with LightningModule :description: Organize your code in a LightningModule and use it with Fabric :button_link: lightning_module.html :col_css: col-md-4 diff --git a/docs/source-fabric/levels/intermediate.rst b/docs/source-fabric/levels/intermediate.rst index f21e7c96608ab..30f27bdd1d544 100644 --- a/docs/source-fabric/levels/intermediate.rst +++ b/docs/source-fabric/levels/intermediate.rst @@ -19,7 +19,7 @@ Intermediate skills
.. displayitem:: - :header: Organize your model code with with LightningModule + :header: Organize your model code with LightningModule :description: Organize your code in a LightningModule and use it with Fabric :button_link: ../guide/lightning_module.html :col_css: col-md-4 From 98536d63c2f135b6be09153951c4117f7e898b0b Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 3 Sep 2025 11:11:26 +0200 Subject: [PATCH 32/40] tests: fix missing import --- .../callbacks/progress/test_tqdm_progress_bar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py index aafc2d577416d..89d2fa73e3b6b 100644 --- a/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py @@ -18,7 +18,7 @@ from collections import defaultdict from typing import Union from unittest import mock -from unittest.mock import ANY, Mock, PropertyMock, call +from unittest.mock import ANY, Mock, PropertyMock, call, patch import pytest import torch @@ -803,7 +803,7 @@ def test_tqdm_leave(leave, tmp_path): assert pbar.init_train_tqdm.call_count == (4 if leave else 1) -@patch("lightning.pytorch.trainer.connectors.callback_connector._RICH_AVAILABLE", False) +@patch("lightning.pytorch.callbacks.progress.rich_progress._RICH_AVAILABLE", False) def test_tqdm_progress_bar_reset_behavior(tmp_path): """Test that progress bars call reset() without parameters and set total separately.""" model = BoringModel() From 514a79279be76b755398f0a7a1c550403df8c616 Mon Sep 17 00:00:00 2001 From: Mariano Ntrougkas <44480600+marios1861@users.noreply.github.com> Date: Wed, 3 Sep 2025 16:32:09 +0300 Subject: [PATCH 33/40] fix links in DeepSpeed docs (#21153) (cherry picked from commit f61713a8490851cfd3754648754f87d623ba5ab2) --- src/lightning/pytorch/strategies/deepspeed.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index 261b24545314f..c5253f77cdedb 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -125,14 +125,13 @@ def __init__( exclude_frozen_parameters: bool = False, ) -> None: """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large - billion parameter models. `For more information: https://pytorch- - lightning.readthedocs.io/en/stable/advanced/model_parallel.html#deepspeed`. + billion parameter models. *For more information:* :ref:`deepspeed_advanced`. .. warning:: This is an :ref:`experimental ` feature. Defaults have been set to enable ZeRO-Offload and some have been taken from the link below. These defaults have been set generally, but may require tuning for optimum performance based on your model size. - `For more information: https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training`. + *For more information:* https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training. Arguments: From bb4dbf98693c22a4d11b551c2c7bf244c0a939c9 Mon Sep 17 00:00:00 2001 From: Shion Matsumoto Date: Wed, 3 Sep 2025 09:53:04 -0400 Subject: [PATCH 34/40] `uv` for tests-fabric (#21155) (cherry picked from commit 663b6ce3a858dfb2da4752cb80f5cee08af17dfe) --- .github/workflows/ci-tests-fabric.yml | 67 +++++++++++++------------- .github/workflows/ci-tests-pytorch.yml | 1 + 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index abfe0e4e8881a..615633c5311bf 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -62,7 +62,6 @@ jobs: env: PACKAGE_NAME: ${{ matrix.config.pkg-name }} FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - PYPI_CACHE_DIR: "_pip-wheels" TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" # TODO: Remove this - Enable running MPS tests on this platform @@ -70,41 +69,50 @@ jobs: steps: - uses: actions/checkout@v5 - - name: Set up Python ${{ matrix.config.python-version }} - uses: actions/setup-python@v5 + - name: Install uv and set Python version + uses: astral-sh/setup-uv@v6 with: python-version: ${{ matrix.config.python-version || '3.9' }} + # TODO: Avoid activating environment like this + # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment + activate-environment: true + enable-cache: true - - name: basic setup - run: pip install -q -r .actions/requirements.txt + - name: Basic setup + run: uv pip install -q -r .actions/requirements.txt + + - name: Append Env. vars for Linux + if: ${{ runner.os == 'Linux' }} + run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV + + - name: Append Env. vars for MacOS + if: ${{ runner.os == 'macOS' }} + run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV + + - name: Append Env. vars for Windows + if: ${{ runner.os == 'windows' }} + run: | + # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support" + echo "USE_LIBUV=0" >> $GITHUB_ENV - name: Set min. dependencies if: ${{ matrix.config.requires == 'oldest' }} run: | cd requirements/fabric - pip install -U "lightning-utilities[cli]" + uv pip install -U "lightning-utilities[cli]" python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt', 'test.txt']" - pip install "cython<3.0" wheel - pip install "pyyaml==5.4" --no-build-isolation + uv pip install "cython<3.0" wheel + uv pip install "pyyaml==5.4" --no-build-isolation - name: Adjust PyTorch versions in requirements files if: ${{ matrix.config.requires != 'oldest' }} run: | - pip install -q -r requirements/ci.txt + uv pip install -q -r requirements/ci.txt python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py for fpath in `ls requirements/**/*.txt`; do \ python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \ done - - name: pip wheels cache - uses: actions/cache/restore@v4 - with: - path: ${{ env.PYPI_CACHE_DIR }} - key: pypi_wheels - - run: | - mkdir -p $PYPI_CACHE_DIR - ls -lh $PYPI_CACHE_DIR - - name: Expand Env. variables run: | # Switch PyTorch URL between stable and test/future @@ -113,25 +121,15 @@ jobs: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.config.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.config.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV - - name: Append Env. vars for MacOS - if: ${{ runner.os == 'macOS' }} - run: | - # trying to avoid "gloo" issue with SIGABRT - echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV - - name: Append Env. vars for Windows - if: ${{ runner.os == 'windows' }} - run: | - # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support" - echo "USE_LIBUV=0" >> $GITHUB_ENV - name: Install package & dependencies timeout-minutes: 20 run: | - pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \ - -U --upgrade-strategy=eager --prefer-binary \ - --extra-index-url="${TORCH_URL}" \ - --find-links="${PYPI_CACHE_DIR}" - pip list + uv pip install ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \ + --upgrade \ + --find-links="${TORCH_URL}" + uv pip list + - name: Dump handy wheels if: github.event_name == 'push' && github.ref == 'refs/heads/master' continue-on-error: true @@ -179,6 +177,9 @@ jobs: name: CPU-coverage fail_ci_if_error: false + - name: Minimize uv cache + run: uv cache prune --ci + fabric-cpu-guardian: runs-on: ubuntu-latest needs: fabric-cpu diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index e4ec99849cb41..41d895f14ae86 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -89,6 +89,7 @@ jobs: - name: Append Env. vars for Linux if: ${{ runner.os == 'Linux' }} run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV + - name: Append Env. vars for MacOS if: ${{ runner.os == 'macOS' }} run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV From adc72e24f657cb34a096ea433e681965646abe23 Mon Sep 17 00:00:00 2001 From: Mauricio Villegas <5780272+mauvilsa@users.noreply.github.com> Date: Thu, 4 Sep 2025 10:18:47 +0200 Subject: [PATCH 35/40] Remove forgotten deprecated parse_as_dict argument in LightningCLI test (#21159) (cherry picked from commit cdc0db4898b8cad6dbc8611b8e722280bdf79ec6) --- tests/tests_pytorch/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 50e6a6f721ea1..248852f4cf1f3 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -98,7 +98,7 @@ class _UnkArgError(Exception): def _raise(): raise _UnkArgError - parser = LightningArgumentParser(add_help=False, parse_as_dict=False) + parser = LightningArgumentParser(add_help=False) parser.add_lightning_class_args(Trainer, None) monkeypatch.setattr(parser, "exit", lambda *args: _raise(), raising=True) From 98a5188bc43bafcfd9058ce69337948ba563153d Mon Sep 17 00:00:00 2001 From: Kavyansh Tyagi <142140238+kavyanshtyagi@users.noreply.github.com> Date: Thu, 4 Sep 2025 13:40:10 +0200 Subject: [PATCH 36/40] Respect `verbose=False` in `seed_everything` when no seed is provided (#21161) * Update test_mlflow.py * test_mlflow.py * Respect verbose flag in seed_everything * Apply suggestions from code review --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit d6499eddeb69d541dba13cd5814691a74a45318c) --- src/lightning/fabric/CHANGELOG.md | 3 +++ src/lightning/fabric/utilities/seed.py | 3 ++- tests/tests_fabric/utilities/test_seed.py | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 2020d560d1760..111328e362975 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) +- Respect `verbose=False` in `seed_everything` when no seed is provided + + ## [2.5.4] - 2025-08-29 ### Changed diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py index 534e5e3db653e..841fa195696a2 100644 --- a/src/lightning/fabric/utilities/seed.py +++ b/src/lightning/fabric/utilities/seed.py @@ -40,7 +40,8 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose: env_seed = os.environ.get("PL_GLOBAL_SEED") if env_seed is None: seed = 0 - rank_zero_warn(f"No seed found, seed set to {seed}") + if verbose: + rank_zero_warn(f"No seed found, seed set to {seed}") else: try: seed = int(env_seed) diff --git a/tests/tests_fabric/utilities/test_seed.py b/tests/tests_fabric/utilities/test_seed.py index 2700213747f9a..81fde5aae3ef5 100644 --- a/tests/tests_fabric/utilities/test_seed.py +++ b/tests/tests_fabric/utilities/test_seed.py @@ -72,6 +72,14 @@ def test_seed_everything_accepts_valid_seed_from_env(): assert seed_everything() == 17 +@mock.patch.dict(os.environ, {}, clear=True) +def test_seed_everything_non_verbose_no_warning(): + """Ensure that no warning is emitted when verbose is False and no seed is provided.""" + with warnings.catch_warnings(record=True) as caught: + seed_everything(verbose=False) + assert caught == [] + + def test_reset_seed_no_op(): """Test that the reset_seed function is a no-op when seed_everything() was not used.""" assert "PL_GLOBAL_SEED" not in os.environ From 2e5d022f681f22a259230db915989b854ce822ff Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Thu, 4 Sep 2025 13:47:53 +0200 Subject: [PATCH 37/40] Tuner cleanup on error (#21162) * Make sure temp checkpoints are cleaned up on failed tuning * add testing * changelog --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit e1e2534d32cad1d2bad87f5b03da857108773bba) --- src/lightning/pytorch/CHANGELOG.md | 5 ++ .../pytorch/tuner/batch_size_scaling.py | 29 ++++---- src/lightning/pytorch/tuner/lr_finder.py | 67 ++++++++++--------- tests/tests_pytorch/tuner/test_lr_finder.py | 50 ++++++++++++++ .../tuner/test_scale_batch_size.py | 47 +++++++++++++ 5 files changed, 154 insertions(+), 44 deletions(-) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 97e4e13a31a16..9f78e5a17e77c 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -33,6 +33,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `TQDMProgressBar` not resetting correctly when using both a finite and iterable dataloader ([#21147](https://github.com/Lightning-AI/pytorch-lightning/pull/21147)) +- Fixed cleanup of temporary files from `Tuner` on crashes ([#21162](https://github.com/Lightning-AI/pytorch-lightning/pull/21162)) + + +--- + ## [2.5.4] - 2025-08-29 ### Fixed diff --git a/src/lightning/pytorch/tuner/batch_size_scaling.py b/src/lightning/pytorch/tuner/batch_size_scaling.py index 99badd84bb8ad..78d2aa52f5725 100644 --- a/src/lightning/pytorch/tuner/batch_size_scaling.py +++ b/src/lightning/pytorch/tuner/batch_size_scaling.py @@ -76,24 +76,27 @@ def _scale_batch_size( if trainer.progress_bar_callback: trainer.progress_bar_callback.disable() - new_size, _ = _adjust_batch_size(trainer, batch_arg_name, value=init_val) - - if mode == "power": - new_size = _run_power_scaling(trainer, new_size, batch_arg_name, max_trials, params) - elif mode == "binsearch": - new_size = _run_binary_scaling(trainer, new_size, batch_arg_name, max_trials, params) + try: + new_size, _ = _adjust_batch_size(trainer, batch_arg_name, value=init_val) - garbage_collection_cuda() + if mode == "power": + new_size = _run_power_scaling(trainer, new_size, batch_arg_name, max_trials, params) + elif mode == "binsearch": + new_size = _run_binary_scaling(trainer, new_size, batch_arg_name, max_trials, params) - log.info(f"Finished batch size finder, will continue with full run using batch size {new_size}") + garbage_collection_cuda() - __scale_batch_restore_params(trainer, params) + log.info(f"Finished batch size finder, will continue with full run using batch size {new_size}") + except Exception as ex: + raise ex + finally: + __scale_batch_restore_params(trainer, params) - if trainer.progress_bar_callback: - trainer.progress_bar_callback.enable() + if trainer.progress_bar_callback: + trainer.progress_bar_callback.enable() - trainer._checkpoint_connector.restore(ckpt_path) - trainer.strategy.remove_checkpoint(ckpt_path) + trainer._checkpoint_connector.restore(ckpt_path) + trainer.strategy.remove_checkpoint(ckpt_path) return new_size diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py index b4b61d5cf0f93..5ef35dcd6d992 100644 --- a/src/lightning/pytorch/tuner/lr_finder.py +++ b/src/lightning/pytorch/tuner/lr_finder.py @@ -257,40 +257,45 @@ def _lr_find( # Initialize lr finder object (stores results) lr_finder = _LRFinder(mode, min_lr, max_lr, num_training) - # Configure optimizer and scheduler - lr_finder._exchange_scheduler(trainer) - - # Fit, lr & loss logged in callback - _try_loop_run(trainer, params) - - # Prompt if we stopped early - if trainer.global_step != num_training + start_steps: - log.info(f"LR finder stopped early after {trainer.global_step} steps due to diverging loss.") - - # Transfer results from callback to lr finder object - lr_finder.results.update({"lr": trainer.callbacks[0].lrs, "loss": trainer.callbacks[0].losses}) - lr_finder._total_batch_idx = trainer.fit_loop.total_batch_idx # for debug purpose - - __lr_finder_restore_params(trainer, params) - - if trainer.progress_bar_callback: - trainer.progress_bar_callback.enable() - - # Update results across ranks - lr_finder.results = trainer.strategy.broadcast(lr_finder.results) - - # Restore initial state of model (this will also restore the original optimizer state) - trainer._checkpoint_connector.restore(ckpt_path) - trainer.strategy.remove_checkpoint(ckpt_path) - trainer.fit_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True - trainer.fit_loop.epoch_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True - trainer.fit_loop.epoch_loop.val_loop._combined_loader = None - trainer.fit_loop._combined_loader = None # reset data fetcher to avoid issues with the next fit - trainer.fit_loop.setup_data() + lr_finder_finished = False + try: + # Configure optimizer and scheduler + lr_finder._exchange_scheduler(trainer) + + # Fit, lr & loss logged in callback + _try_loop_run(trainer, params) + + # Prompt if we stopped early + if trainer.global_step != num_training + start_steps: + log.info(f"LR finder stopped early after {trainer.global_step} steps due to diverging loss.") + + # Transfer results from callback to lr finder object + lr_finder.results.update({"lr": trainer.callbacks[0].lrs, "loss": trainer.callbacks[0].losses}) + lr_finder._total_batch_idx = trainer.fit_loop.total_batch_idx # for debug purpose + + __lr_finder_restore_params(trainer, params) + + if trainer.progress_bar_callback: + trainer.progress_bar_callback.enable() + + # Update results across ranks + lr_finder.results = trainer.strategy.broadcast(lr_finder.results) + lr_finder_finished = True + except Exception as ex: + raise ex + finally: + # Restore initial state of model (this will also restore the original optimizer state) + trainer._checkpoint_connector.restore(ckpt_path) + trainer.strategy.remove_checkpoint(ckpt_path) + trainer.fit_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True + trainer.fit_loop.epoch_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True + trainer.fit_loop.epoch_loop.val_loop._combined_loader = None + trainer.fit_loop._combined_loader = None # reset data fetcher to avoid issues with the next fit + trainer.fit_loop.setup_data() # Apply LR suggestion after restoring so it persists for the real training run # When used as a callback, the suggestion would otherwise be lost due to checkpoint restore - if update_attr: + if update_attr and lr_finder_finished: lr = lr_finder.suggestion() if lr is not None: # update the attribute on the LightningModule (e.g., lr or learning_rate) diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py index 69575a351b0a5..81352ebe256ef 100644 --- a/tests/tests_pytorch/tuner/test_lr_finder.py +++ b/tests/tests_pytorch/tuner/test_lr_finder.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import glob import logging import math import os @@ -750,3 +751,52 @@ def __init__(self): assert not torch.allclose(gradients, gradients_no_spacing, rtol=0.1), ( "Gradients should differ significantly in exponential mode when using proper spacing" ) + + +def test_lr_finder_checkpoint_cleanup_on_error(tmp_path): + """Test that temporary checkpoint files are cleaned up even when an error occurs during lr finding.""" + + class FailingModel(BoringModel): + def __init__(self, fail_on_step=2): + super().__init__() + self.fail_on_step = fail_on_step + self.current_step = 0 + self.learning_rate = 1e-3 + + def training_step(self, batch, batch_idx): + self.current_step += 1 + if self.current_step >= self.fail_on_step: + raise RuntimeError("Intentional failure for testing cleanup") + return super().training_step(batch, batch_idx) + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + model = FailingModel() + lr_finder = LearningRateFinder(num_training_steps=5) + + trainer = Trainer( + default_root_dir=tmp_path, + max_epochs=1, + enable_checkpointing=False, + enable_progress_bar=False, + enable_model_summary=False, + logger=False, + callbacks=[lr_finder], + ) + + # Check no lr_find checkpoint files exist initially + lr_find_checkpoints = glob.glob(os.path.join(tmp_path, ".lr_find_*.ckpt")) + assert len(lr_find_checkpoints) == 0, "No lr_find checkpoint files should exist initially" + + # Run lr finder and expect it to fail + with pytest.raises(RuntimeError, match="Intentional failure for testing cleanup"): + trainer.fit(model) + + # Check that no lr_find checkpoint files are left behind + lr_find_checkpoints = glob.glob(os.path.join(tmp_path, ".lr_find_*.ckpt")) + assert len(lr_find_checkpoints) == 0, ( + f"lr_find checkpoint files should be cleaned up, but found: {lr_find_checkpoints}" + ) diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index e4ed533c6fa83..f0e5fbe6a3c49 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import glob import logging import os from copy import deepcopy @@ -486,3 +487,49 @@ def test_batch_size_finder_callback_val_batches(tmp_path): assert trainer.num_val_batches[0] == len(trainer.val_dataloaders) assert trainer.num_val_batches[0] != steps_per_trial + + +def test_scale_batch_size_checkpoint_cleanup_on_error(tmp_path): + """Test that temporary checkpoint files are cleaned up even when an error occurs during batch size scaling.""" + + class FailingModel(BoringModel): + def __init__(self, fail_on_step=2): + super().__init__() + self.fail_on_step = fail_on_step + self.current_step = 0 + self.batch_size = 2 + + def training_step(self, batch, batch_idx): + self.current_step += 1 + if self.current_step >= self.fail_on_step: + raise RuntimeError("Intentional failure for testing cleanup") + return super().training_step(batch, batch_idx) + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.batch_size) + + model = FailingModel() + batch_size_finder = BatchSizeFinder(max_trials=3, steps_per_trial=2) + trainer = Trainer( + default_root_dir=tmp_path, + max_epochs=1, + enable_checkpointing=False, + enable_progress_bar=False, + enable_model_summary=False, + logger=False, + callbacks=[batch_size_finder], + ) + + # Check no scale_batch_size checkpoint files exist initially + scale_checkpoints = glob.glob(os.path.join(tmp_path, ".scale_batch_size_*.ckpt")) + assert len(scale_checkpoints) == 0, "No scale_batch_size checkpoint files should exist initially" + + # Run batch size scaler and expect it to fail + with pytest.raises(RuntimeError, match="Intentional failure for testing cleanup"): + trainer.fit(model) + + # Check that no scale_batch_size checkpoint files are left behind + scale_checkpoints = glob.glob(os.path.join(tmp_path, ".scale_batch_size_*.ckpt")) + assert len(scale_checkpoints) == 0, ( + f"scale_batch_size checkpoint files should be cleaned up, but found: {scale_checkpoints}" + ) From 1ed69e04df7716ad540643145b906fd7260ab66c Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:04:08 +0200 Subject: [PATCH 38/40] update flaky syntax (#21156) (cherry picked from commit bb4769f711897efcae80a424579bceea0900b35a) --- tests/tests_fabric/utilities/test_spike.py | 2 +- tests/tests_pytorch/callbacks/test_spike.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_fabric/utilities/test_spike.py b/tests/tests_fabric/utilities/test_spike.py index e96a5f77df384..e7f0bedb8e9e9 100644 --- a/tests/tests_fabric/utilities/test_spike.py +++ b/tests/tests_fabric/utilities/test_spike.py @@ -30,7 +30,7 @@ def spike_detection_test(fabric, global_rank_spike, spike_value, should_raise): ) -@pytest.mark.flaky(max_runs=3) +@pytest.mark.flaky(reruns=3) @pytest.mark.parametrize( ("global_rank_spike", "num_devices", "spike_value", "finite_only"), # NOTE FOR ALL FOLLOWING TESTS: diff --git a/tests/tests_pytorch/callbacks/test_spike.py b/tests/tests_pytorch/callbacks/test_spike.py index f61a6c59ca9db..86e3ac88e93cf 100644 --- a/tests/tests_pytorch/callbacks/test_spike.py +++ b/tests/tests_pytorch/callbacks/test_spike.py @@ -48,7 +48,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx) -@pytest.mark.flaky(max_runs=3) +@pytest.mark.flaky(reruns=3) @pytest.mark.parametrize( ("global_rank_spike", "num_devices", "spike_value", "finite_only"), # NOTE FOR ALL FOLLOWING TESTS: From edbca3604a6f814acf0f3a6018b953a776f59f4f Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+borda@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:05:26 +0200 Subject: [PATCH 39/40] debug failing tests for Fabric with `ddp_fork` on PT 2.8 (#21093) * let `_get_default_process_group_backend_for_device` support more hardware platforms (#21057) * support more hardware platforms and no longer hard code cuda when call _get_default_process_group_backend_for_device * Apply suggestions from code review --------- * try it * chlog --------- Signed-off-by: taozhiwei Co-authored-by: taozhiwei Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nicki Skafte Detlefsen (cherry picked from commit 3c81316aed0ba26d8ab1d9abdfe2fdf7f926acaf) --- src/lightning/fabric/CHANGELOG.md | 4 ++++ src/lightning/fabric/strategies/ddp.py | 12 +++++++++- src/lightning/fabric/utilities/distributed.py | 6 ++++- .../utilities/test_distributed.py | 22 +++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 111328e362975..55961cb820bab 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -13,6 +13,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Include `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060)) +- Let `_get_default_process_group_backend_for_device` support more hardware platforms ( + [#21057](https://github.com/Lightning-AI/pytorch-lightning/pull/21057), [#21093](https://github.com/Lightning-AI/pytorch-lightning/pull/21093)) + + ### Fixed - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index e826b910c16d3..af182ad7f422f 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -160,7 +160,17 @@ def barrier(self, *args: Any, **kwargs: Any) -> None: if torch.distributed.get_backend() == "nccl": torch.distributed.barrier(device_ids=self._determine_ddp_device_ids()) else: - torch.distributed.barrier() + # Handle PyTorch bug where barrier() fails on CPU with "PrivateUse1HooksInterface" error + try: + torch.distributed.barrier() + except RuntimeError as e: + if "PrivateUse1HooksInterface" in str(e): + # Fallback: Use all_reduce as barrier - all processes must participate + # This achieves the same synchronization effect as barrier() + dummy_tensor = torch.tensor(0.0, device=self.root_device) + torch.distributed.all_reduce(dummy_tensor) + else: + raise @override def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index ec4eb261f2d3e..500f3a3e2aa92 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -319,7 +319,11 @@ def _destroy_dist_connection() -> None: def _get_default_process_group_backend_for_device(device: torch.device) -> str: - return "nccl" if device.type == "cuda" else "gloo" + """Return corresponding distributed backend for a given device.""" + device_backend_map = torch.distributed.Backend.default_device_backend_map + if device.type in device_backend_map: + return device_backend_map[device.type] + return "gloo" class _DatasetSamplerWrapper(Dataset): diff --git a/tests/tests_fabric/utilities/test_distributed.py b/tests/tests_fabric/utilities/test_distributed.py index d65eaa810ff4d..51c4b320d5525 100644 --- a/tests/tests_fabric/utilities/test_distributed.py +++ b/tests/tests_fabric/utilities/test_distributed.py @@ -17,6 +17,7 @@ from lightning.fabric.utilities.distributed import ( _destroy_dist_connection, _gather_all_tensors, + _get_default_process_group_backend_for_device, _InfiniteBarrier, _init_dist_connection, _is_dtensor, @@ -243,6 +244,27 @@ def test_init_dist_connection_registers_destruction_handler(_, atexit_mock): atexit_mock.register.assert_not_called() +def test_get_default_process_group_backend_for_device(): + """Test that each device type maps to its correct default process group backend.""" + # register a custom backend for test + torch.utils.rename_privateuse1_backend("pcu") + + def mock_backend(store, group_rank, group_size, timeout): + pass + + torch.distributed.Backend.register_backend( + "pccl", + lambda store, group_rank, group_size, timeout: mock_backend(store, group_rank, group_size, timeout), + devices=["pcu"], + ) + + # test that the default backend is correctly set for each device + devices = [torch.device("cpu"), torch.device("cuda:0"), torch.device("pcu:0")] + backends = ["gloo", "nccl", "pccl"] + for device, backend in zip(devices, backends): + assert _get_default_process_group_backend_for_device(device) == backend + + @RunIf(min_torch="2.4") def test_is_dtensor(monkeypatch): from torch.distributed._tensor import DTensor From 88c3591f14ba32d2971b586400fb560548edc012 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 3 Sep 2025 11:15:28 +0200 Subject: [PATCH 40/40] releasing `2.5.5` --- docs/source-pytorch/common/hooks.rst | 9 +-------- src/lightning/fabric/CHANGELOG.md | 9 ++------- src/lightning/pytorch/CHANGELOG.md | 22 +++------------------- src/version.info | 2 +- 4 files changed, 7 insertions(+), 35 deletions(-) diff --git a/docs/source-pytorch/common/hooks.rst b/docs/source-pytorch/common/hooks.rst index 89c1c15d0413f..94531ab818d98 100644 --- a/docs/source-pytorch/common/hooks.rst +++ b/docs/source-pytorch/common/hooks.rst @@ -44,22 +44,15 @@ important to understand. The following order is always used: model = MyModel() callback = MyCallback() - trainer = Trainer(callbacks=[callback], logger=False, max_epochs=1) + trainer = Trainer(callbacks=[callback], logger=False, max_epochs=1, enable_progress_bar=False) trainer.fit(model) .. testoutput:: :hide: :options: +ELLIPSIS, +NORMALIZE_WHITESPACE - ┏━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┓ - ┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃ FLOPs ┃ - ┡━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━┩ - │ 0 │ layer │ Linear │ 66 │ train │ 0 │ - └───┴───────┴────────┴────────┴───────┴───────┘ - ... Callback: Training is starting! Model: Training is starting! - Epoch 0/0 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64/64 ... .. note:: diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 55961cb820bab..ae9ab0fae131f 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -6,23 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). --- -## [unReleased] - 2025-09-DD +## [2.5.5] - 2025-09-05 ### Changed - Include `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060)) - - - Let `_get_default_process_group_backend_for_device` support more hardware platforms ( [#21057](https://github.com/Lightning-AI/pytorch-lightning/pull/21057), [#21093](https://github.com/Lightning-AI/pytorch-lightning/pull/21093)) - ### Fixed - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) - - -- Respect `verbose=False` in `seed_everything` when no seed is provided +- Respecting `verbose=False` in `seed_everything` when no seed is provided ([#21161](https://github.com/Lightning-AI/pytorch-lightning/pull/21161)) ## [2.5.4] - 2025-08-29 diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 9f78e5a17e77c..55519273e0b27 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -6,38 +6,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). --- -## [unReleased] - 2025-09-DD - -- Added `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060)) - - -- Added `PossibleUserWarning` that is raised if modules are in eval mode when training starts ([#21146](https://github.com/Lightning-AI/pytorch-lightning/pull/21146)) - +## [2.5.5] - 2025-09-05 ### Changed -- - +- Include `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060)) +- Include `PossibleUserWarning` that is raised if modules are in eval mode when training starts ([#21146](https://github.com/Lightning-AI/pytorch-lightning/pull/21146)) ### Fixed - Fixed `LightningCLI` not using `ckpt_path` hyperparameters to instantiate classes ([#21116](https://github.com/Lightning-AI/pytorch-lightning/pull/21116)) - - - Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106)) - - - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105)) - - - Fixed `TQDMProgressBar` not resetting correctly when using both a finite and iterable dataloader ([#21147](https://github.com/Lightning-AI/pytorch-lightning/pull/21147)) - - - Fixed cleanup of temporary files from `Tuner` on crashes ([#21162](https://github.com/Lightning-AI/pytorch-lightning/pull/21162)) ---- - ## [2.5.4] - 2025-08-29 ### Fixed diff --git a/src/version.info b/src/version.info index fe16b348d97f7..0cadbc1e33b27 100644 --- a/src/version.info +++ b/src/version.info @@ -1 +1 @@ -2.5.4 +2.5.5