Commit 228f52d

Merge branch 'master' into feature/13324_validation-interval
2 parents: 3dc4283 + 3ed9d4e

File tree

6 files changed (+148, -35 lines)


src/lightning/fabric/CHANGELOG.md

Lines changed: 14 additions & 4 deletions
```diff
@@ -19,18 +19,28 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Changed

-- Raise ValueError when seed is `out-of-bounds` or `cannot be cast to int` ([#21029](https://github.com/Lightning-AI/pytorch-lightning/pull/21029))
+-


 ### Fixed

-- Fix XLA strategy to add support for `global_ordinal`, `local_ordinal`, `world_size` which came instead of deprecated methods ([#20852](https://github.com/Lightning-AI/pytorch-lightning/issues/20852))
+-


-- fix: remove extra `name` parameter in accelerator registry decorator ([#20975](https://github.com/Lightning-AI/pytorch-lightning/pull/20975))
+---

+## [2.5.3] - 2025-08-13
+
+### Changed
+
+- Enable "auto" for `devices` and `accelerator` as CLI arguments ([#20913](https://github.com/Lightning-AI/pytorch-lightning/pull/20913))
+- Raise ValueError when seed is `out-of-bounds` or `cannot be cast to int` ([#21029](https://github.com/Lightning-AI/pytorch-lightning/pull/21029))
+
+### Fixed
+
+- Fixed XLA strategy to add support for `global_ordinal`, `local_ordinal`, `world_size`, which replace the deprecated methods ([#20852](https://github.com/Lightning-AI/pytorch-lightning/issues/20852))
+- Fixed accelerator registry decorator by removing the extra `name` parameter ([#20975](https://github.com/Lightning-AI/pytorch-lightning/pull/20975))

----

 ## [2.5.2] - 2025-06-20
```

src/lightning/fabric/utilities/distributed.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -319,7 +319,11 @@ def _destroy_dist_connection() -> None:


 def _get_default_process_group_backend_for_device(device: torch.device) -> str:
-    return "nccl" if device.type == "cuda" else "gloo"
+    """Return corresponding distributed backend for a given device."""
+    device_backend_map = torch.distributed.Backend.default_device_backend_map
+    if device.type in device_backend_map:
+        return device_backend_map[device.type]
+    return "gloo"


 class _DatasetSamplerWrapper(Dataset):
```
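
For reference, a minimal sketch (not part of the commit) of how the new lookup resolves backends, assuming a PyTorch build with the distributed package available. The `default_backend` helper here is a hypothetical stand-in that mirrors the patched function; device types without a registered backend fall back to `"gloo"`:

```python
import torch
import torch.distributed as dist

# The default mapping maintained by torch.distributed, e.g. {"cpu": "gloo", "cuda": "nccl", ...}.
# Backends registered via dist.Backend.register_backend(..., devices=[...]) also show up here.
print(dist.Backend.default_device_backend_map)


def default_backend(device: torch.device) -> str:
    """Hypothetical mirror of the patched helper: use the registered default, else fall back to gloo."""
    return dist.Backend.default_device_backend_map.get(device.type, "gloo")


print(default_backend(torch.device("cpu")))      # "gloo"
print(default_backend(torch.device("cuda", 0)))  # "nccl" on a CUDA-enabled build
print(default_backend(torch.device("mps")))      # falls back to "gloo" if no backend is registered for mps
```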

src/lightning/pytorch/CHANGELOG.md

Lines changed: 20 additions & 21 deletions
```diff
@@ -10,17 +10,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Added

-- Added `save_on_exception` option to `ModelCheckpoint` Callback ([#20916](https://github.com/Lightning-AI/pytorch-lightning/pull/20916))
-
-
-- Added support for general mappings being returned from `training_step` when using manual optimization ([#21011](https://github.com/Lightning-AI/pytorch-lightning/pull/21011))
+-


 ### Changed

-- Allow returning `ONNXProgram` when calling `to_onnx(dynamo=True)` ([#20811](https://github.com/Lightning-AI/pytorch-lightning/pull/20811))
-
-
 - Default to RichProgressBar and RichModelSummary if the rich package is available. Fallback to TQDMProgressBar and ModelSummary otherwise. ([#9580](https://github.com/Lightning-AI/pytorch-lightning/pull/9580))

@@ -31,32 +25,37 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

-- fix progress bar console clearing for Rich `14.1+` ([#21016](https://github.com/Lightning-AI/pytorch-lightning/pull/21016))
-
-
-- fix `AdvancedProfiler` to handle nested profiling actions for Python 3.12+ ([#20809](https://github.com/Lightning-AI/pytorch-lightning/pull/20809))
+-


-- Fix support for more dtypes in `ModelSummary` ([#21034](https://github.com/Lightning-AI/pytorch-lightning/pull/21034))
+- Fixed learning rate not being correctly set after using `LearningRateFinder` callback ([#21068](https://github.com/Lightning-AI/pytorch-lightning/pull/21068))

+---

-- Fixed metrics in `RichProgressBar` being updated according to user provided `refresh_rate` ([#21032](https://github.com/Lightning-AI/pytorch-lightning/pull/21032))
+## [2.5.3] - 2025-08-13

+### Changed

-- Fix `save_last` behavior in the absence of validation ([#20960](https://github.com/Lightning-AI/pytorch-lightning/pull/20960))
+- Added `save_on_exception` option to `ModelCheckpoint` Callback ([#20916](https://github.com/Lightning-AI/pytorch-lightning/pull/20916))
+- Allow `dataloader_idx_` in log names when `add_dataloader_idx=False` ([#20987](https://github.com/Lightning-AI/pytorch-lightning/pull/20987))
+- Allow returning `ONNXProgram` when calling `to_onnx(dynamo=True)` ([#20811](https://github.com/Lightning-AI/pytorch-lightning/pull/20811))
+- Extended support for general mappings being returned from `training_step` when using manual optimization ([#21011](https://github.com/Lightning-AI/pytorch-lightning/pull/21011))

+### Fixed

+- Fixed `Trainer` to accept a `CUDAAccelerator` instance as `accelerator` with the FSDP strategy ([#20964](https://github.com/Lightning-AI/pytorch-lightning/pull/20964))
+- Fixed progress bar console clearing for Rich `14.1+` ([#21016](https://github.com/Lightning-AI/pytorch-lightning/pull/21016))
+- Fixed `AdvancedProfiler` to handle nested profiling actions for Python 3.12+ ([#20809](https://github.com/Lightning-AI/pytorch-lightning/pull/20809))
+- Fixed rich progress bar error when resuming training ([#21000](https://github.com/Lightning-AI/pytorch-lightning/pull/21000))
+- Fixed double iteration bug when resuming from a checkpoint ([#20775](https://github.com/Lightning-AI/pytorch-lightning/pull/20775))
+- Fixed support for more dtypes in `ModelSummary` ([#21034](https://github.com/Lightning-AI/pytorch-lightning/pull/21034))
+- Fixed metrics in `RichProgressBar` being updated according to user-provided `refresh_rate` ([#21032](https://github.com/Lightning-AI/pytorch-lightning/pull/21032))
+- Fixed `save_last` behavior in the absence of validation ([#20960](https://github.com/Lightning-AI/pytorch-lightning/pull/20960))
 - Fixed integration between `LearningRateFinder` and `EarlyStopping` ([#21056](https://github.com/Lightning-AI/pytorch-lightning/pull/21056))
-
-
-- Fix gradient calculation in `lr_finder` for `mode="exponential"` ([#21055](https://github.com/Lightning-AI/pytorch-lightning/pull/21055))
-
-
+- Fixed gradient calculation in `lr_finder` for `mode="exponential"` ([#21055](https://github.com/Lightning-AI/pytorch-lightning/pull/21055))
 - Fixed `save_hyperparameters` crashing with `dataclasses` using `init=False` fields ([#21051](https://github.com/Lightning-AI/pytorch-lightning/pull/21051))


----
-
 ## [2.5.2] - 2025-06-20

 ### Changed
```

src/lightning/pytorch/tuner/lr_finder.py

Lines changed: 15 additions & 9 deletions
```diff
@@ -276,24 +276,30 @@ def _lr_find(
     if trainer.progress_bar_callback:
         trainer.progress_bar_callback.enable()

-    # Update lr attr if required
+    # Update results across ranks
     lr_finder.results = trainer.strategy.broadcast(lr_finder.results)
-    if update_attr:
-        lr = lr_finder.suggestion()
-
-        # TODO: log lr.results to self.logger
-        if lr is not None:
-            lightning_setattr(model, attr_name, lr)
-            log.info(f"Learning rate set to {lr}")

-    # Restore initial state of model
+    # Restore initial state of model (this will also restore the original optimizer state)
     trainer._checkpoint_connector.restore(ckpt_path)
     trainer.strategy.remove_checkpoint(ckpt_path)
     trainer.fit_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
     trainer.fit_loop.epoch_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
     trainer.fit_loop.epoch_loop.val_loop._combined_loader = None
     trainer.fit_loop._combined_loader = None  # reset data fetcher to avoid issues with the next fit
     trainer.fit_loop.setup_data()
+
+    # Apply LR suggestion after restoring so it persists for the real training run
+    # When used as a callback, the suggestion would otherwise be lost due to checkpoint restore
+    if update_attr:
+        lr = lr_finder.suggestion()
+        if lr is not None:
+            # update the attribute on the LightningModule (e.g., lr or learning_rate)
+            lightning_setattr(model, attr_name, lr)
+            # also update the currently active optimizer(s) so training continues with the suggested LR
+            for opt in trainer.optimizers or []:
+                for pg in opt.param_groups:
+                    pg["lr"] = lr
+            log.info(f"Learning rate set to {lr}")
     return lr_finder
```
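
The reason the suggestion has to be re-applied after the restore, and written into the live optimizers as well, is that an optimizer captures its learning rate at construction time; updating the module attribute alone does not affect an optimizer that already exists. A minimal, self-contained sketch of that pitfall in plain PyTorch (the `Model` class below is hypothetical, not Lightning's actual implementation):

```python
import torch
from torch import nn


class Model(nn.Module):
    """Hypothetical module that stores its learning rate as an attribute (like `self.lr` on a LightningModule)."""

    def __init__(self, lr: float = 1e-5):
        super().__init__()
        self.lr = lr
        self.layer = nn.Linear(4, 1)


model = Model(lr=1e-5)
# The LR is baked into the optimizer at construction time.
optimizer = torch.optim.Adam(model.parameters(), lr=model.lr)

# Updating the attribute alone does NOT change an already-created optimizer ...
model.lr = 1e-3
assert optimizer.param_groups[0]["lr"] == 1e-5

# ... so a suggested LR must also be written into the live param groups, which is what the patch does.
for pg in optimizer.param_groups:
    pg["lr"] = model.lr
assert optimizer.param_groups[0]["lr"] == 1e-3
```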

tests/tests_fabric/utilities/test_distributed.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@
 from lightning.fabric.utilities.distributed import (
     _destroy_dist_connection,
     _gather_all_tensors,
+    _get_default_process_group_backend_for_device,
     _InfiniteBarrier,
     _init_dist_connection,
     _is_dtensor,
@@ -243,6 +244,27 @@ def test_init_dist_connection_registers_destruction_handler(_, atexit_mock):
     atexit_mock.register.assert_not_called()


+def test_get_default_process_group_backend_for_device():
+    """Test that each device type maps to its correct default process group backend."""
+    # register a custom backend for the test
+    torch.utils.rename_privateuse1_backend("pcu")
+
+    def mock_backend(store, group_rank, group_size, timeout):
+        pass
+
+    torch.distributed.Backend.register_backend(
+        "pccl",
+        lambda store, group_rank, group_size, timeout: mock_backend(store, group_rank, group_size, timeout),
+        devices=["pcu"],
+    )
+
+    # test that the default backend is correctly set for each device
+    devices = [torch.device("cpu"), torch.device("cuda:0"), torch.device("pcu:0")]
+    backends = ["gloo", "nccl", "pccl"]
+    for device, backend in zip(devices, backends):
+        assert _get_default_process_group_backend_for_device(device) == backend
+
+
 @RunIf(min_torch="2.4")
 def test_is_dtensor(monkeypatch):
     from torch.distributed._tensor import DTensor
```

tests/tests_pytorch/tuner/test_lr_finder.py

Lines changed: 72 additions & 0 deletions
```diff
@@ -619,6 +619,78 @@ def test_gradient_correctness():
     assert abs(suggestion - math.pi) < 1e-2, "Suggestion should be close to pi for this synthetic example"


+def test_lr_finder_callback_applies_lr_after_restore(tmp_path):
+    """LearningRateFinder used as a callback should apply its suggested LR to the optimizer used after state
+    restoration."""
+
+    import torch.nn as nn
+    import torch.nn.functional as F
+    from torch.utils.data import DataLoader, Dataset
+
+    from lightning.pytorch.callbacks import LearningRateMonitor
+
+    class RandomDataset(Dataset):
+        def __init__(self, n: int = 256, in_dim: int = 28 * 28):
+            self.x = torch.randn(n, in_dim)
+            self.y = torch.randn(n, in_dim)
+
+        def __len__(self) -> int:
+            return len(self.x)
+
+        def __getitem__(self, idx):
+            return self.x[idx], self.y[idx]
+
+    class TinyAE(BoringModel):
+        def __init__(self, lr: float = 1e-5):
+            super().__init__()
+            self.save_hyperparameters()
+            self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3))
+            self.decoder = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, 28 * 28))
+
+        def training_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT:
+            x, y = batch
+            z = self.encoder(x)
+            x_hat = self.decoder(z)
+            loss = F.mse_loss(x_hat, y)
+            return loss
+
+        def configure_optimizers(self):
+            return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
+
+    seed_everything(123)
+
+    ds = RandomDataset(n=512)
+    train_loader = DataLoader(ds, batch_size=64, shuffle=False)
+
+    model = TinyAE(lr=1e-5)
+
+    lr_finder_cb = LearningRateFinder()  # default update_attr=True should apply the suggestion
+    lr_monitor = LearningRateMonitor(logging_interval="step")
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=2,
+        callbacks=[lr_finder_cb, lr_monitor],
+        enable_model_summary=False,
+        enable_progress_bar=False,
+        log_every_n_steps=1,
+    )
+
+    trainer.fit(model, train_loader)
+    assert model.hparams.lr is not None
+    # Ensure LR Finder produced a suggestion for this setup; if not, the test can't assert application
+    assert lr_finder_cb.optimal_lr is not None, "LR Finder should have computed results"
+    suggestion = lr_finder_cb.optimal_lr.suggestion()
+    assert suggestion is not None, "LR Finder should produce a suggestion for this setup"
+
+    # Verify that the optimizer used for subsequent training has the suggested LR applied
+    assert trainer.optimizers, "Trainer should have an optimizer after fit"
+    current_lr = trainer.optimizers[0].param_groups[0]["lr"]
+    assert current_lr == pytest.approx(suggestion), (
+        f"LR Finder suggestion {suggestion} should be applied to optimizer, but got {current_lr}"
+    )
+
+
 def test_exponential_vs_linear_mode_gradient_difference(tmp_path):
     """Test that exponential and linear modes produce different but valid suggestions.
```
