Merge remote-tracking branch 'upstream/master' into fabric_callback_filtering

SkafteNicki · SkafteNicki · commit 2bda7436bcab · 2025-10-01T07:46:19.000+02:00
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
@@ -47,9 +47,9 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10, L4_X_2)"
-      - "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
-      - "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, pytorch, 3.12, L4_X_2)"
+      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10)"
+      - "pytorch.yml / Lit Job (lightning, 3.12)"
+      - "pytorch.yml / Lit Job (pytorch, 3.12)"
 
   - id: "Benchmarks"
     paths:
@@ -148,9 +148,9 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10, L4_X_2)"
-      - "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, fabric, 3.12, L4_X_2)"
-      - "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
+      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10)"
+      - "fabric.yml / Lit Job (fabric, 3.12)"
+      - "fabric.yml / Lit Job (lightning, 3.12)"
 
   # Temporarily disabled
   #  - id: "lightning_fabric: TPU workflow"
diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml
@@ -12,14 +12,14 @@ jobs:
   required-jobs:
     runs-on: ubuntu-latest
     if: github.event.pull_request.draft == false
-    timeout-minutes: 61 # in case something is wrong with the internal timeout
+    timeout-minutes: 71 # in case something is wrong with the internal timeout
     steps:
       - uses: Lightning-AI/probot@v5.5
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
           job: check-group
           interval: 180 # seconds
-          timeout: 60 # minutes
+          timeout: 70 # minutes
           maintainers: "Lightning-AI/lai-frameworks"
           owner: "carmocca"
diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml
@@ -4,25 +4,22 @@ trigger:
   pull_request:
     branches: ["master", "release/stable"]
 
-timeout: "55" # minutes
+timeout: "60" # minutes
+machine: "L4_X_2"
+image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this also sets all oldest requirements, which are linked to Python == 3.10
     - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
       PACKAGE_NAME: "fabric"
       python_version: "3.10"
-      machine: "L4_X_2"
-    - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-      PACKAGE_NAME: "fabric"
+    - PACKAGE_NAME: "fabric"
       python_version: "3.12"
-      machine: "L4_X_2"
     # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
     #   PACKAGE_NAME: "fabric"
-    - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-      PACKAGE_NAME: "lightning"
+    - PACKAGE_NAME: "lightning"
       python_version: "3.12"
-      machine: "L4_X_2"
   exclude: []
 
 env:
diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml
@@ -4,25 +4,22 @@ trigger:
   pull_request:
     branches: ["master", "release/stable"]
 
-timeout: "55" # minutes
+timeout: "60" # minutes
+machine: "L4_X_2"
+image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this also sets oldest requirements which are linked to Python == 3.10
     - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
       PACKAGE_NAME: "pytorch"
       python_version: "3.10"
-      machine: "L4_X_2"
-    - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-      PACKAGE_NAME: "pytorch"
+    - PACKAGE_NAME: "pytorch"
       python_version: "3.12"
-      machine: "L4_X_2"
     # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
     #   PACKAGE_NAME: "pytorch"
-    - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-      PACKAGE_NAME: "lightning"
+    - PACKAGE_NAME: "lightning"
       python_version: "3.12"
-      machine: "L4_X_2"
   exclude: []
 
 env:
diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst
@@ -21,7 +21,13 @@ For fine-grained control over checkpointing behavior, use the :class:`~lightning
         checkpoint_callback = ModelCheckpoint(dirpath="my/path/", save_top_k=2, monitor="val_loss")
         trainer = Trainer(callbacks=[checkpoint_callback])
         trainer.fit(model)
-        checkpoint_callback.best_model_path
+
+        # Access best and last model checkpoint directly from the callback
+        print(checkpoint_callback.best_model_path)
+        print(checkpoint_callback.last_model_path)
+        # Or via the trainer
+        print(trainer.checkpoint_callback.best_model_path)
+        print(trainer.checkpoint_callback.last_model_path)
 
 Any value that has been logged via *self.log* in the LightningModule can be monitored.
 
diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py
@@ -645,6 +645,7 @@ def package_list_from_file(file):
     r"installation.html$",
     r"starter/installation.html$",
     r"^../common/trainer.html#trainer-flags$",
+    "https://medium.com/pytorch-lightning/quick-contribution-guide-86d977171b3a",
     "https://deepgenerativemodels.github.io/assets/slides/cs236_lecture11.pdf",
     "https://developer.habana.ai", # returns 403 error but redirects to intel.com documentation
     "https://www.intel.com/content/www/us/en/products/docs/processors/what-is-a-gpu.html",
diff --git a/docs/source-pytorch/data/alternatives.rst b/docs/source-pytorch/data/alternatives.rst
@@ -99,7 +99,12 @@ The webdataset library contains a small wrapper (``WebLoader``) that adds a flui
     import lightning as L
     import webdataset as wds
 
-    dataset = wds.WebDataset(urls)
+    dataset = wds.WebDataset(
+        urls,
+        # needed for multi-gpu or multi-node training
+        workersplitter=wds.shardlists.split_by_worker,
+        nodesplitter=wds.shardlists.split_by_node,
+    )
     train_dataloader = wds.WebLoader(dataset)
 
     model = ...
diff --git a/requirements/docs.txt b/requirements/docs.txt
@@ -3,7 +3,7 @@ myst-parser >=0.18.1, <5.0.0
 nbsphinx >=0.8.5, <=0.9.7
 nbconvert >7.14, <7.17
 pandoc >=1.0, <=2.4
-docutils>=0.18.1,<=0.22
+docutils>=0.18.1,<=0.22.2
 sphinxcontrib-fulltoc >=1.0, <=1.2.0
 sphinxcontrib-mockautodoc
 sphinx-autobuild
diff --git a/requirements/fabric/test.txt b/requirements/fabric/test.txt
@@ -6,5 +6,5 @@ pytest-timeout ==2.4.0
 pytest-rerunfailures ==16.0.1
 pytest-random-order ==1.2.0
 click ==8.1.8; python_version < "3.11"
-click ==8.2.1; python_version > "3.10"
+click ==8.3.0; python_version > "3.10"
 tensorboardX >=2.6, <2.7.0  # todo: relax it back to `>=2.2` after fixing tests
diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
@@ -12,7 +12,7 @@ numpy >1.20.0, <1.27.0
 onnx >1.12.0, <1.20.0
 onnxruntime >=1.12.0, <1.23.0
 onnxscript >= 0.1.0, < 0.5.0
-psutil <7.0.1 # for `DeviceStatsMonitor`
+psutil <7.1.1 # for `DeviceStatsMonitor`
 pandas >2.0, <2.4.0  # needed in benchmarks
 fastapi  # for `ServableModuleValidator`  # not setting version as re-defined in App
 uvicorn  # for `ServableModuleValidator`  # not setting version as re-defined in App
diff --git a/requirements/typing.txt b/requirements/typing.txt
@@ -1,4 +1,4 @@
-mypy==1.18.1
+mypy==1.18.2
 torch==2.8.0
 
 types-Markdown
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -30,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Default to `RichProgressBar` and `RichModelSummary` if the rich package is available. Fallback to TQDMProgressBar and ModelSummary otherwise ([#20896](https://github.com/Lightning-AI/pytorch-lightning/pull/20896))
 
 
+- Add MPS accelerator support for mixed precision ([#21209](https://github.com/Lightning-AI/pytorch-lightning/pull/21209))
+
+
 ### Removed
 
 -
@@ -49,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed preventing recursive symlink creation when `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))
 
 
+- Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))
+
+
 ---
 
 ## [2.5.5] - 2025-09-05
diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -204,11 +204,11 @@ class ModelCheckpoint(Checkpoint):
         ... )
 
         # retrieve the best checkpoint after training
-        checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
-        trainer = Trainer(callbacks=[checkpoint_callback])
-        model = ...
-        trainer.fit(model)
-        checkpoint_callback.best_model_path
+        >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
+        >>> trainer = Trainer(callbacks=[checkpoint_callback])
+        >>> model = ...  # doctest: +SKIP
+        >>> trainer.fit(model)  # doctest: +SKIP
+        >>> print(checkpoint_callback.best_model_path)  # doctest: +SKIP
 
     .. tip:: Saving and restoring multiple checkpoint callbacks at the same time is supported under variation in the
         following arguments:
diff --git a/src/lightning/pytorch/cli.py b/src/lightning/pytorch/cli.py
@@ -564,6 +564,11 @@ def _parse_ckpt_path(self) -> None:
             hparams.pop("_instantiator", None)
             if not hparams:
                 return
+            if "_class_path" in hparams:
+                hparams = {
+                    "class_path": hparams.pop("_class_path"),
+                    "dict_kwargs": hparams,
+                }
             hparams = {self.config.subcommand: {"model": hparams}}
             try:
                 self.config = self.parser.parse_object(hparams, self.config)
diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
@@ -515,7 +515,7 @@ def _check_and_init_precision(self) -> Precision:
             rank_zero_info(
                 f"Using {'16bit' if self._precision_flag == '16-mixed' else 'bfloat16'} Automatic Mixed Precision (AMP)"
             )
-            device = "cpu" if self._accelerator_flag == "cpu" else "cuda"
+            device = self._accelerator_flag if self._accelerator_flag in ("cpu", "mps") else "cuda"
             return MixedPrecision(self._precision_flag, device)  # type: ignore[arg-type]
 
         raise RuntimeError("No precision set")
diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py
@@ -491,6 +491,7 @@ class BoringCkptPathModel(BoringModel):
     def __init__(self, out_dim: int = 2, hidden_dim: int = 2) -> None:
         super().__init__()
         self.save_hyperparameters()
+        self.hidden_dim = hidden_dim
         self.layer = torch.nn.Linear(32, out_dim)
 
 
@@ -526,6 +527,41 @@ def add_arguments_to_parser(self, parser):
     assert "Parsing of ckpt_path hyperparameters failed" in err.getvalue()
 
 
+class BoringCkptPathSubclass(BoringCkptPathModel):
+    def __init__(self, extra: bool = True, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.extra = extra
+
+
+def test_lightning_cli_ckpt_path_argument_hparams_subclass_mode(cleandir):
+    class CkptPathCLI(LightningCLI):
+        def add_arguments_to_parser(self, parser):
+            parser.link_arguments("model.init_args.out_dim", "model.init_args.hidden_dim", compute_fn=lambda x: x * 2)
+
+    cli_args = ["fit", "--model=BoringCkptPathSubclass", "--model.out_dim=4", "--trainer.max_epochs=1"]
+    with mock.patch("sys.argv", ["any.py"] + cli_args):
+        cli = CkptPathCLI(BoringCkptPathModel, subclass_mode_model=True)
+
+    assert cli.config.fit.model.class_path.endswith(".BoringCkptPathSubclass")
+    assert cli.config.fit.model.init_args == Namespace(out_dim=4, hidden_dim=8, extra=True)
+    hparams_path = Path(cli.trainer.log_dir) / "hparams.yaml"
+    assert hparams_path.is_file()
+    hparams = yaml.safe_load(hparams_path.read_text())
+    assert hparams["out_dim"] == 4
+    assert hparams["hidden_dim"] == 8
+    assert hparams["extra"] is True
+
+    checkpoint_path = next(Path(cli.trainer.log_dir, "checkpoints").glob("*.ckpt"))
+    cli_args = ["predict", "--model=BoringCkptPathModel", f"--ckpt_path={checkpoint_path}"]
+    with mock.patch("sys.argv", ["any.py"] + cli_args):
+        cli = CkptPathCLI(BoringCkptPathModel, subclass_mode_model=True)
+
+    assert isinstance(cli.model, BoringCkptPathSubclass)
+    assert cli.model.hidden_dim == 8
+    assert cli.model.extra is True
+    assert cli.model.layer.out_features == 4
+
+
 def test_lightning_cli_submodules(cleandir):
     class MainModule(BoringModel):
         def __init__(self, submodule1: LightningModule, submodule2: LightningModule, main_param: int = 1):
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -1084,3 +1084,13 @@ def test_precision_selection_model_parallel(precision, raises, mps_count_0):
     error_context = pytest.raises(ValueError, match=f"does not support .*{precision}") if raises else nullcontext()
     with error_context:
         _AcceleratorConnector(precision=precision, strategy=ModelParallelStrategy())
+
+
+@RunIf(mps=True)
+@pytest.mark.parametrize("accelerator", ["mps", "cpu"])
+@pytest.mark.parametrize("precision", ["16-mixed", "bf16-mixed"])
+def test_mps_amp_device_selection(accelerator, precision):
+    """Test that mixed precision sets the AMP device to the accelerator ('mps' or 'cpu') instead of 'cuda'."""
+    connector = _AcceleratorConnector(accelerator=accelerator, precision=precision)
+    assert isinstance(connector.precision_plugin, MixedPrecision)
+    assert connector.precision_plugin.device == accelerator