Merge branch 'master' into dependabot-pip-requirements-onnxscript-gte-0.1.0-and-lt-0.6.0

SkafteNicki · web-flow · commit 315ca39cbb35 · 2025-10-24T06:56:51.000+02:00
diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
@@ -85,6 +85,7 @@ jobs:
         displayName: "extend env. vars 4 future"
 
       - bash: |
+          set -ex
           echo $(DEVICES)
           echo $CUDA_VISIBLE_DEVICES
           echo $CUDA_VERSION_MM
@@ -96,6 +97,10 @@ jobs:
           python --version
           pip --version
           pip list
+          # todo: rather use devel base image
+          apt-get update -qq --fix-missing
+          apt-get install -y cuda-toolkit
+          nvcc --version
         displayName: "Image info & NVIDIA"
 
       - bash: |
@@ -156,7 +161,7 @@ jobs:
       - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
         workingDirectory: tests/
         displayName: "Testing: fabric standard"
-        timeoutInMinutes: "10"
+        timeoutInMinutes: "15"
 
       - bash: |
           wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
@@ -165,7 +170,7 @@ jobs:
         env:
           PL_RUN_STANDALONE_TESTS: "1"
         displayName: "Testing: fabric standalone"
-        timeoutInMinutes: "10"
+        timeoutInMinutes: "15"
 
       - bash: |
           python -m coverage report
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
@@ -84,6 +84,7 @@ jobs:
         displayName: "extend env. vars 4 future"
 
       - bash: |
+          set -ex
           echo $(DEVICES)
           echo $CUDA_VISIBLE_DEVICES
           echo $CUDA_VERSION_MM
@@ -95,6 +96,10 @@ jobs:
           python --version
           pip --version
           pip list
+          # todo: rather use devel base image
+          apt-get update -qq --fix-missing
+          apt-get install -y cuda-toolkit
+          nvcc --version
         displayName: "Image info & NVIDIA"
 
       - bash: |
@@ -189,7 +194,7 @@ jobs:
         env:
           PL_USE_MOCKED_MNIST: "1"
         displayName: "Testing: PyTorch standalone tasks"
-        timeoutInMinutes: "10"
+        timeoutInMinutes: "15"
 
       - bash: |
           python -m coverage report
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
@@ -47,7 +47,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10)"
+      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, pytorch, 3.10)"
       - "pytorch.yml / Lit Job (lightning, 3.12)"
       - "pytorch.yml / Lit Job (pytorch, 3.12)"
 
@@ -148,7 +148,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10)"
+      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, fabric, 3.10)"
       - "fabric.yml / Lit Job (fabric, 3.12)"
       - "fabric.yml / Lit Job (lightning, 3.12)"
 
diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml
@@ -6,18 +6,18 @@ trigger:
 
 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this is setting also all oldest requirements which is linked to python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "fabric"
       python_version: "3.10"
     - PACKAGE_NAME: "fabric"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "fabric"
+    #- image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
+    #  PACKAGE_NAME: "fabric"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []
diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml
@@ -6,18 +6,18 @@ trigger:
 
 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this also sets oldest requirements which are linked to Python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "pytorch"
       python_version: "3.10"
     - PACKAGE_NAME: "pytorch"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "pytorch"
+    #- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
+    #  PACKAGE_NAME: "pytorch"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []
diff --git a/requirements/fabric/test.txt b/requirements/fabric/test.txt
@@ -9,3 +9,4 @@ pytest-random-order ==1.2.0
 click ==8.1.8; python_version < "3.11"
 click ==8.3.0; python_version > "3.10"
 tensorboardX >=2.6, <2.7.0  # todo: relax it back to `>=2.2` after fixing tests
+huggingface-hub
diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
@@ -21,3 +21,4 @@ uvicorn  # for `ServableModuleValidator`  # not setting version as re-defined in
 tensorboard >=2.11, <2.21.0  # for `TensorBoardLogger`
 
 torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"
+huggingface-hub
diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py
@@ -113,7 +113,7 @@ def _runif_reasons(
             reasons.append("Standalone execution")
         kwargs["standalone"] = True
 
-    if deepspeed and not (_DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4):
+    if deepspeed and not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4):
         reasons.append("Deepspeed")
 
     if dynamo:
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -58,6 +58,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))
 
 
+- Fixed how `ThroughputMonitor` calculated training time ([#21291](https://github.com/Lightning-AI/pytorch-lightning/pull/21291))
+
+
 ---
 
 ## [2.5.5] - 2025-09-05
diff --git a/src/lightning/pytorch/callbacks/throughput_monitor.py b/src/lightning/pytorch/callbacks/throughput_monitor.py
@@ -109,7 +109,9 @@ def _start(self, trainer: "Trainer") -> None:
         stage = trainer.state.stage
         assert stage is not None
 
-        if stage not in self._samples:
+        reset_needed = trainer.state.fn == TrainerFn.FITTING or stage not in self._samples
+
+        if reset_needed:
             self._throughputs[stage].reset()
             self._lengths[stage] = 0
             self._samples[stage] = 0
@@ -202,10 +204,17 @@ def on_validation_batch_end(
     def on_validation_end(self, trainer: "Trainer", *_: Any) -> None:
         if trainer.sanity_checking or trainer.state.fn != TrainerFn.FITTING:
             return
+
+        train_times = self._throughputs[RunningStage.TRAINING]._time
+        val_times = self._throughputs[RunningStage.VALIDATING]._time
+
+        train_elapsed = train_times[-1] if train_times else 0.0
+        val_elapsed = val_times[-1] if val_times else 0.0
+
         # add the validation time to the training time before continuing to avoid sinking the training throughput
-        training_finished = self._t0s[RunningStage.TRAINING] + sum(self._throughputs[RunningStage.TRAINING]._time)
+        training_finished = self._t0s[RunningStage.TRAINING] + train_elapsed
         time_between_train_and_val = self._t0s[RunningStage.VALIDATING] - training_finished
-        val_time = sum(self._throughputs[RunningStage.VALIDATING]._time)
+        val_time = val_elapsed
         self._t0s[RunningStage.TRAINING] += time_between_train_and_val + val_time
 
     @override
diff --git a/tests/tests_fabric/strategies/test_deepspeed_integration.py b/tests/tests_fabric/strategies/test_deepspeed_integration.py
@@ -312,6 +312,9 @@ def _assert_saved_model_is_equal(fabric, model, checkpoint_path):
             single_ckpt_path = checkpoint_path / "single_model.pt"
             # the tag is hardcoded in DeepSpeedStrategy
             convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path, tag="checkpoint")
+
+            is_ckpt_path_a_file = os.path.isfile(single_ckpt_path)
+            single_ckpt_path = single_ckpt_path if is_ckpt_path_a_file else single_ckpt_path / "pytorch_model.bin"
             state_dict = torch.load(single_ckpt_path, weights_only=False)
         else:
             # 'checkpoint' is the tag, hardcoded in DeepSpeedStrategy
diff --git a/tests/tests_pytorch/callbacks/test_throughput_monitor.py b/tests/tests_pytorch/callbacks/test_throughput_monitor.py
@@ -420,3 +420,65 @@ def variable_batch_size_fn(batch):
             train_samples.append(metrics["train/samples"])
         elif "train|samples" in metrics:
             train_samples.append(metrics["train|samples"])
+
+
+def test_throughput_monitor_validation_with_many_epochs(tmp_path):
+    """Ensure ThroughputMonitor handles many epochs with validation and time increases monotonically."""
+
+    logger_mock = Mock()  # stand-in logger; its recorded log_metrics calls are inspected below
+    logger_mock.save_dir = tmp_path
+    monitor = ThroughputMonitor(batch_size_fn=lambda x: 1)  # every batch is counted as one sample
+    model = BoringModel()
+    model.flops_per_batch = 10
+    num_epochs = 100
+
+    trainer = Trainer(
+        devices=1,
+        logger=logger_mock,
+        callbacks=[monitor],
+        max_epochs=num_epochs,
+        limit_train_batches=2,
+        limit_val_batches=1,
+        log_every_n_steps=1,
+        enable_checkpointing=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+    )
+
+    timings = []  # scripted return values for the patched time.perf_counter
+    t = 0.0  # base timestamp of the current five-entry group
+    for _ in range(num_epochs):
+        timings += [
+            t,  # train batch 1 start
+            t + 3.0,  # train batch 1 end and start batch 2
+            t + 6.0,  # train batch 2 end
+            t + 7.0,  # val start
+            t + 8.0,  # val end
+        ]
+        t += 10.0  # the next group starts 2.0 after the last value above
+
+    with mock.patch("time.perf_counter", side_effect=timings):  # each call yields the next scripted value
+        try:
+            trainer.fit(model)
+        except Exception as e:  # report any error raised during fit as an explicit test failure
+            pytest.fail(f"ThroughputMonitor raised an unexpected exception: {e}")
+
+    start_train_timings_idx, end_train_timings_idx = 0, 1  # index pair delimiting the current interval
+    batch_num = 1  # which of the two train batches the next logged value belongs to
+    cur_train = timings[end_train_timings_idx] - timings[start_train_timings_idx]  # expected train/time, 3.0 at first
+    for c in logger_mock.log_metrics.mock_calls:  # walk every recorded logging call in order
+        metrics = getattr(c, "kwargs", None) or {}  # keyword arguments of the call, or an empty dict
+        metrics = metrics.get("metrics", metrics)  # unwrap a "metrics" keyword if present
+        for k, v in metrics.items():
+            if k.endswith("train/time"):
+                assert v == cur_train, f"Expected train/time {cur_train}, got {v}"
+                if batch_num == 1:  # after batch 1 the next interval is the adjacent pair
+                    start_train_timings_idx += 1
+                    end_train_timings_idx += 1
+                    batch_num = 2
+                else:  # after batch 2 advance by 3 -- presumably past the validation timestamps; TODO confirm
+                    start_train_timings_idx += 3
+                    end_train_timings_idx += 3
+                    batch_num = 1
+                if end_train_timings_idx < len(timings):  # do not index past the scripted list
+                    cur_train += timings[end_train_timings_idx] - timings[start_train_timings_idx]
diff --git a/tests/tests_pytorch/strategies/test_deepspeed.py b/tests/tests_pytorch/strategies/test_deepspeed.py
@@ -313,7 +313,7 @@ def on_train_start(self, trainer, pl_module) -> None:
     trainer.fit(model)
     trainer.test(model)
     assert list(lr_monitor.lrs) == ["lr-SGD"]
-    assert len(set(lr_monitor.lrs["lr-SGD"])) == 8
+    assert len(lr_monitor.lrs["lr-SGD"]) == 8
 
 
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@@ -1029,6 +1029,9 @@ def _assert_save_model_is_equal(model, tmp_path, trainer):
     if trainer.is_global_zero:
         single_ckpt_path = os.path.join(tmp_path, "single_model.pt")
         convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path)
+
+        if not os.path.isfile(single_ckpt_path):
+            single_ckpt_path = os.path.join(single_ckpt_path, "pytorch_model.bin")
         state_dict = torch.load(single_ckpt_path, weights_only=False)
 
         model = model.cpu()
diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py
@@ -13,14 +13,12 @@
 # limitations under the License.
 import os
 import sys
-from contextlib import nullcontext
 from unittest import mock
 
 import pytest
 import torch
-from lightning_utilities.core.imports import RequirementCache
 
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_4
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled
@@ -34,7 +32,7 @@
 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
 @RunIf(dynamo=True, deepspeed=True)
 @mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
-def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
+def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0):
     trainer_kwargs = {
         "default_root_dir": tmp_path,
         "fast_dev_run": True,
@@ -69,22 +67,52 @@ def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
     assert trainer.model._compiler_ctx is None
 
     # some strategies do not support it
-    if RequirementCache("deepspeed"):
-        compiled_model = torch.compile(model)
-        mock_cuda_count(monkeypatch, 2)
-
-        # TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import
-        warn_context = (
-            pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated")
-            if _TORCH_GREATER_EQUAL_2_4
-            else nullcontext()
-        )
-
-        with warn_context:
-            trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)
-
-        with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
-            trainer.fit(compiled_model)
+    compiled_model = torch.compile(model)
+    mock_cuda_count(monkeypatch, 2)
+
+    trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)
+
+    with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
+        trainer.fit(compiled_model)
+
+
+# https://github.com/pytorch/pytorch/issues/95708
+@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
+@RunIf(dynamo=True)
+@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
+def test_trainer_compiled_model_ddp(_, tmp_path, monkeypatch, mps_count_0):
+    trainer_kwargs = {
+        "default_root_dir": tmp_path,
+        "fast_dev_run": True,
+        "logger": False,
+        "enable_checkpointing": False,
+        "enable_model_summary": False,
+        "enable_progress_bar": False,
+    }
+
+    model = BoringModel()
+    compiled_model = torch.compile(model)
+    assert model._compiler_ctx is compiled_model._compiler_ctx  # shared reference
+
+    # can train with compiled model
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(compiled_model)
+    assert trainer.model._compiler_ctx["compiler"] == "dynamo"
+
+    # the compiled model can be uncompiled
+    to_uncompiled_model = to_uncompiled(compiled_model)
+    assert model._compiler_ctx is None
+    assert compiled_model._compiler_ctx is None
+    assert to_uncompiled_model._compiler_ctx is None
+
+    # the compiled model needs to be passed
+    with pytest.raises(ValueError, match="required to be a compiled LightningModule"):
+        to_uncompiled(to_uncompiled_model)
+
+    # the uncompiled model can be fitted
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
+    assert trainer.model._compiler_ctx is None
 
     # ddp does
     trainer = Trainer(strategy="ddp", **trainer_kwargs)
diff --git a/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py b/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py
@@ -57,7 +57,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
     trainer.fit(model)
 
 
-@RunIf(min_cuda_gpus=1, deepspeed=True, rich=True)
+@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True, rich=True)
 @mock.patch("rich.table.Table.add_row", autospec=True)
 def test_deepspeed_summary_with_rich_model_summary(mock_table_add_row, tmp_path):
     from lightning.pytorch.callbacks import RichModelSummary

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | @@ -21,3 +21,4 @@ uvicorn # for `ServableModuleValidator` # not setting version as re-defined in |
| `21` | `21` | tensorboard >=2.11, <2.21.0 # for `TensorBoardLogger` |
| `22` | `22` | |
| `23` | `23` | `torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"` |
| | `24` | `+huggingface-hub` |