Merge branch 'master' into feature/13324_validation-interval

Sohaib-Ahmed21 · web-flow · commit 727ce5773202 · 2025-08-30T13:38:45.000-07:00
diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml
@@ -4,20 +4,22 @@ trigger:
   pull_request:
     branches: ["master"]
 
-timeout: "75" # minutes
-machine: "L4_X_2"
+timeout: "55" # minutes
 parametrize:
   matrix: {}
   include:
-    # note that this is setting also all oldest requirements which is linked to Torch == 2.0
+    # note that this also sets all the oldest requirements, which are linked to Torch == 2.1
     - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
       PACKAGE_NAME: "fabric"
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
+      machine: "A100_X_2"
+    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
       PACKAGE_NAME: "fabric"
+      machine: "L4_X_2"
     # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
     #   PACKAGE_NAME: "fabric"
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
+    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
       PACKAGE_NAME: "lightning"
+      machine: "L4_X_2"
   exclude: []
 
 env:
@@ -30,6 +32,7 @@ run: |
   python --version
   pip --version
   pip install -q fire wget packaging
+  pip list
   set -ex
 
   CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
@@ -40,12 +43,15 @@ run: |
   echo "Torch URL: ${TORCH_URL}"
   COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
   echo "collecting coverage for: ${COVERAGE_SOURCE}"
+  TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
 
   if [ "${TORCH_VER}" == "2.1" ]; then
     echo "Set oldest versions"
-    cd requirements/fabric
+    pip uninstall -y deepspeed
     pip install -U "lightning-utilities[cli]"
+    cd requirements/fabric
     python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
+    python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
     cd ../..
     pip install "cython<3.0" wheel  # for compatibility
   fi
@@ -92,6 +98,7 @@ run: |
   export PL_RUN_STANDALONE_TESTS=1
   wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
   bash ./run_standalone_tests.sh "tests_fabric"
+  export PL_RUN_STANDALONE_TESTS=0
 
   #  echo "Reporting coverage" # todo
   #  python -m coverage report
diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml
@@ -4,20 +4,22 @@ trigger:
   pull_request:
     branches: ["master"]
 
-timeout: "75" # minutes
-machine: "L4_X_2"
+timeout: "55" # minutes
 parametrize:
   matrix: {}
   include:
-    # note that this is setting also all oldest requirements which is linked to Torch == 2.0
+    # note that this also sets all the oldest requirements, which are linked to Torch == 2.1
     - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
       PACKAGE_NAME: "pytorch"
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
+      machine: "A100_X_2"
+    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
       PACKAGE_NAME: "pytorch"
+      machine: "L4_X_2"
     # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
     #   PACKAGE_NAME: "pytorch"
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
+    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
       PACKAGE_NAME: "lightning"
+      machine: "L4_X_2"
   exclude: []
 
 env:
@@ -30,6 +32,7 @@ run: |
   python --version
   pip --version
   pip install -q fire wget packaging
+  pip list
   set -ex
 
   CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
@@ -40,12 +43,15 @@ run: |
   echo "Torch URL: ${TORCH_URL}"
   COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))')
   echo "collecting coverage for: ${COVERAGE_SOURCE}"
+  TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
 
   if [ "${TORCH_VER}" == "2.1" ]; then
-    recho "Set oldest versions"
-    cd requirements/pytorch
+    echo "Set oldest versions"
+    pip uninstall -y deepspeed
     pip install -U "lightning-utilities[cli]"
+    cd requirements/pytorch
     python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
+    python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
     cd ../..
     pip install "cython<3.0" wheel  # for compatibility
   fi
@@ -108,6 +114,7 @@ run: |
   export PL_RUN_STANDALONE_TESTS=1
   wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
   bash ./run_standalone_tests.sh "tests_pytorch"
+  export PL_RUN_STANDALONE_TESTS=0
 
   echo "Testing: PyTorch standalone tasks"
   cd tests_pytorch/
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 ARG UBUNTU_VERSION=22.04
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.1.1
 
 
 FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
@@ -19,5 +19,4 @@ uvicorn  # for `ServableModuleValidator`  # not setting version as re-defined in
 
 tensorboard >=2.11, <2.21.0  # for `TensorBoardLogger`
 
---find-links https://download.pytorch.org/whl/torch-tensorrt
 torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -28,7 +28,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed `ModelCheckpoint` callback to defer step/time-triggered saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106))
+
 
 
 ---
diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -262,6 +262,9 @@ def __init__(
         self.best_model_path = ""
         self.last_model_path = ""
         self._last_checkpoint_saved = ""
+        # When using step/time-based checkpointing with a validation-only monitored metric,
+        # defer the save until validation has produced the metric
+        self._defer_save_until_validation: bool = False
 
         self.kth_value: Tensor
         self.dirpath: Optional[_PATH]
@@ -308,14 +311,17 @@ def on_train_batch_end(
         batch_idx: int,
     ) -> None:
         """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`"""
-        if self._should_skip_saving_checkpoint(trainer):
-            return
+        # Do not return early here because we may need to set deferral flags even
+        # if a save already happened at this global step. We'll enforce the skip
+        # just before actually saving below.
+        skip_due_to_state = self._should_skip_saving_checkpoint(trainer)
         skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0)
 
         train_time_interval = self._train_time_interval
         skip_time = True
         now = time.monotonic()
-        if train_time_interval:
+        # Important: allow zero timedelta as a valid interval
+        if train_time_interval is not None:
             prev_time_check = self._last_time_checked
             skip_time = prev_time_check is None or (now - prev_time_check) < train_time_interval.total_seconds()
             # in case we have time differences across ranks
@@ -328,6 +334,42 @@ def on_train_batch_end(
             self._last_time_checked = now
 
         monitor_candidates = self._monitor_candidates(trainer)
+        # If monitoring a metric that is not yet available (e.g., validation-only),
+        # defer saving until validation end so the metric is present.
+        if self.monitor is not None and self.monitor not in monitor_candidates:
+            # Defer both top-k and last to avoid blocking with `_last_global_step_saved`
+            self._defer_save_until_validation = True
+            return
+
+        # Even if the monitored key exists, it could be stale from a previous validation.
+        # If validation is scheduled to run right after this batch (e.g., last batch of epoch)
+        # and we are not saving at train epoch end, defer to `on_validation_end` to use fresh metrics.
+        if (
+            self.monitor is not None
+            and not self._should_save_on_train_epoch_end(trainer)
+            and getattr(trainer.fit_loop.epoch_loop.batch_progress, "is_last_batch", False)
+        ):
+            # Only defer if a validation loop is expected to run after this batch.
+            will_run_val = False
+            if getattr(trainer, "enable_validation", False):
+                num_val_batches = (
+                    sum(trainer.num_val_batches)
+                    if isinstance(trainer.num_val_batches, list)
+                    else trainer.num_val_batches
+                )
+                if num_val_batches and num_val_batches > 0:
+                    cve = trainer.check_val_every_n_epoch
+                    if cve is None or ((trainer.current_epoch + 1) % cve == 0):
+                        will_run_val = True
+
+            if will_run_val:
+                self._defer_save_until_validation = True
+                return
+
+        # Only proceed to save if not skipping due to trainer/callback state
+        if skip_due_to_state:
+            return
+
         self._save_topk_checkpoint(trainer, monitor_candidates)
         self._save_last_checkpoint(trainer, monitor_candidates)
 
@@ -345,6 +387,14 @@ def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModul
         """Save a checkpoint at the end of the validation stage."""
         if not self._should_skip_saving_checkpoint(trainer) and not self._should_save_on_train_epoch_end(trainer):
             monitor_candidates = self._monitor_candidates(trainer)
+            # If a step/time-triggered save was deferred due to a missing monitored metric,
+            # perform the save now that validation metrics are available.
+            if self._defer_save_until_validation:
+                self._save_topk_checkpoint(trainer, monitor_candidates)
+                self._save_last_checkpoint(trainer, monitor_candidates)
+                self._defer_save_until_validation = False
+                return
+
             if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0:
                 self._save_topk_checkpoint(trainer, monitor_candidates)
             self._save_last_checkpoint(trainer, monitor_candidates)
diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_additional_cases.py
diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_edge_cases.py
diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_step_interval_val_metric.py

Original file line number	Diff line number	Diff line change
@@ -19,5 +19,4 @@ uvicorn # for `ServableModuleValidator` # not setting version as re-defined in
`19`	`19`
`20`	`20`	tensorboard >=2.11, <2.21.0 # for `TensorBoardLogger`
`21`	`21`
`22`		`---find-links https://download.pytorch.org/whl/torch-tensorrt`
`23`	`22`	`torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"`