
Commit 744cbbf

Merge branch 'master' into feat/ModelCheckpointException
2 parents: 0336478 + 6a09f27

File tree: 16 files changed, +175 -61 lines


.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -58,7 +58,7 @@ repos:
         #args: ["--write-changes"] # uncomment if you want to get automatic fixing
 
   - repo: https://github.com/PyCQA/docformatter
-    rev: 06907d0267368b49b9180eed423fae5697c1e909 # todo: fix for docformatter after last 1.7.5
+    rev: v1.7.7
     hooks:
       - id: docformatter
         additional_dependencies: [tomli]
@@ -70,7 +70,7 @@ repos:
       - id: sphinx-lint
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.4
+    rev: v0.12.2
     hooks:
       # try to fix what is possible
       - id: ruff

docs/source-pytorch/common/trainer.rst

Lines changed: 7 additions & 4 deletions
@@ -759,6 +759,9 @@ overfit_batches
 Uses this much data of the training & validation set.
 If the training & validation dataloaders have ``shuffle=True``, Lightning will automatically disable it.
 
+* When set to a value > 0, sequential sampling (no shuffling) is used
+* Consistent batches are used for both training and validation across epochs, but training and validation use different sets of data
+
 Useful for quickly debugging or trying to overfit on purpose.
 
 .. testcode::
@@ -769,11 +772,11 @@ Useful for quickly debugging or trying to overfit on purpose.
 
     # use only 1% of the train & val set
     trainer = Trainer(overfit_batches=0.01)
 
-    # overfit on 10 of the same batches
+    # overfit on 10 consistent train batches & 10 consistent val batches
     trainer = Trainer(overfit_batches=10)
 
-plugins
-^^^^^^^
+    # debug using a single consistent train batch and a single consistent val batch
+
 
 :ref:`Plugins` allow you to connect arbitrary backends, precision libraries, clusters etc. For example:
 
@@ -895,7 +898,7 @@ DataSource can be a ``LightningModule`` or a ``LightningDataModule``.
 
     # if 0 (default)
     train_loader = model.train_dataloader()
-    # or if using data module: datamodule.train_dataloader()
+    # or if using data module: datamodule.train_dataloaders()
     for epoch in epochs:
         for batch in train_loader:
             ...
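For context, a minimal usage sketch (not part of the diff) showing how the documented `overfit_batches` values are passed to the Trainer; `BoringModel` is just a stand-in module here:

# illustrative only; mirrors the flag values documented above
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

model = BoringModel()

# use only 1% of the train & val set (sampled sequentially, shuffling disabled)
trainer = Trainer(overfit_batches=0.01, max_epochs=2)

# or: 10 consistent train batches & 10 consistent val batches across epochs
# trainer = Trainer(overfit_batches=10, max_epochs=2)

trainer.fit(model)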

src/lightning/pytorch/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed metrics in `RichProgressBar` being updated according to user provided `refresh_rate` ([#21032](https://github.com/Lightning-AI/pytorch-lightning/pull/21032))
 
+
+- Fix `save_last` behavior in the absence of validation ([#20960](https://github.com/Lightning-AI/pytorch-lightning/pull/20960))
+
+
 ---
 
 ## [2.5.2] - 2025-06-20

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 8 additions & 0 deletions
@@ -348,6 +348,7 @@ def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModul
         self._save_last_checkpoint(trainer, monitor_candidates)
 
     @override
+
     def on_exception(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", exception: BaseException) -> None:
         """Save a checkpoint when an exception is raised."""
         if not self._should_save_on_exception(trainer):
@@ -361,6 +362,13 @@ def on_exception(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", e
             {str(exception)}, saved checkpoint to {filepath}"
         )
 
+    def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+        """Ensure save_last=True is applied when training ends."""
+        if self.save_last and not self._last_checkpoint_saved:
+            monitor_candidates = self._monitor_candidates(trainer)
+            self._save_last_checkpoint(trainer, monitor_candidates)
+
+
     @override
     def state_dict(self) -> dict[str, Any]:
         return {
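To illustrate what the new `on_train_end` hook guarantees, here is a rough sketch (assuming the `BoringModel` demo class; it mirrors the test added later in this commit):

from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.demos.boring_classes import BoringModel

model = BoringModel()
model.validation_step = None  # no validation loop, so on_validation_end never fires
model.val_dataloader = None

ckpt = ModelCheckpoint(dirpath="checkpoints/", save_last=True, save_on_train_epoch_end=False)
trainer = Trainer(max_epochs=1, callbacks=[ckpt], logger=False, enable_progress_bar=False)
trainer.fit(model)

# with the hook above, a "last" checkpoint is still written when training ends
print(ckpt.last_model_path)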

src/lightning/pytorch/core/optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ def _configure_schedulers_automatic_opt(schedulers: list, monitor: Optional[str]
         scheduler["reduce_on_plateau"] = scheduler.get(
             "reduce_on_plateau", isinstance(scheduler["scheduler"], optim.lr_scheduler.ReduceLROnPlateau)
         )
-        if scheduler["reduce_on_plateau"] and scheduler.get("monitor", None) is None:
+        if scheduler["reduce_on_plateau"] and scheduler.get("monitor") is None:
             raise MisconfigurationException(
                 "The lr scheduler dict must include a monitor when a `ReduceLROnPlateau` scheduler is used."
                 ' For example: {"optimizer": optimizer, "lr_scheduler":'
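For reference, a hedged sketch of the configuration this check validates: a `ReduceLROnPlateau` scheduler dict returned from `configure_optimizers` must carry a `monitor` key (the metric name `val_loss` is only a placeholder):

from lightning.pytorch import LightningModule
from torch import optim


class PlateauModule(LightningModule):  # hypothetical module that logs "val_loss"
    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(), lr=0.1)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")
        # omitting "monitor" here would raise the MisconfigurationException checked above
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
        }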

src/lightning/pytorch/demos/transformer.py

Lines changed: 13 additions & 4 deletions
@@ -54,15 +54,24 @@ def __init__(
 
         self.ninp = ninp
         self.vocab_size = vocab_size
-        self.src_mask = None
+        self.src_mask: Optional[Tensor] = None
+
+    def generate_square_subsequent_mask(self, size: int) -> Tensor:
+        """Generate a square mask for the sequence to prevent future tokens from being seen."""
+        mask = torch.triu(torch.ones(size, size), diagonal=1)
+        mask = mask.float().masked_fill(mask == 1, float("-inf")).masked_fill(mask == 0, 0.0)
+        return mask
 
     def forward(self, inputs: Tensor, target: Tensor, mask: Optional[Tensor] = None) -> Tensor:
         _, t = inputs.shape
 
-        # we assume target is already shifted w.r.t. inputs
+        # Generate source mask to prevent future token leakage
+        if self.src_mask is None or self.src_mask.size(0) != t:
+            self.src_mask = self.generate_square_subsequent_mask(t).to(inputs.device)
+
+        # Generate target mask if not provided
         if mask is None:
-            mask = torch.tril(torch.ones(t, t, device=inputs.device)) == 1
-            mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, 0.0)
+            mask = self.generate_square_subsequent_mask(t).to(inputs.device)
 
         src = self.pos_encoder(self.embedding(inputs) * math.sqrt(self.ninp))
         target = self.pos_encoder(self.embedding(target) * math.sqrt(self.ninp))
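The new mask helper can be exercised on its own; a standalone sketch of the same `torch.triu`-based construction (positions above the diagonal become `-inf`, so attention cannot look at future tokens):

import torch


def generate_square_subsequent_mask(size: int) -> torch.Tensor:
    # upper triangle above the diagonal marks "future" positions
    mask = torch.triu(torch.ones(size, size), diagonal=1)
    return mask.float().masked_fill(mask == 1, float("-inf")).masked_fill(mask == 0, 0.0)


print(generate_square_subsequent_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])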

src/lightning/pytorch/trainer/connectors/data_connector.py

Lines changed: 8 additions & 0 deletions
@@ -244,15 +244,23 @@ def _get_distributed_sampler(
 
 
 def _resolve_overfit_batches(combined_loader: CombinedLoader, mode: RunningStage) -> None:
+    """Resolve overfit batches by disabling shuffling.
+
+    When overfit_batches > 0, this function ensures that sequential sampling is used without shuffling for consistent
+    batches across epochs. Training and validation use different sets of data.
+
+    """
     all_have_sequential_sampler = all(
         isinstance(dl.sampler, SequentialSampler) for dl in combined_loader.flattened if hasattr(dl, "sampler")
     )
     if all_have_sequential_sampler:
         return
+
     rank_zero_warn(
         f"You requested to overfit but enabled {mode.dataloader_prefix} dataloader shuffling."
         f" We are turning off the {mode.dataloader_prefix} dataloader shuffling for you."
     )
+
     updated = [
         _update_dataloader(dl, sampler=SequentialSampler(dl.dataset), mode=mode) if hasattr(dl, "dataset") else dl
         for dl in combined_loader.flattened
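A small illustrative sketch (an assumption about usage, not the connector's actual call path) of the effect described in the new docstring: a shuffling dataloader is rebuilt around a `SequentialSampler`, so every epoch yields the same batches in the same order:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

dataset = TensorDataset(torch.arange(8).float())
shuffled = DataLoader(dataset, batch_size=2, shuffle=True)  # what the user configured
sequential = DataLoader(dataset, batch_size=2, sampler=SequentialSampler(dataset))  # what overfitting enforces

print([batch[0].tolist() for batch in sequential])
# [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0]]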

tests/tests_pytorch/callbacks/test_finetuning_callback.py

Lines changed: 1 addition & 1 deletion
@@ -109,8 +109,8 @@ def configure_optimizers(self):
     model.validation_step = None
     callback = TestBackboneFinetuningWarningCallback(unfreeze_backbone_at_epoch=3, verbose=False)
 
+    trainer = Trainer(limit_train_batches=1, default_root_dir=tmp_path, callbacks=[callback, chk], max_epochs=2)
     with pytest.warns(UserWarning, match="Did you init your optimizer in"):
-        trainer = Trainer(limit_train_batches=1, default_root_dir=tmp_path, callbacks=[callback, chk], max_epochs=2)
         trainer.fit(model)
 
     assert model.backbone.has_been_used

tests/tests_pytorch/checkpointing/test_model_checkpoint.py

Lines changed: 27 additions & 0 deletions
@@ -2086,3 +2086,30 @@ def val_dataloader(self) -> DataLoader:
     trainer_kwargs["max_epochs"] = 4
     trainer = Trainer(**trainer_kwargs, callbacks=ModelCheckpoint(**mc_kwargs))
     trainer.fit(model, ckpt_path=checkpoint_path)
+
+
+def test_save_last_without_save_on_train_epoch_and_without_val(tmp_path):
+    """Test that save_last=True works correctly when save_on_train_epoch_end=False in a model without validation."""
+
+    # Remove validation methods to test the edge case
+    model = BoringModel()
+    model.validation_step = None
+    model.val_dataloader = None
+
+    checkpoint_callback = ModelCheckpoint(
+        dirpath=tmp_path,
+        save_last=True,
+        save_on_train_epoch_end=False,
+    )
+
+    trainer = Trainer(
+        max_epochs=2,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_progress_bar=False,
+    )
+
+    trainer.fit(model)
+
+    # save_last=True should always save last.ckpt
+    assert (tmp_path / "last.ckpt").exists()

tests/tests_pytorch/conftest.py

Lines changed: 6 additions & 0 deletions
@@ -95,6 +95,12 @@ def restore_env_variables():
         "TF_GRPC_DEFAULT_OPTIONS",
         "XLA_FLAGS",
         "TORCHINDUCTOR_CACHE_DIR",  # leaked by torch.compile
+        # TensorFlow and TPU related variables
+        "TF2_BEHAVIOR",
+        "TPU_ML_PLATFORM",
+        "TPU_ML_PLATFORM_VERSION",
+        "LD_LIBRARY_PATH",
+        "ENABLE_RUNTIME_UPTIME_TELEMETRY",
     }
     leaked_vars.difference_update(allowlist)
     assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"
