
Commit c0d66b7

Merge branch 'master' into uv-for-pytorch-tests

2 parents 1489ff7 + 4824cc1

File tree

10 files changed: +192 -27 lines

.github/workflows/ci-tests-fabric.yml

Lines changed: 2 additions & 2 deletions

@@ -79,7 +79,7 @@ jobs:
         run: pip install -q -r .actions/requirements.txt

       - name: Set min. dependencies
-        if: ${{ matrix.requires == 'oldest' }}
+        if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/fabric
           pip install -U "lightning-utilities[cli]"

@@ -88,7 +88,7 @@ jobs:
           pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
-        if: ${{ matrix.requires != 'oldest' }}
+        if: ${{ matrix.config.requires != 'oldest' }}
         run: |
           pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py

.github/workflows/ci-tests-pytorch.yml

Lines changed: 2 additions & 2 deletions

@@ -93,7 +93,7 @@ jobs:
         run: uv pip install -q -r .actions/requirements.txt

       - name: Set min. dependencies
-        if: ${{ matrix.requires == 'oldest' }}
+        if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/pytorch
           uv pip install -U "lightning-utilities[cli]"

@@ -102,7 +102,7 @@ jobs:
           uv pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
-        if: ${{ matrix.requires != 'oldest' }}
+        if: ${{ matrix.config.requires != 'oldest' }}
         run: |
           uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
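
Both workflow edits make the same fix: the `if:` guards still referenced `matrix.requires`, but the job matrix evidently nests each entry's options under a `config` key, so the conditions become `matrix.config.requires`. The only substantive difference between the two files is that the pytorch workflow has already moved to `uv pip` (hence the branch name), while the fabric workflow still calls pip directly.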

docs/source-pytorch/common/hooks.rst

Lines changed: 18 additions & 16 deletions

@@ -143,23 +143,24 @@ with the source of each hook indicated:
     │   │   │   ├── [LightningModule]
     │   │   │   └── [Strategy]
     │   │   │
-    │   │   ├── on_before_zero_grad()
-    │   │   │   ├── [Callbacks]
-    │   │   │   └── [LightningModule]
-    │   │   │
     │   │   ├── [Forward Pass - training_step()]
     │   │   │   └── [Strategy only]
     │   │   │
-    │   │   ├── on_before_backward()
+    │   │   ├── on_before_zero_grad()
     │   │   │   ├── [Callbacks]
     │   │   │   └── [LightningModule]
     │   │   │
-    │   │   ├── [Backward Pass]
-    │   │   │   └── [Strategy only]
+    │   │   ├── optimizer_zero_grad()
+    │   │   │   └── [LightningModule only - optimizer_zero_grad()]
     │   │   │
-    │   │   ├── on_after_backward()
-    │   │   │   ├── [Callbacks]
-    │   │   │   └── [LightningModule]
+    │   │   ├── [Backward Pass - Strategy.backward()]
+    │   │   │   ├── on_before_backward()
+    │   │   │   │   ├── [Callbacks]
+    │   │   │   │   └── [LightningModule]
+    │   │   │   ├── LightningModule.backward()
+    │   │   │   └── on_after_backward()
+    │   │   │       ├── [Callbacks]
+    │   │   │       └── [LightningModule]
     │   │   │
     │   │   ├── on_before_optimizer_step()
     │   │   │   ├── [Callbacks]

@@ -212,13 +213,14 @@
     │   ├── [LightningModule]
     │   └── [Strategy]

-    ├── on_fit_end()
-    │   ├── [Callbacks]
-    │   ├── [LightningModule]
-    │   └── [Strategy]
-
     └── teardown(stage="fit")
-        └── [Callbacks only]
+        ├── [Strategy]
+        ├── on_fit_end()
+        │   ├── [Callbacks]
+        │   └── [LightningModule]
+        ├── [LightningDataModule]
+        ├── [Callbacks]
+        └── [LightningModule]

 ***********************
 Testing Loop Hook Order
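
The revised tree corrects the documented hook order: `on_before_zero_grad()` and `optimizer_zero_grad()` now appear after the forward pass, and `on_before_backward()`, `LightningModule.backward()`, and `on_after_backward()` are nested inside the backward pass driven by `Strategy.backward()`. A minimal sketch to watch that order at runtime (the model and print-based tracing are illustrative, not part of the diff):

```python
import lightning.pytorch as pl
from lightning.pytorch.demos.boring_classes import BoringModel


class HookOrderModel(BoringModel):
    """Print one line per hook to observe the documented call order."""

    def on_before_zero_grad(self, optimizer):
        print("on_before_zero_grad")

    def on_before_backward(self, loss):
        print("on_before_backward")

    def backward(self, loss, *args, **kwargs):
        print("LightningModule.backward")
        super().backward(loss, *args, **kwargs)

    def on_after_backward(self):
        print("on_after_backward")


trainer = pl.Trainer(max_steps=2, logger=False, enable_checkpointing=False, enable_progress_bar=False)
trainer.fit(HookOrderModel())
```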

src/lightning/pytorch/CHANGELOG.md

Lines changed: 7 additions & 2 deletions

@@ -16,6 +16,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added Torch-Tensorrt integration with `LightningModule` ([#20808](https://github.com/Lightning-AI/pytorch-lightning/pull/20808))


+- Added `PossibleUserWarning` that is raised if modules are in eval mode when training starts ([#21146](https://github.com/Lightning-AI/pytorch-lightning/pull/21146))
+
 ### Changed

 - Default to `RichProgressBar` and `RichModelSummary` if the rich package is available. Fallback to TQDMProgressBar and ModelSummary otherwise. ([#9580](https://github.com/Lightning-AI/pytorch-lightning/pull/9580))

@@ -28,14 +30,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

+- Fixed `LightningCLI` not using `ckpt_path` hyperparameters to instantiate classes ([#21116](https://github.com/Lightning-AI/pytorch-lightning/pull/21116))
+
+
 - Fixed callbacks by defer step/time-triggered `ModelCheckpoint` saves until validation metrics are available ([#21106](https://github.com/Lightning-AI/pytorch-lightning/pull/21106))


 - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105))


----
+- Fixed `TQDMProgressBar` not resetting correctly when using both a finite and iterable dataloader ([#21147](https://github.com/Lightning-AI/pytorch-lightning/pull/21147))

+---

 ## [2.5.4] - 2025-08-29

@@ -47,7 +53,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed misalignment column while using rich model summary in `DeepSpeedstrategy` ([#21100](https://github.com/Lightning-AI/pytorch-lightning/pull/21100))
 - Fixed `RichProgressBar` crashing when sanity checking using val dataloader with 0 len ([#21108](https://github.com/Lightning-AI/pytorch-lightning/pull/21108))

-
 ## [2.5.3] - 2025-08-13

 ### Changed

src/lightning/pytorch/callbacks/progress/tqdm_progress.py

Lines changed: 12 additions & 4 deletions

@@ -265,7 +265,9 @@ def on_train_start(self, *_: Any) -> None:
     def on_train_epoch_start(self, trainer: "pl.Trainer", *_: Any) -> None:
         if self._leave:
             self.train_progress_bar = self.init_train_tqdm()
-        self.train_progress_bar.reset(convert_inf(self.total_train_batches))
+        total = convert_inf(self.total_train_batches)
+        self.train_progress_bar.reset()
+        self.train_progress_bar.total = total
         self.train_progress_bar.initial = 0
         self.train_progress_bar.set_description(f"Epoch {trainer.current_epoch}")

@@ -306,7 +308,9 @@ def on_validation_batch_start(
         if not self.has_dataloader_changed(dataloader_idx):
             return

-        self.val_progress_bar.reset(convert_inf(self.total_val_batches_current_dataloader))
+        total = convert_inf(self.total_val_batches_current_dataloader)
+        self.val_progress_bar.reset()
+        self.val_progress_bar.total = total
         self.val_progress_bar.initial = 0
         desc = self.sanity_check_description if trainer.sanity_checking else self.validation_description
         self.val_progress_bar.set_description(f"{desc} DataLoader {dataloader_idx}")

@@ -348,7 +352,9 @@ def on_test_batch_start(
         if not self.has_dataloader_changed(dataloader_idx):
             return

-        self.test_progress_bar.reset(convert_inf(self.total_test_batches_current_dataloader))
+        total = convert_inf(self.total_test_batches_current_dataloader)
+        self.test_progress_bar.reset()
+        self.test_progress_bar.total = total
         self.test_progress_bar.initial = 0
         self.test_progress_bar.set_description(f"{self.test_description} DataLoader {dataloader_idx}")

@@ -387,7 +393,9 @@ def on_predict_batch_start(
         if not self.has_dataloader_changed(dataloader_idx):
             return

-        self.predict_progress_bar.reset(convert_inf(self.total_predict_batches_current_dataloader))
+        total = convert_inf(self.total_predict_batches_current_dataloader)
+        self.predict_progress_bar.reset()
+        self.predict_progress_bar.total = total
         self.predict_progress_bar.initial = 0
         self.predict_progress_bar.set_description(f"{self.predict_description} DataLoader {dataloader_idx}")
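
Why split `reset(total=...)` into two steps: `tqdm.reset()` restarts the counter, and assigning `total` afterwards makes the new length explicit even when the bar previously ran with an unknown total (an iterable dataloader). A standalone sketch of the pattern with plain tqdm, assuming only the documented `reset()` method and the writable `total` attribute:

```python
from tqdm import tqdm

# Epoch 1: bar driven by an iterable dataloader, so the length is unknown.
bar = tqdm(total=None)
bar.update(3)

# Epoch 2: the pattern from the diff -- reset the counter first, then set the
# finite total explicitly so the bar renders a proper percentage again.
bar.reset()
bar.total = 5
bar.initial = 0
bar.update(5)
bar.close()
```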

src/lightning/pytorch/cli.py

Lines changed: 20 additions & 0 deletions

@@ -16,6 +16,7 @@
 import sys
 from collections.abc import Iterable
 from functools import partial, update_wrapper
+from pathlib import Path
 from types import MethodType
 from typing import Any, Callable, Optional, TypeVar, Union

@@ -397,6 +398,7 @@ def __init__(
         main_kwargs, subparser_kwargs = self._setup_parser_kwargs(self.parser_kwargs)
         self.setup_parser(run, main_kwargs, subparser_kwargs)
         self.parse_arguments(self.parser, args)
+        self._parse_ckpt_path()

         self.subcommand = self.config["subcommand"] if run else None

@@ -551,6 +553,24 @@ def parse_arguments(self, parser: LightningArgumentParser, args: ArgsType) -> None:
         else:
             self.config = parser.parse_args(args)

+    def _parse_ckpt_path(self) -> None:
+        """If a checkpoint path is given, parse the hyperparameters from the checkpoint and update the config."""
+        if not self.config.get("subcommand"):
+            return
+        ckpt_path = self.config[self.config.subcommand].get("ckpt_path")
+        if ckpt_path and Path(ckpt_path).is_file():
+            ckpt = torch.load(ckpt_path, weights_only=True, map_location="cpu")
+            hparams = ckpt.get("hyper_parameters", {})
+            hparams.pop("_instantiator", None)
+            if not hparams:
+                return
+            hparams = {self.config.subcommand: {"model": hparams}}
+            try:
+                self.config = self.parser.parse_object(hparams, self.config)
+            except SystemExit:
+                sys.stderr.write("Parsing of ckpt_path hyperparameters failed!\n")
+                raise
+
     def _dump_config(self) -> None:
         if hasattr(self, "config_dump"):
             return
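
The new `_parse_ckpt_path()` step reads the `hyper_parameters` dict out of the checkpoint and feeds it back through the parser under `<subcommand>.model`, so the model is re-instantiated with the hyperparameters it was trained with. A standalone sketch of the checkpoint-reading half, mirroring the diff (the helper name is ours, not part of the API):

```python
from pathlib import Path

import torch


def hparams_from_checkpoint(ckpt_path: str) -> dict:
    """Return the user hyperparameters stored in a Lightning checkpoint, if any."""
    if not Path(ckpt_path).is_file():
        return {}
    ckpt = torch.load(ckpt_path, weights_only=True, map_location="cpu")
    hparams = ckpt.get("hyper_parameters", {})
    hparams.pop("_instantiator", None)  # internal bookkeeping key, not a user hparam
    return hparams
```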

src/lightning/pytorch/loops/fit_loop.py

Lines changed: 16 additions & 0 deletions

@@ -414,6 +414,9 @@ def on_run_start(self) -> None:
         self.epoch_loop.val_loop.setup_data()
         trainer.training = True

+        # Check for modules in eval mode at training start
+        self._warn_if_modules_in_eval_mode()
+
         call._call_callback_hooks(trainer, "on_train_start")
         call._call_lightning_module_hook(trainer, "on_train_start")
         call._call_strategy_hook(trainer, "on_train_start")

@@ -515,6 +518,19 @@ def on_load_checkpoint(self, state_dict: dict) -> None:
         self._combined_loader_states_to_load = state_dict.get("combined_loader", [])
         super().on_load_checkpoint(state_dict)

+    def _warn_if_modules_in_eval_mode(self) -> None:
+        """Warn if any modules are in eval mode at the start of training."""
+        model = self.trainer.lightning_module
+        eval_modules = [name for name, module in model.named_modules() if not module.training]
+
+        if eval_modules:
+            rank_zero_warn(
+                f"Found {len(eval_modules)} module(s) in eval mode at the start of training."
+                " This may lead to unexpected behavior during training. If this is intentional,"
+                " you can ignore this warning.",
+                category=PossibleUserWarning,
+            )
+
     def _should_accumulate(self) -> bool:
         """Whether the gradients should be accumulated."""
         return self.epoch_loop._should_accumulate()
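
A minimal sketch of the situation the new warning targets, modeled on the test added below: a submodule left in eval mode when `fit()` starts.

```python
import torch
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

model = BoringModel()
model.frozen_head = torch.nn.Linear(32, 16)
model.frozen_head.eval()  # named_modules() now reports a module with training=False

# fit() should emit a PossibleUserWarning along the lines of:
# "Found 1 module(s) in eval mode at the start of training. ..."
trainer = Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
trainer.fit(model)
```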

tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py

Lines changed: 47 additions & 0 deletions

@@ -812,3 +812,50 @@ def test_tqdm_leave(leave, tmp_path):
     )
     trainer.fit(model)
     assert pbar.init_train_tqdm.call_count == (4 if leave else 1)
+
+
+@patch("lightning.pytorch.trainer.connectors.callback_connector._RICH_AVAILABLE", False)
+def test_tqdm_progress_bar_reset_behavior(tmp_path):
+    """Test that progress bars call reset() without parameters and set total separately."""
+    model = BoringModel()
+
+    class ResetTrackingTqdm(MockTqdm):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self.reset_calls_with_params = []
+
+        def reset(self, total=None):
+            self.reset_calls_with_params.append(total)
+            super().reset(total)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        limit_val_batches=2,
+        max_epochs=1,
+        logger=False,
+        enable_checkpointing=False,
+    )
+
+    pbar = trainer.progress_bar_callback
+
+    with mock.patch("lightning.pytorch.callbacks.progress.tqdm_progress.Tqdm", ResetTrackingTqdm):
+        trainer.fit(model)
+
+    train_bar = pbar.train_progress_bar
+    assert None in train_bar.reset_calls_with_params, (
+        f"train reset() should be called without parameters, got calls: {train_bar.reset_calls_with_params}"
+    )
+    # Verify that total was set separately to the expected value
+    assert 2 in train_bar.total_values, (
+        f"train total should be set to 2 after reset(), got total_values: {train_bar.total_values}"
+    )
+    # Verify that validation progress bar reset() was called without parameters
+    val_bar = pbar.val_progress_bar
+    assert None in val_bar.reset_calls_with_params, (
+        f"validation reset() should be called without parameters, got calls: {val_bar.reset_calls_with_params}"
+    )
+    # Verify that total was set separately to the expected value
+    assert 2 in val_bar.total_values, (
+        f"validation total should be set to 2 after reset(), got total_values: {val_bar.total_values}"
+    )

tests/tests_pytorch/loops/test_training_loop.py

Lines changed: 28 additions & 0 deletions

@@ -13,12 +13,14 @@
 # limitations under the License.
 import itertools
 import logging
+import warnings
 from unittest.mock import Mock

 import pytest
 import torch
 from torch.utils.data import DataLoader

+from lightning.fabric.utilities.warnings import PossibleUserWarning
 from lightning.pytorch import Trainer, seed_everything
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.loops import _FitLoop

@@ -277,3 +279,29 @@ def __iter__(self):

     # assert progress bar callback uses correct total steps
     assert pbar.train_progress_bar.total == max_steps
+
+
+@pytest.mark.parametrize("warn", [True, False])
+def test_eval_mode_warning(tmp_path, warn):
+    """Test that a warning is raised if any module is in eval mode at the start of training."""
+    model = BoringModel()
+    if warn:
+        model.some_eval_module = torch.nn.Linear(32, 16)
+        model.some_eval_module.eval()
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+    )
+
+    if warn:
+        with pytest.warns(PossibleUserWarning):
+            trainer.fit(model)
+    else:
+        with warnings.catch_warnings(record=True) as warning_list:
+            warnings.simplefilter("always")
+            trainer.fit(model)
+        eval_warnings = [
+            w for w in warning_list if issubclass(w.category, PossibleUserWarning) and "eval mode" in str(w.message)
+        ]
+        assert len(eval_warnings) == 0, "Expected no eval mode warnings"
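
If modules are intentionally kept in eval mode (a frozen encoder, say), the warning can be silenced with a standard warnings filter; a sketch, not an officially documented recipe:

```python
import warnings

from lightning.fabric.utilities.warnings import PossibleUserWarning

# Matches the message emitted by _warn_if_modules_in_eval_mode() above.
warnings.filterwarnings(
    "ignore",
    message=".*eval mode at the start of training.*",
    category=PossibleUserWarning,
)
```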

tests/tests_pytorch/test_cli.py

Lines changed: 40 additions & 1 deletion

@@ -17,7 +17,7 @@
 import operator
 import os
 import sys
-from contextlib import ExitStack, contextmanager, redirect_stdout
+from contextlib import ExitStack, contextmanager, redirect_stderr, redirect_stdout
 from io import StringIO
 from pathlib import Path
 from typing import Callable, Optional, Union

@@ -487,6 +487,45 @@ def test_lightning_cli_print_config():
     assert outval["ckpt_path"] is None


+class BoringCkptPathModel(BoringModel):
+    def __init__(self, out_dim: int = 2, hidden_dim: int = 2) -> None:
+        super().__init__()
+        self.save_hyperparameters()
+        self.layer = torch.nn.Linear(32, out_dim)
+
+
+def test_lightning_cli_ckpt_path_argument_hparams(cleandir):
+    class CkptPathCLI(LightningCLI):
+        def add_arguments_to_parser(self, parser):
+            parser.link_arguments("model.out_dim", "model.hidden_dim", compute_fn=lambda x: x * 2)
+
+    cli_args = ["fit", "--model.out_dim=3", "--trainer.max_epochs=1"]
+    with mock.patch("sys.argv", ["any.py"] + cli_args):
+        cli = CkptPathCLI(BoringCkptPathModel)
+
+    assert cli.config.fit.model.out_dim == 3
+    assert cli.config.fit.model.hidden_dim == 6
+    hparams_path = Path(cli.trainer.log_dir) / "hparams.yaml"
+    assert hparams_path.is_file()
+    hparams = yaml.safe_load(hparams_path.read_text())
+    assert hparams["out_dim"] == 3
+    assert hparams["hidden_dim"] == 6
+
+    checkpoint_path = next(Path(cli.trainer.log_dir, "checkpoints").glob("*.ckpt"))
+    cli_args = ["predict", f"--ckpt_path={checkpoint_path}"]
+    with mock.patch("sys.argv", ["any.py"] + cli_args):
+        cli = CkptPathCLI(BoringCkptPathModel)
+
+    assert cli.config.predict.model.out_dim == 3
+    assert cli.config.predict.model.hidden_dim == 6
+    assert cli.config_init.predict.model.layer.out_features == 3
+
+    err = StringIO()
+    with mock.patch("sys.argv", ["any.py"] + cli_args), redirect_stderr(err), pytest.raises(SystemExit):
+        cli = LightningCLI(BoringModel)
+    assert "Parsing of ckpt_path hyperparameters failed" in err.getvalue()
+
+
 def test_lightning_cli_submodules(cleandir):
     class MainModule(BoringModel):
         def __init__(self, submodule1: LightningModule, submodule2: LightningModule, main_param: int = 1):