
Commit da1b0d2

Merge branch 'master' into ci/lit
2 parents: 3c736e4 + 791753b

File tree: 17 files changed, 202 additions & 31 deletions


.github/CONTRIBUTING.md

Lines changed: 18 additions & 4 deletions
@@ -113,14 +113,28 @@ ______________________________________________________________________

 To set up a local development environment, we recommend using `uv`, which can be installed following their [instructions](https://docs.astral.sh/uv/getting-started/installation/).

-Once `uv` has been installed, begin by cloning the repository:
+Once `uv` has been installed, begin by cloning the forked repository:

 ```bash
-git clone https://github.com/Lightning-AI/lightning.git
-cd lightning
+git clone https://github.com/{YOUR_GITHUB_USERNAME}/pytorch-lightning.git
+cd pytorch-lightning
 ```

-Once in root level of the repository, create a new virtual environment and install the project dependencies.
+> If you're using [Lightning Studio](https://lightning.ai) or already have your `uv venv` activated, you can quickly set up the project by running:
+
+```bash
+make setup
+```
+
+This will:
+
+- Install all required dependencies.
+- Perform an editable install of the `pytorch-lightning` project.
+- Install and configure `pre-commit`.
+
+#### Manual Setup (Optional)
+
+If you prefer more fine-grained control over the dependencies, you can set up the environment manually:

 ```bash
 uv venv

Makefile

Lines changed: 18 additions & 1 deletion
@@ -1,4 +1,4 @@
-.PHONY: test clean docs
+.PHONY: test clean docs setup

 # to imitate SLURM set only single node
 export SLURM_LOCALID=0
@@ -7,6 +7,23 @@ export SPHINX_MOCK_REQUIREMENTS=1
 # install only Lightning Trainer packages
 export PACKAGE_NAME=pytorch

+setup:
+	uv pip install -r requirements.txt \
+		-r requirements/pytorch/base.txt \
+		-r requirements/pytorch/test.txt \
+		-r requirements/pytorch/extra.txt \
+		-r requirements/pytorch/strategies.txt \
+		-r requirements/fabric/base.txt \
+		-r requirements/fabric/test.txt \
+		-r requirements/fabric/strategies.txt \
+		-r requirements/typing.txt \
+		-e ".[all]" \
+		pre-commit
+	pre-commit install
+	@echo "-----------------------------"
+	@echo "✅ Environment setup complete. Ready to Contribute ⚡️!"
+
+
 clean:
 	# clean all temp runs
 	rm -rf $(shell find . -name "mlruns")
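As a quick way to confirm the `make setup` target completed, a minimal sanity check can be run from the activated environment. This is only a sketch and assumes the editable install exposes both the `lightning` and `pytorch_lightning` import paths:

```python
# Minimal post-setup sanity check (assumption: run inside the activated `uv` venv
# after `make setup`; both import paths are provided by the editable install).
import lightning
import pytorch_lightning

print("lightning:", lightning.__version__)
print("pytorch_lightning:", pytorch_lightning.__version__)
```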

docs/source-pytorch/common/checkpointing_basic.rst

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ The LightningModule also has access to the Hyperparameters
 .. code-block:: python

     model = MyLightningModule.load_from_checkpoint("/path/to/checkpoint.ckpt")
-    print(model.learning_rate)
+    print(model.hparams.learning_rate)

 ----
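The corrected line reflects how restored hyperparameters are exposed: arguments captured with `self.save_hyperparameters()` are reloaded onto the module's `hparams` namespace rather than as top-level attributes. A minimal sketch of the pattern the docs snippet assumes (module definition and checkpoint path are illustrative):

```python
import torch
import lightning.pytorch as L


class MyLightningModule(L.LightningModule):
    def __init__(self, learning_rate: float = 1e-3):
        super().__init__()
        # Stores the init arguments in the checkpoint so they can be restored later.
        self.save_hyperparameters()
        self.layer = torch.nn.Linear(4, 1)


# After loading, the saved arguments live under `model.hparams`.
model = MyLightningModule.load_from_checkpoint("/path/to/checkpoint.ckpt")
print(model.hparams.learning_rate)
```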

examples/fabric/image_classifier/train_fabric.py

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ def run(hparams):
     # When using distributed training, use `fabric.save`
     # to ensure the current process is allowed to save a checkpoint
     if hparams.save_model:
-        fabric.save(model.state_dict(), "mnist_cnn.pt")
+        fabric.save(path="mnist_cnn.pt", state=model.state_dict())


 if __name__ == "__main__":
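`Fabric.save` takes the checkpoint path as its first parameter and the state as its second, so the keyword form above also corrects the reversed positional arguments in the old call. A hedged sketch of the save/restore round-trip with a raw `state_dict` (toy model and file name are illustrative; the load-back assumes `Fabric.load` returns the checkpoint contents when no target state is passed):

```python
import torch
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cpu", devices=1)
fabric.launch()

model = torch.nn.Linear(4, 2)

# Collective call: every process must reach it; only the responsible rank writes.
fabric.save(path="mnist_cnn.pt", state=model.state_dict())

# Assumption: with no `state` argument, `Fabric.load` returns what was saved.
state_dict = fabric.load("mnist_cnn.pt")
model.load_state_dict(state_dict)
```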

examples/fabric/kfold_cv/train_fabric.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def run(hparams):
     # When using distributed training, use `fabric.save`
     # to ensure the current process is allowed to save a checkpoint
     if hparams.save_model:
-        fabric.save(model.state_dict(), "mnist_cnn.pt")
+        fabric.save(path="mnist_cnn.pt", state=model.state_dict())


 if __name__ == "__main__":

examples/fabric/tensor_parallel/train.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ def train():
     # See `fabric consolidate --help` if you need to convert the checkpoint to a single file
     fabric.print("Saving a (distributed) checkpoint ...")
     state = {"model": model, "optimizer": optimizer, "iteration": i}
-    fabric.save("checkpoint.pt", state)
+    fabric.save(path="checkpoint.pt", state=state)

     fabric.print("Training successfully completed!")
     fabric.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
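In this example the state holds live objects (the model and optimizer), which Fabric serializes via their `state_dict()`s. A hedged sketch of restoring such a checkpoint in place, using a toy single-device setup in place of the tensor-parallel one from the example:

```python
import torch
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cpu", devices=1)
fabric.launch()

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
model, optimizer = fabric.setup(model, optimizer)

state = {"model": model, "optimizer": optimizer, "iteration": 10}
fabric.save(path="checkpoint.pt", state=state)

# On restore, module and optimizer weights are loaded in place; plain values such as
# "iteration" are written back into the passed-in dict (assumption based on Fabric's
# documented in-place loading of the given state).
restore_state = {"model": model, "optimizer": optimizer, "iteration": 0}
fabric.load(path="checkpoint.pt", state=restore_state)
print("resuming from iteration", restore_state["iteration"])
```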

src/lightning/fabric/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 -

+### Changed
+
+- Raise a `ValueError` when the seed is out of bounds or cannot be cast to an int ([#21029](https://github.com/Lightning-AI/pytorch-lightning/pull/21029))
+

 ---

src/lightning/fabric/utilities/seed.py

Lines changed: 4 additions & 5 deletions
@@ -27,7 +27,8 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose:
     Args:
         seed: the integer value seed for global random state in Lightning.
             If ``None``, it will read the seed from ``PL_GLOBAL_SEED`` env variable. If ``None`` and the
-            ``PL_GLOBAL_SEED`` env variable is not set, then the seed defaults to 0.
+            ``PL_GLOBAL_SEED`` env variable is not set, then the seed defaults to 0. If seed is
+            not in bounds or cannot be cast to int, a ValueError is raised.
         workers: if set to ``True``, will properly configure all dataloaders passed to the
             Trainer with a ``worker_init_fn``. If the user already provides such a function
             for their dataloaders, setting this argument will have no influence. See also:
@@ -44,14 +45,12 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose:
            try:
                seed = int(env_seed)
            except ValueError:
-                seed = 0
-                rank_zero_warn(f"Invalid seed found: {repr(env_seed)}, seed set to {seed}")
+                raise ValueError(f"Invalid seed specified via PL_GLOBAL_SEED: {repr(env_seed)}")
    elif not isinstance(seed, int):
        seed = int(seed)

    if not (min_seed_value <= seed <= max_seed_value):
-        rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")
-        seed = 0
+        raise ValueError(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")

    if verbose:
        log.info(rank_prefixed_message(f"Seed set to {seed}", _get_rank()))
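With this change, an unusable seed now fails fast instead of being silently replaced with 0. A short sketch of the new contract (the error messages are those raised in the diff above):

```python
import os

from lightning.fabric.utilities.seed import seed_everything

seed_everything(42)  # valid: seeds Python, NumPy, and torch global RNGs

try:
    seed_everything(2**64)  # outside the uint32 range accepted by NumPy
except ValueError as err:
    print("rejected:", err)

os.environ["PL_GLOBAL_SEED"] = "not-a-number"
try:
    seed_everything()  # falls back to PL_GLOBAL_SEED, which cannot be cast to int
except ValueError as err:
    print("rejected:", err)
```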

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -10,7 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Added

--
+- Added support for general mappings being returned from `training_step` when using manual optimization ([#21011](https://github.com/Lightning-AI/pytorch-lightning/pull/21011))
+


 ### Changed
@@ -26,6 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed

 - fix progress bar console clearing for Rich `14.1+` ([#21016](https://github.com/Lightning-AI/pytorch-lightning/pull/21016))
+- fix `AdvancedProfiler` to handle nested profiling actions for Python 3.12+ ([#20809](https://github.com/Lightning-AI/pytorch-lightning/pull/20809))


 ---
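The new `Added` entry above refers to manual optimization: `training_step` may now return any `Mapping`, not only a plain `dict`. A hedged sketch of what that enables (the model, data, and the `MappingProxyType` return value are illustrative):

```python
from types import MappingProxyType

import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning.pytorch as L


class ManualOptModule(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # manual optimization
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.layer(x), y)
        opt.zero_grad()
        self.manual_backward(loss)
        opt.step()
        # Any Mapping is accepted here, not only a plain dict (per #21011).
        return MappingProxyType({"loss": loss.detach()})

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":
    dataset = TensorDataset(torch.randn(32, 8), torch.randn(32, 1))
    trainer = L.Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
    trainer.fit(ManualOptModule(), DataLoader(dataset, batch_size=8))
```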

src/lightning/pytorch/loops/loop.py

Lines changed: 8 additions & 0 deletions
@@ -23,6 +23,7 @@ class _Loop:
     def __init__(self, trainer: "pl.Trainer") -> None:
         self._restarting = False
         self._loaded_from_state_dict = False
+        self._resuming_from_checkpoint = False
         self.trainer = trainer

     @property
@@ -38,6 +39,11 @@ def restarting(self, restarting: bool) -> None:
             if isinstance(loop, _Loop):
                 loop.restarting = restarting

+    @property
+    def is_resuming(self) -> bool:
+        """Indicates whether training is being resumed from a checkpoint."""
+        return self._resuming_from_checkpoint
+
     def reset_restart_stage(self) -> None:
         pass

@@ -87,6 +93,7 @@ def load_state_dict(
                 v.load_state_dict(state_dict.copy(), prefix + k + ".")
         self.restarting = True
         self._loaded_from_state_dict = True
+        self._resuming_from_checkpoint = True

     def _load_from_state_dict(self, state_dict: dict, prefix: str) -> None:
         for k, v in self.__dict__.items():
@@ -102,4 +109,5 @@ def _load_from_state_dict(self, state_dict: dict, prefix: str) -> None:
     def on_iteration_done(self) -> None:
         self._restarting = False
         self._loaded_from_state_dict = False
+        self._resuming_from_checkpoint = False
         self.reset_restart_stage()
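The new `is_resuming` property gives user code a way to distinguish a fresh run from one resumed via `ckpt_path`. A hedged sketch of consuming it from a callback (assumes the property is reachable as `trainer.fit_loop.is_resuming`, per the diff above):

```python
import lightning.pytorch as L


class ResumeAwareCallback(L.Callback):
    def on_train_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        # True only when loop state was restored from a checkpoint and the first
        # iteration after the restore has not yet completed.
        if trainer.fit_loop.is_resuming:
            print("Resuming from a checkpoint; skipping one-time warm-up work.")
        else:
            print("Starting a fresh training run.")
```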
