
Commit ee7c4bb

Merge branch 'master' into feature/9580-rich-defaults

2 parents: a4910f8 + ce038e8

File tree

18 files changed (+179, -45 lines)

.github/CONTRIBUTING.md

Lines changed: 18 additions & 4 deletions
````diff
@@ -113,14 +113,28 @@ ______________________________________________________________________
 
 To set up a local development environment, we recommend using `uv`, which can be installed following their [instructions](https://docs.astral.sh/uv/getting-started/installation/).
 
-Once `uv` has been installed, begin by cloning the repository:
+Once `uv` has been installed, begin by cloning the forked repository:
 
 ```bash
-git clone https://github.com/Lightning-AI/lightning.git
-cd lightning
+git clone https://github.com/{YOUR_GITHUB_USERNAME}/pytorch-lightning.git
+cd pytorch-lightning
 ```
 
-Once in root level of the repository, create a new virtual environment and install the project dependencies.
+> If you're using [Lightning Studio](https://lightning.ai) or already have your `uv venv` activated, you can quickly set up the project by running:
+
+```bash
+make setup
+```
+
+This will:
+
+- Install all required dependencies.
+- Perform an editable install of the `pytorch-lightning` project.
+- Install and configure `pre-commit`.
+
+#### Manual Setup (Optional)
+
+If you prefer more fine-grained control over the dependencies, you can set up the environment manually:
 
 ```bash
 uv venv
````

.github/checkgroup.yml

Lines changed: 2 additions & 1 deletion
````diff
@@ -135,7 +135,8 @@ subprojects:
 - "build-pl (3.11, 2.4, 12.1.1)"
 - "build-pl (3.12, 2.5, 12.1.1)"
 - "build-pl (3.12, 2.6, 12.4.1)"
-- "build-pl (3.12, 2.7, 12.6.3, true)"
+- "build-pl (3.12, 2.7, 12.6.3)"
+- "build-pl (3.12, 2.8, 12.6.3, true)"
 
 # SECTION: lightning_fabric
````

.github/workflows/docker-build.yml

Lines changed: 7 additions & 2 deletions
````diff
@@ -49,7 +49,8 @@ jobs:
 - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.1.1" }
 - { python_version: "3.12", pytorch_version: "2.5", cuda_version: "12.1.1" }
 - { python_version: "3.12", pytorch_version: "2.6", cuda_version: "12.4.1" }
-- { python_version: "3.12", pytorch_version: "2.7", cuda_version: "12.6.3", latest: "true" }
+- { python_version: "3.12", pytorch_version: "2.7", cuda_version: "12.6.3" }
+- { python_version: "3.12", pytorch_version: "2.8", cuda_version: "12.6.3", latest: "true" }
 steps:
 - uses: actions/checkout@v4
 with:
@@ -97,7 +98,7 @@ jobs:
 # adding dome more images as Thunder mainly using python 3.10,
 # and we need to support integrations as for example LitGPT
 python_version: ["3.10"]
-pytorch_version: ["2.6.0", "2.7.1"]
+pytorch_version: ["2.7.1", "2.8.0"]
 cuda_version: ["12.6.3"]
 include:
 # These are the base images for PL release docker images.
@@ -109,6 +110,7 @@ jobs:
 - { python_version: "3.12", pytorch_version: "2.5.1", cuda_version: "12.1.1" }
 - { python_version: "3.12", pytorch_version: "2.6.0", cuda_version: "12.4.1" }
 - { python_version: "3.12", pytorch_version: "2.7.1", cuda_version: "12.6.3" }
+- { python_version: "3.12", pytorch_version: "2.8.0", cuda_version: "12.6.3" }
 steps:
 - uses: actions/checkout@v4
 - uses: docker/setup-buildx-action@v3
@@ -129,6 +131,7 @@ jobs:
 PYTHON_VERSION=${{ matrix.python_version }}
 PYTORCH_VERSION=${{ matrix.pytorch_version }}
 CUDA_VERSION=${{ matrix.cuda_version }}
+MAKE_FLAGS="-j2"
 file: dockers/base-cuda/Dockerfile
 push: ${{ env.PUSH_NIGHTLY }}
 tags: "pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ env.PT_VERSION }}-cuda${{ matrix.cuda_version }}"
@@ -157,6 +160,8 @@ jobs:
 continue-on-error: true
 uses: docker/build-push-action@v6
 with:
+build-args: |
+PYTORCH_VERSION="25.04"
 file: dockers/nvidia/Dockerfile
 push: false
 timeout-minutes: 55
````

Makefile

Lines changed: 18 additions & 1 deletion
````diff
@@ -1,4 +1,4 @@
-.PHONY: test clean docs
+.PHONY: test clean docs setup
 
 # to imitate SLURM set only single node
 export SLURM_LOCALID=0
@@ -7,6 +7,23 @@ export SPHINX_MOCK_REQUIREMENTS=1
 # install only Lightning Trainer packages
 export PACKAGE_NAME=pytorch
 
+setup:
+	uv pip install -r requirements.txt \
+		-r requirements/pytorch/base.txt \
+		-r requirements/pytorch/test.txt \
+		-r requirements/pytorch/extra.txt \
+		-r requirements/pytorch/strategies.txt \
+		-r requirements/fabric/base.txt \
+		-r requirements/fabric/test.txt \
+		-r requirements/fabric/strategies.txt \
+		-r requirements/typing.txt \
+		-e ".[all]" \
+		pre-commit
+	pre-commit install
+	@echo "-----------------------------"
+	@echo "✅ Environment setup complete. Ready to Contribute ⚡️!"
+
+
 clean:
 	# clean all temp runs
 	rm -rf $(shell find . -name "mlruns")
````

dockers/base-cuda/Dockerfile

Lines changed: 3 additions & 3 deletions
````diff
@@ -19,8 +19,9 @@ ARG CUDA_VERSION=11.7.1
 FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 
 ARG PYTHON_VERSION=3.10
-ARG PYTORCH_VERSION=2.1
+ARG PYTORCH_VERSION=2.8
 ARG MAX_ALLOWED_NCCL=2.22.3
+ARG MAKE_FLAGS="-j$(nproc)"
 
 SHELL ["/bin/bash", "-c"]
 # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -30,8 +31,7 @@ ENV \
     PATH="$PATH:/root/.local/bin" \
     CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
     MKL_THREADING_LAYER="GNU" \
-    # MAKEFLAGS="-j$(nproc)"
-    MAKEFLAGS="-j2"
+    MAKEFLAGS=${MAKE_FLAGS}
 
 RUN \
     CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
````
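The hard-coded `MAKEFLAGS="-j2"` thus becomes an overridable build argument: local builds default to `-j$(nproc)` while the CI workflow above passes `MAKE_FLAGS="-j2"`. A minimal shell sketch of the default-with-override pattern behind `ARG MAKE_FLAGS="-j$(nproc)"` (the `resolve_makeflags` helper is hypothetical, for illustration only):

```shell
# Illustrative only: how the ARG default interacts with a caller override,
# mirroring `ARG MAKE_FLAGS="-j$(nproc)"` -> `ENV MAKEFLAGS=${MAKE_FLAGS}`.
resolve_makeflags() {
    # use the caller-supplied value if given, else one make job per CPU core
    echo "${1:--j$(nproc)}"
}

resolve_makeflags "-j2"   # the override CI passes via --build-arg MAKE_FLAGS="-j2"
resolve_makeflags         # local default: parallelize across all cores
```

At build time, an override along the lines of `docker build --build-arg MAKE_FLAGS="-j4" -f dockers/base-cuda/Dockerfile .` should work, since `--build-arg` replaces the `ARG` default.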

dockers/nvidia/Dockerfile

Lines changed: 1 addition & 1 deletion
````diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG PYTORCH_VERSION=22.09
+ARG PYTORCH_VERSION=24.05
 
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
 FROM nvcr.io/nvidia/pytorch:${PYTORCH_VERSION}-py3
````

dockers/release/Dockerfile

Lines changed: 2 additions & 2 deletions
````diff
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 ARG PYTHON_VERSION=3.10
-ARG PYTORCH_VERSION=2.0
-ARG CUDA_VERSION=11.8.0
+ARG PYTORCH_VERSION=2.8
+ARG CUDA_VERSION=12.6.3
 
 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}
 
````

docs/source-pytorch/advanced/speed.rst

Lines changed: 1 addition & 1 deletion
````diff
@@ -464,7 +464,7 @@ takes a great deal of care to be optimized for this.
 Clear Cache
 ===========
 
-Don't call :func:`torch.cuda.empty_cache` unnecessarily! Every time you call this, ALL your GPUs have to wait to sync.
+Don't call ``torch.cuda.empty_cache`` unnecessarily! Every time you call this, ALL your GPUs have to wait to sync.
 
 Transferring Tensors to Device
 ==============================
````

src/lightning/fabric/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
````diff
@@ -16,6 +16,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 -
 
+### Changed
+
+- Raise ValueError when seed is `out-of-bounds` or `cannot be cast to int` ([#21029](https://github.com/Lightning-AI/pytorch-lightning/pull/21029))
+
 
 ---
 
````

src/lightning/fabric/utilities/seed.py

Lines changed: 4 additions & 5 deletions
````diff
@@ -27,7 +27,8 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose:
     Args:
         seed: the integer value seed for global random state in Lightning.
             If ``None``, it will read the seed from ``PL_GLOBAL_SEED`` env variable. If ``None`` and the
-            ``PL_GLOBAL_SEED`` env variable is not set, then the seed defaults to 0.
+            ``PL_GLOBAL_SEED`` env variable is not set, then the seed defaults to 0. If seed is
+            not in bounds or cannot be cast to int, a ValueError is raised.
         workers: if set to ``True``, will properly configure all dataloaders passed to the
             Trainer with a ``worker_init_fn``. If the user already provides such a function
             for their dataloaders, setting this argument will have no influence. See also:
@@ -44,14 +45,12 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose:
         try:
             seed = int(env_seed)
         except ValueError:
-            seed = 0
-            rank_zero_warn(f"Invalid seed found: {repr(env_seed)}, seed set to {seed}")
+            raise ValueError(f"Invalid seed specified via PL_GLOBAL_SEED: {repr(env_seed)}")
     elif not isinstance(seed, int):
         seed = int(seed)
 
     if not (min_seed_value <= seed <= max_seed_value):
-        rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")
-        seed = 0
+        raise ValueError(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")
 
     if verbose:
         log.info(rank_prefixed_message(f"Seed set to {seed}", _get_rank()))
````
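This is a breaking behavior change: invalid seeds previously warned and fell back to 0, and now fail loudly. A standalone sketch of the new validation (the `check_seed` helper and `MIN_SEED`/`MAX_SEED` names are simplified stand-ins for the real `seed_everything` logic, assuming the usual numpy `uint32` bounds):

```python
import os

# numpy's legacy seeding accepts 32-bit unsigned integers
MIN_SEED, MAX_SEED = 0, 2**32 - 1


def check_seed(seed=None):
    """Sketch of the stricter validation introduced in this commit."""
    if seed is None:
        env_seed = os.environ.get("PL_GLOBAL_SEED")
        if env_seed is None:
            seed = 0  # unchanged default when nothing is provided
        else:
            try:
                seed = int(env_seed)
            except ValueError:
                # previously: warn and fall back to 0; now: fail loudly
                raise ValueError(f"Invalid seed specified via PL_GLOBAL_SEED: {env_seed!r}")
    elif not isinstance(seed, int):
        seed = int(seed)  # a non-castable value raises ValueError here too
    if not (MIN_SEED <= seed <= MAX_SEED):
        # previously: warn and fall back to 0; now: fail loudly
        raise ValueError(f"{seed} is not in bounds, numpy accepts from {MIN_SEED} to {MAX_SEED}")
    return seed


print(check_seed(42))  # -> 42
```

Callers that relied on the silent fallback (e.g. exporting a malformed `PL_GLOBAL_SEED`) will now see a `ValueError` instead of training with seed 0.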
