Commit ef6b20c

Merge branch 'master' into feature/manual_optimization_tensordict
2 parents 42418f8 + 25b1343

File tree

22 files changed: +134 −23 lines changed


.github/workflows/call-clear-cache.yml

Lines changed: 2 additions & 2 deletions

@@ -23,7 +23,7 @@ on:
 jobs:
   cron-clear:
     if: github.event_name == 'schedule' || github.event_name == 'pull_request'
-    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.14.3
+    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.15.0
     with:
       scripts-ref: v0.14.3
       dry-run: ${{ github.event_name == 'pull_request' }}
@@ -32,7 +32,7 @@ jobs:

   direct-clear:
     if: github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
-    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.14.3
+    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.15.0
     with:
       scripts-ref: v0.14.3
       dry-run: ${{ github.event_name == 'pull_request' }}

.github/workflows/ci-schema.yml

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ on:

 jobs:
   check:
-    uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.14.3
+    uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.15.0
     with:
       # skip azure due to the wrong schema file by MSFT
       # https://github.com/Lightning-AI/lightning-flash/pull/1455#issuecomment-1244793607

docs/source-fabric/advanced/model_parallel/tp_fsdp.rst

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ The :doc:`Tensor Parallelism documentation <tp>` and a general understanding of

 .. raw:: html

-    <a target="_blank" href="https://lightning.ai/lightning-ai/studios/tensor-parallelism-supercharging-large-model-training-with-lightning-fabric">
+    <a target="_blank" href="https://lightning.ai/lightning-ai/studios/pretrain-an-llm-with-pytorch-lightning">
        <img src="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/app-2/studio-badge.svg" alt="Open In Studio" style="width: auto; max-width: none;"/>
     </a>

docs/source-pytorch/accelerators/accelerator_prepare.rst

Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@ Synchronize validation and test logging
 ***************************************

 When running in distributed mode, we have to ensure that the validation and test step logging calls are synchronized across processes.
-This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the validation and test step.
+This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the validation and test step. This will automatically average values across all processes.
 This ensures that each GPU worker has the same behaviour when tracking model checkpoints, which is important for later downstream tasks such as testing the best checkpoint across all workers.
 The ``sync_dist`` option can also be used in logging calls during the step methods, but be aware that this can lead to significant communication overhead and slow down your training.

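The change above documents that ``sync_dist=True`` averages the logged value across processes. A minimal stand-alone sketch of that averaging semantics (pure Python with hypothetical names; in Lightning the mean all-reduce happens inside ``self.log``):

```python
# Hypothetical illustration of what ``sync_dist=True`` does conceptually:
# each distributed worker logs a local metric value, and the values are
# mean-reduced so every worker records the same number.

def sync_dist_mean(per_worker_values):
    """Stand-in for the mean all-reduce performed when sync_dist=True."""
    return sum(per_worker_values) / len(per_worker_values)

# e.g. validation loss reported by 4 GPU workers
per_worker_loss = [0.42, 0.40, 0.44, 0.38]
synced = sync_dist_mean(per_worker_loss)
# every worker now logs the same averaged value, so checkpoint tracking
# behaves identically on all ranks
```

Because every rank sees the same reduced value, checkpoint callbacks that compare a monitored metric make the same decision everywhere, which is the behaviour the doc change calls out.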
docs/source-pytorch/advanced/model_parallel/tp.rst

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ This method is most effective for models with very large layers, significantly e

 .. raw:: html

-    <a target="_blank" href="https://lightning.ai/lightning-ai/studios/tensor-parallelism-supercharging-large-model-training-with-pytorch-lightning">
+    <a target="_blank" href="https://lightning.ai/lightning-ai/studios/pretrain-an-llm-with-pytorch-lightning">
        <img src="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/app-2/studio-badge.svg" alt="Open In Studio" style="width: auto; max-width: none;"/>
     </a>

docs/source-pytorch/extensions/logging.rst

Lines changed: 1 addition & 1 deletion

@@ -137,7 +137,7 @@ The :meth:`~lightning.pytorch.core.LightningModule.log` method has a few options
 * ``logger``: Logs to the logger like ``Tensorboard``, or any other custom logger passed to the :class:`~lightning.pytorch.trainer.trainer.Trainer` (Default: ``True``).
 * ``reduce_fx``: Reduction function over step values for end of epoch. Uses :func:`torch.mean` by default and is not applied when a :class:`torchmetrics.Metric` is logged.
 * ``enable_graph``: If True, will not auto detach the graph.
-* ``sync_dist``: If True, reduces the metric across devices. Use with care as this may lead to a significant communication overhead.
+* ``sync_dist``: If True, averages the metric across devices. Use with care as this may lead to a significant communication overhead.
 * ``sync_dist_group``: The DDP group to sync across.
 * ``add_dataloader_idx``: If True, appends the index of the current dataloader to the name (when using multiple dataloaders). If False, user needs to give unique names for each dataloader to not mix the values.
 * ``batch_size``: Current batch size used for accumulating logs logged with ``on_epoch=True``. This will be directly inferred from the loaded batch, but for some data structures you might need to explicitly provide it.

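The ``reduce_fx`` option in the list above reduces step-level values at epoch end, with a mean as the default. A small pure-Python sketch of that default reduction (an illustrative stand-in with hypothetical names; Lightning applies it internally to values logged with ``on_epoch=True``):

```python
# Hypothetical stand-in for the ``reduce_fx`` epoch-end reduction: values
# logged at each step are accumulated and reduced to one number, using a
# mean by default.

def reduce_epoch_values(step_values, reduce_fx=None):
    """Apply reduce_fx (default: mean) over the accumulated step values."""
    if reduce_fx is None:
        return sum(step_values) / len(step_values)  # default mean reduction
    return reduce_fx(step_values)

# metric logged at three steps with on_epoch=True -> mean of the steps
epoch_metric = reduce_epoch_values([0.9, 0.7, 0.8])
# a custom reduce_fx, e.g. max, replaces the default mean
epoch_max = reduce_epoch_values([0.9, 0.7, 0.8], reduce_fx=max)
```

As the option list notes, this reduction is skipped when a ``torchmetrics.Metric`` is logged, since the metric object defines its own aggregation.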
requirements/ci.txt

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 setuptools <80.9.1
 wheel <0.46.0
-awscli >=1.30.0, <1.42.0
+awscli >=1.30.0, <1.43.0
 twine ==6.1.0
 importlib-metadata <9.0.0
 wget

requirements/docs.txt

Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@ myst-parser >=0.18.1, <4.0.0
 nbsphinx >=0.8.5, <=0.9.7
 nbconvert >7.14, <7.17
 pandoc >=1.0, <=2.4
-docutils>=0.18.1,<=0.19
+docutils>=0.18.1,<=0.22
 sphinxcontrib-fulltoc >=1.0, <=1.2.0
 sphinxcontrib-mockautodoc
 sphinx-autobuild
@@ -17,7 +17,7 @@ sphinx-rtd-dark-mode
 sphinxcontrib-video ==0.4.1
 jinja2 <3.2.0

-lightning-utilities >=0.11.1, <0.15.0
+lightning-utilities >=0.11.1, <0.16.0

 # installed from S3 location and fetched in advance
 lai-sphinx-theme

requirements/fabric/base.txt

Lines changed: 1 addition & 1 deletion

@@ -5,4 +5,4 @@ torch >=2.1.0, <2.8.0
 fsspec[http] >=2022.5.0, <2025.8.0
 packaging >=20.0, <=25.0
 typing-extensions >=4.5.0, <4.15.0
-lightning-utilities >=0.10.0, <0.15.0
+lightning-utilities >=0.10.0, <0.16.0

requirements/fabric/examples.txt

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 torchvision >=0.16.0, <0.23.0
-torchmetrics >=0.10.0, <1.8.0
+torchmetrics >=0.10.0, <1.9.0

0 commit comments