
Commit ce2cdda

Merge branch 'master' into feat/to_tensorrt
2 parents 817e145 + a0ce930 commit ce2cdda

File tree

16 files changed: +306 additions, −48 deletions


src/lightning/fabric/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -16,11 +16,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 -
 
+
 ### Changed
 
 - Raise ValueError when seed is `out-of-bounds` or `cannot be cast to int` ([#21029](https://github.com/Lightning-AI/pytorch-lightning/pull/21029))
 
 
+### Fixed
+
+- Fix XLA strategy to add support for `global_ordinal`, `local_ordinal`, `world_size` which came instead of deprecated methods ([#20852](https://github.com/Lightning-AI/pytorch-lightning/issues/20852))
+
+
 - fix: remove extra `name` parameter in accelerator registry decorator ([#20975](https://github.com/Lightning-AI/pytorch-lightning/pull/20975))
 
 
```

src/lightning/fabric/plugins/environments/xla.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -66,6 +66,11 @@ def world_size(self) -> int:
         The output is cached for performance.
 
         """
+        if _XLA_GREATER_EQUAL_2_1:
+            from torch_xla import runtime as xr
+
+            return xr.world_size()
+
         import torch_xla.core.xla_model as xm
 
         return xm.xrt_world_size()
@@ -82,6 +87,11 @@ def global_rank(self) -> int:
         The output is cached for performance.
 
         """
+        if _XLA_GREATER_EQUAL_2_1:
+            from torch_xla import runtime as xr
+
+            return xr.global_ordinal()
+
         import torch_xla.core.xla_model as xm
 
         return xm.get_ordinal()
@@ -98,6 +108,11 @@ def local_rank(self) -> int:
         The output is cached for performance.
 
         """
+        if _XLA_GREATER_EQUAL_2_1:
+            from torch_xla import runtime as xr
+
+            return xr.local_ordinal()
+
         import torch_xla.core.xla_model as xm
 
         return xm.get_local_ordinal()
```
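All three hunks apply the same pattern: prefer the `torch_xla.runtime` API (`world_size`, `global_ordinal`, `local_ordinal`) on torch_xla >= 2.1, and fall back to the deprecated `xla_model` helpers otherwise. A minimal standalone sketch of the pattern, assuming `_XLA_GREATER_EQUAL_2_1` is a `RequirementCache`-style flag like the ones used elsewhere in this commit (the repo's actual definition may differ):

```python
# Sketch only: version-gated dispatch between the new torch_xla runtime API
# and the deprecated xla_model helpers. The flag definition is an assumption
# modeled on the RequirementCache usage visible in imports.py below.
from lightning_utilities.core.imports import RequirementCache

_XLA_GREATER_EQUAL_2_1 = RequirementCache("torch_xla>=2.1")


def world_size() -> int:
    """Number of XLA processes, using whichever API this torch_xla provides."""
    if _XLA_GREATER_EQUAL_2_1:
        from torch_xla import runtime as xr  # new API, torch_xla >= 2.1

        return xr.world_size()
    import torch_xla.core.xla_model as xm  # deprecated fallback, < 2.1

    return xm.xrt_world_size()
```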

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Allow returning `ONNXProgram` when calling `to_onnx(dynamo=True)` ([#20811](https://github.com/Lightning-AI/pytorch-lightning/pull/20811))
 
 
+- Default to RichProgressBar and RichModelSummary if the rich package is available. Fallback to TQDMProgressBar and ModelSummary otherwise. ([#9580](https://github.com/Lightning-AI/pytorch-lightning/pull/9580))
+
+
 ### Removed
 
 -
```

src/lightning/pytorch/callbacks/progress/rich_progress.py

Lines changed: 28 additions & 20 deletions
```diff
@@ -17,15 +17,15 @@
 from datetime import timedelta
 from typing import Any, Optional, Union, cast
 
-from lightning_utilities.core.imports import RequirementCache
+import torch
+from lightning_utilities.core.apply_func import apply_to_collection
 from typing_extensions import override
 
 import lightning.pytorch as pl
 from lightning.pytorch.callbacks.progress.progress_bar import ProgressBar
+from lightning.pytorch.utilities.imports import _RICH_AVAILABLE
 from lightning.pytorch.utilities.types import STEP_OUTPUT
 
-_RICH_AVAILABLE = RequirementCache("rich>=10.2.2")
-
 if _RICH_AVAILABLE:
     from rich import get_console, reconfigure
     from rich.console import Console, RenderableType
@@ -171,7 +171,7 @@ def render(self, task: "Task") -> Text:
             return Text()
         if self._trainer.training and task.id not in self._tasks:
             self._tasks[task.id] = "None"
-            if self._renderable_cache:
+            if self._renderable_cache and self._current_task_id in self._renderable_cache:
                 self._current_task_id = cast(TaskID, self._current_task_id)
                 self._tasks[self._current_task_id] = self._renderable_cache[self._current_task_id][1]
             self._current_task_id = task.id
@@ -184,8 +184,11 @@ def render(self, task: "Task") -> Text:
 
     def _generate_metrics_texts(self) -> Generator[str, None, None]:
         for name, value in self._metrics.items():
-            if not isinstance(value, (str, int)):
-                value = f"{value:{self._metrics_format}}"
+            if not isinstance(value, str):
+                try:
+                    value = f"{value:{self._metrics_format}}"
+                except (TypeError, ValueError):
+                    value = str(value)
             yield f"{name}: {value}"
 
 
@@ -465,17 +468,12 @@ def _initialize_train_progress_bar_id(self) -> None:
         self.train_progress_bar_id = self._add_task(total_batches, train_description)
 
     def _update(self, progress_bar_id: Optional["TaskID"], current: int, visible: bool = True) -> None:
-        if self.progress is not None and self.is_enabled:
-            assert progress_bar_id is not None
+        if self.progress is not None and self.is_enabled and progress_bar_id is not None:
             total = self.progress.tasks[progress_bar_id].total
             assert total is not None
             if not self._should_update(current, total):
                 return
-
-            leftover = current % self.refresh_rate
-            advance = leftover if (current == total and leftover != 0) else self.refresh_rate
-            self.progress.update(progress_bar_id, advance=advance, visible=visible)
-            self.refresh()
+            self.progress.update(progress_bar_id, completed=current, visible=visible)
 
     def _should_update(self, current: int, total: Union[int, float]) -> bool:
         return current % self.refresh_rate == 0 or current == total
@@ -572,9 +570,13 @@ def on_validation_batch_end(
         if self.is_disabled:
             return
         if trainer.sanity_checking:
-            self._update(self.val_sanity_progress_bar_id, batch_idx + 1)
-        elif self.val_progress_bar_id is not None:
-            self._update(self.val_progress_bar_id, batch_idx + 1)
+            if self.val_sanity_progress_bar_id is not None:
+                self._update(self.val_sanity_progress_bar_id, batch_idx + 1)
+            return
+
+        if self.val_progress_bar_id is None:
+            return
+        self._update(self.val_progress_bar_id, batch_idx + 1)
         self.refresh()
 
     @override
@@ -587,9 +589,8 @@ def on_test_batch_end(
         batch_idx: int,
         dataloader_idx: int = 0,
     ) -> None:
-        if self.is_disabled:
+        if self.is_disabled or self.test_progress_bar_id is None:
             return
-        assert self.test_progress_bar_id is not None
         self._update(self.test_progress_bar_id, batch_idx + 1)
         self.refresh()
 
@@ -603,9 +604,8 @@ def on_predict_batch_end(
         batch_idx: int,
         dataloader_idx: int = 0,
     ) -> None:
-        if self.is_disabled:
+        if self.is_disabled or self.predict_progress_bar_id is None:
             return
-        assert self.predict_progress_bar_id is not None
         self._update(self.predict_progress_bar_id, batch_idx + 1)
         self.refresh()
 
@@ -632,6 +632,14 @@ def _reset_progress_bar_ids(self) -> None:
         self.test_progress_bar_id = None
         self.predict_progress_bar_id = None
 
+    @override
+    def get_metrics(
+        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"
+    ) -> dict[str, Union[int, str, float, dict[str, float]]]:
+        items = super().get_metrics(trainer, pl_module)
+        # convert all metrics to float before sending to rich
+        return apply_to_collection(items, torch.Tensor, lambda x: x.item())
+
     def _update_metrics(
         self,
         trainer: "pl.Trainer",
```
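Two behavioral changes stand out: `_update` now sets `completed=current` instead of computing an `advance` delta, which makes redraws idempotent, and the new `get_metrics` override converts tensor metrics to plain floats before rich formats them. A small sketch of what that conversion does (the metric names here are invented for illustration):

```python
# Sketch of the tensor-to-float conversion performed by the new
# RichProgressBar.get_metrics override; the metric dict is made up.
import torch
from lightning_utilities.core.apply_func import apply_to_collection

items = {"loss": torch.tensor(0.1234), "epoch": 3, "v_num": "1"}
floats = apply_to_collection(items, torch.Tensor, lambda x: x.item())
# rich can now safely apply a float format spec such as f"{value:.3f}"
print(floats)  # e.g. {'loss': 0.1234..., 'epoch': 3, 'v_num': '1'}
```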

src/lightning/pytorch/callbacks/progress/tqdm_progress.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -274,7 +274,7 @@ def on_train_batch_end(
         self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
     ) -> None:
         n = batch_idx + 1
-        if self._should_update(n, self.train_progress_bar.total):
+        if self.train_progress_bar is not None and self._should_update(n, self.train_progress_bar.total):
             _update_n(self.train_progress_bar, n)
             self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module))
 
@@ -322,7 +322,7 @@ def on_validation_batch_end(
         dataloader_idx: int = 0,
     ) -> None:
         n = batch_idx + 1
-        if self._should_update(n, self.val_progress_bar.total):
+        if self.val_progress_bar is not None and self._should_update(n, self.val_progress_bar.total):
             _update_n(self.val_progress_bar, n)
 
     @override
@@ -363,7 +363,7 @@ def on_test_batch_end(
         dataloader_idx: int = 0,
     ) -> None:
         n = batch_idx + 1
-        if self._should_update(n, self.test_progress_bar.total):
+        if self.test_progress_bar is not None and self._should_update(n, self.test_progress_bar.total):
             _update_n(self.test_progress_bar, n)
 
     @override
@@ -402,7 +402,7 @@ def on_predict_batch_end(
         dataloader_idx: int = 0,
     ) -> None:
         n = batch_idx + 1
-        if self._should_update(n, self.predict_progress_bar.total):
+        if self.predict_progress_bar is not None and self._should_update(n, self.predict_progress_bar.total):
             _update_n(self.predict_progress_bar, n)
 
     @override
```
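All four hunks apply the same fix: guard against a progress bar that was never initialized (it stays `None` until its stage starts) before consulting `_should_update`. For reference, the throttle these callbacks share redraws every `refresh_rate` batches and always on the final batch; a self-contained sketch:

```python
# Sketch of the shared refresh throttle (mirrors the one-line _should_update
# shown in the rich_progress.py diff above; the names here are local).
def should_update(current: int, total: int, refresh_rate: int) -> bool:
    return current % refresh_rate == 0 or current == total


# With refresh_rate=4 and 10 batches, the bar redraws at batches 4, 8 and 10.
assert [n for n in range(1, 11) if should_update(n, 10, 4)] == [4, 8, 10]
```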

src/lightning/pytorch/callbacks/rich_model_summary.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@
 from typing_extensions import override
 
 from lightning.pytorch.callbacks import ModelSummary
-from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
+from lightning.pytorch.utilities.imports import _RICH_AVAILABLE
 from lightning.pytorch.utilities.model_summary import get_human_readable_count
 
 
```

src/lightning/pytorch/loops/evaluation_loop.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -25,7 +25,6 @@
 
 import lightning.pytorch as pl
 from lightning.fabric.utilities.data import _set_sampler_epoch
-from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
 from lightning.pytorch.loops.fetchers import _DataFetcher, _DataLoaderIterDataFetcher
 from lightning.pytorch.loops.loop import _Loop
 from lightning.pytorch.loops.progress import _BatchProgress
@@ -44,6 +43,7 @@
 from lightning.pytorch.utilities.combined_loader import CombinedLoader
 from lightning.pytorch.utilities.data import has_len_all_ranks
 from lightning.pytorch.utilities.exceptions import SIGTERMException
+from lightning.pytorch.utilities.imports import _RICH_AVAILABLE
 from lightning.pytorch.utilities.model_helpers import _ModuleMode, is_overridden
 from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature
 
```

src/lightning/pytorch/trainer/connectors/callback_connector.py

Lines changed: 3 additions & 8 deletions
```diff
@@ -37,6 +37,7 @@
 from lightning.pytorch.callbacks.timer import Timer
 from lightning.pytorch.trainer import call
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
+from lightning.pytorch.utilities.imports import _RICH_AVAILABLE
 from lightning.pytorch.utilities.model_helpers import is_overridden
 from lightning.pytorch.utilities.rank_zero import rank_zero_info
 
@@ -125,14 +126,8 @@ def _configure_model_summary_callback(self, enable_model_summary: bool) -> None:
             )
             return
 
-        progress_bar_callback = self.trainer.progress_bar_callback
-        is_progress_bar_rich = isinstance(progress_bar_callback, RichProgressBar)
-
         model_summary: ModelSummary
-        if progress_bar_callback is not None and is_progress_bar_rich:
-            model_summary = RichModelSummary()
-        else:
-            model_summary = ModelSummary()
+        model_summary = RichModelSummary() if _RICH_AVAILABLE else ModelSummary()
         self.trainer.callbacks.append(model_summary)
 
     def _configure_progress_bar(self, enable_progress_bar: bool = True) -> None:
@@ -157,7 +152,7 @@ def _configure_progress_bar(self, enable_progress_bar: bool = True) -> None:
             )
 
         if enable_progress_bar:
-            progress_bar_callback = TQDMProgressBar()
+            progress_bar_callback = RichProgressBar() if _RICH_AVAILABLE else TQDMProgressBar()
             self.trainer.callbacks.append(progress_bar_callback)
 
     def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, dict[str, int]]] = None) -> None:
```
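Together with the changelog entry above (#9580), this changes the out-of-the-box defaults: a bare `Trainer()` now gets `RichProgressBar` and `RichModelSummary` whenever `rich` is importable, and the model summary no longer depends on which progress bar is active. A hedged usage sketch (standard Lightning behavior, but verify against your installed version):

```python
# Sketch: default callback selection after this commit. Passing an explicit
# progress bar callback still overrides the default, as before.
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import TQDMProgressBar

trainer = Trainer()  # Rich callbacks if `rich>=10.2.2` is installed
trainer_tqdm = Trainer(callbacks=[TQDMProgressBar()])  # opt back into tqdm
```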

src/lightning/pytorch/utilities/imports.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -30,6 +30,7 @@
 
 _OMEGACONF_AVAILABLE = package_available("omegaconf")
 _TORCHVISION_AVAILABLE = RequirementCache("torchvision")
+_RICH_AVAILABLE = RequirementCache("rich>=10.2.2")
 
 
 @functools.lru_cache(maxsize=128)
```
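Moving `_RICH_AVAILABLE` here removes the awkward dependency of utility modules on the rich progress-bar module. `RequirementCache` checks the pinned requirement once and caches the answer; a short sketch of how the flag is consumed:

```python
# Sketch: a RequirementCache instance is truthy iff the requirement is
# satisfied in the current environment; the check runs once and is cached.
from lightning_utilities.core.imports import RequirementCache

_RICH_AVAILABLE = RequirementCache("rich>=10.2.2")

if _RICH_AVAILABLE:
    from rich.console import Console

    Console().print("[green]rich available[/green]")
else:
    print("rich not installed; using plain output")
```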

src/lightning/pytorch/utilities/testing/_runif.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -17,9 +17,8 @@
 
 from lightning.fabric.utilities.testing import _runif_reasons as fabric_run_if
 from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
-from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
 from lightning.pytorch.core.module import _ONNX_AVAILABLE, _ONNXSCRIPT_AVAILABLE, _TORCH_TRT_AVAILABLE
-from lightning.pytorch.utilities.imports import _OMEGACONF_AVAILABLE
+from lightning.pytorch.utilities.imports import _OMEGACONF_AVAILABLE, _RICH_AVAILABLE
 
 _SKLEARN_AVAILABLE = RequirementCache("scikit-learn")
 
```
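With the flag now living in `utilities.imports`, test helpers import it from a single place. A hedged sketch of how a suite might gate rich-specific tests on it (plain `pytest.mark.skipif` here; Lightning's own `RunIf` helper wraps the same idea, and the asserted default is an assumption):

```python
# Sketch: skipping a rich-only test when the dependency is absent.
import pytest

from lightning.pytorch.utilities.imports import _RICH_AVAILABLE


@pytest.mark.skipif(not _RICH_AVAILABLE, reason="requires rich>=10.2.2")
def test_rich_progress_bar_default_refresh_rate():
    from lightning.pytorch.callbacks import RichProgressBar

    assert RichProgressBar().refresh_rate == 1  # assumed default
```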
