diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 1bba5e4ca0da7..bd197d6c4e974 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -55,6 +55,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246)) +- Fixed redundant host-device sync in progressbar printing ([#21233](https://github.com/Lightning-AI/pytorch-lightning/pull/21233)) + + --- ## [2.5.5] - 2025-09-05 diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index 85f631ee40f75..bd7b272f9f947 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -656,11 +656,17 @@ def __check_allowed(v: Any, name: str, value: Any) -> None: raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged") def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor: - value = ( - value.clone().detach() - if isinstance(value, Tensor) - else torch.tensor(value, device=self.device, dtype=_get_default_dtype()) - ) + if isinstance(value, Tensor): + # Keep tensor on its original device to avoid unnecessary transfers + value = value.clone().detach() + else: + # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization. + # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric + # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient. + # For non-CUDA devices, maintain original behavior + device = "cpu" if self.device.type == "cuda" else self.device + value = torch.tensor(value, device=device, dtype=_get_default_dtype()) + if not torch.numel(value) == 1: raise ValueError( f"`self.log({name}, {value})` was called, but the tensor must have a single element." diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py index f6e8885ee050a..9db1a6a595943 100644 --- a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py +++ b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py @@ -234,7 +234,9 @@ def metrics(self) -> _METRICS: """This function returns either batch or epoch metrics.""" on_step = self._first_loop_iter is not None assert self.trainer._results is not None - return self.trainer._results.metrics(on_step) + # Only include progress bar metrics if a progress bar callback is present + include_pbar_metrics = self.trainer.progress_bar_callback is not None + return self.trainer._results.metrics(on_step, include_pbar_metrics=include_pbar_metrics) @property def callback_metrics(self) -> _OUT_DICT: diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py index 1a4aa7c401960..8298d280c98fc 100644 --- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py +++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py @@ -468,7 +468,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str forked_name += dataloader_suffix return name, forked_name - def metrics(self, on_step: bool) -> _METRICS: + def metrics(self, on_step: bool, include_pbar_metrics: bool = True) -> _METRICS: metrics = _METRICS(callback={}, log={}, pbar={}) for _, result_metric in self.valid_items(): @@ -489,7 +489,7 @@ def metrics(self, on_step: bool) -> _METRICS: metrics["callback"][forked_name] = value # populate progress_bar metrics. convert tensors to numbers - if result_metric.meta.prog_bar: + if result_metric.meta.prog_bar and include_pbar_metrics: metrics["pbar"][forked_name] = convert_tensors_to_scalars(value) return metrics diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py index c33488a4f2626..c724567ab7ba8 100644 --- a/tests/tests_pytorch/core/test_lightning_module.py +++ b/tests/tests_pytorch/core/test_lightning_module.py @@ -594,3 +594,53 @@ def __init__(self): fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None) else: fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3) + + +@RunIf(min_cuda_gpus=1) +def test_log_no_cuda_sync(): + """Test logging scalars and tensors doesn't introduce CUDA sync.""" + + class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.to("cuda") + + def training_step(self, batch, batch_idx): + # Create tensors before enabling sync debug mode to avoid sync + cuda_tensor = torch.tensor(0.7, device=self.device) + cpu_tensor = torch.tensor(1.0, device="cpu") + + # Enable sync debug mode to catch any synchronization + torch.cuda.set_sync_debug_mode("error") + try: + # Test scalar value (should be placed on CPU to avoid sync) + self.log("scalar_loss", 0.5) + + # Test CUDA tensor (should stay on original device) + self.log("cuda_tensor", cuda_tensor) + + # Test CPU tensor (should stay on CPU) + self.log("cpu_tensor", cpu_tensor) + + except RuntimeError as e: + if "called a synchronizing CUDA operation" in str(e): + msg = f"Unexpected CUDA synchronization: {e}" + pytest.fail(msg) + else: + raise + finally: + torch.cuda.set_sync_debug_mode("default") + + return super().training_step(batch, batch_idx) + + model = TestModel() + trainer = Trainer( + max_epochs=1, + limit_train_batches=1, + limit_val_batches=0, + accelerator="gpu", + devices=1, + enable_progress_bar=False, + enable_checkpointing=False, + ) + trainer.fit(model) diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index d3d355edb003b..4af5812e441f5 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -660,3 +660,103 @@ def test_result_collection_changes_device(): # same device as the new tensor results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean") assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device + + +@RunIf(min_cuda_gpus=1) +def test_logger_connector_no_sync_without_progress_bar(): + """Test logger connector doesn't sync when no progress bar.""" + from lightning.pytorch import Trainer + from lightning.pytorch.demos.boring_classes import BoringModel + + class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.to("cuda") + + def training_step(self, batch, batch_idx): + # Log some metrics with progress bar enabled + loss = super().training_step(batch, batch_idx)["loss"] + + # Enable sync debug mode to catch any synchronization + torch.cuda.set_sync_debug_mode("error") + try: + # These logs have prog_bar=True but should not sync + # when progress bar callback is not present + self.log("train_loss", loss, prog_bar=True) + self.log("train_acc", 0.95, prog_bar=True) + + except RuntimeError as e: + if "called a synchronizing CUDA operation" in str(e): + msg = f"Unexpected CUDA synchronization: {e}" + pytest.fail(msg) + else: + raise + finally: + torch.cuda.set_sync_debug_mode("default") + + return loss + + model = TestModel() + trainer = Trainer( + max_epochs=1, + limit_train_batches=1, + limit_val_batches=0, + accelerator="gpu", + devices=1, + enable_progress_bar=False, # Key - no progress bar callback + enable_checkpointing=False, + ) + trainer.fit(model) + + +def test_result_collection_metrics_include_pbar_parameter(): + """Test metrics method handles include_pbar_metrics parameter.""" + from lightning.pytorch.trainer.connectors.logger_connector.result import ( + _ResultCollection, + ) + + results = _ResultCollection(training=True) + + # Log some metrics with different prog_bar settings + results.log( + "training_step", + "regular_metric", + torch.tensor(1.0), + on_step=True, + on_epoch=False, + prog_bar=False, + ) + results.log( + "training_step", + "pbar_metric", + torch.tensor(2.0), + on_step=True, + on_epoch=False, + prog_bar=True, + ) + results.log( + "training_step", + "both_metric", + torch.tensor(3.0), + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + + # Test with include_pbar_metrics=True (default behavior) + metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True) + assert "pbar_metric" in metrics_with_pbar["pbar"] + assert "both_metric" in metrics_with_pbar["pbar"] + assert "both_metric" in metrics_with_pbar["log"] + + # Test with include_pbar_metrics=False (optimization) + metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False) + # No progress bar metrics should be included + assert len(metrics_without_pbar["pbar"]) == 0 + # Logger metrics should still be included + assert "both_metric" in metrics_without_pbar["log"] + + # Verify callback metrics are not affected + assert "regular_metric" in metrics_with_pbar["callback"] + assert "regular_metric" in metrics_without_pbar["callback"]