src/lightning/pytorch/CHANGELOG.md (3 additions, 0 deletions)

@@ -55,6 +55,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))


- Fixed redundant host-device synchronization in progress bar printing ([#21233](https://github.com/Lightning-AI/pytorch-lightning/pull/21233))


---

## [2.5.5] - 2025-09-05
src/lightning/pytorch/core/module.py (11 additions, 5 deletions)

@@ -656,11 +656,17 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
            raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged")

    def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor:
        value = (
            value.clone().detach()
            if isinstance(value, Tensor)
            else torch.tensor(value, device=self.device, dtype=_get_default_dtype())
        )
        if isinstance(value, Tensor):
            # Keep the tensor on its original device to avoid unnecessary transfers
            value = value.clone().detach()
        else:
            # Place scalar metrics on the CPU to avoid a CPU-GPU transfer and the synchronization
            # that `torch.tensor(value, device="cuda")` implies; the metric itself is only used on
            # the CPU side, so placing scalar inputs on the CPU is more efficient.
            # For non-CUDA devices, keep the original behavior.
            device = "cpu" if self.device.type == "cuda" else self.device
            value = torch.tensor(value, device=device, dtype=_get_default_dtype())

        if not torch.numel(value) == 1:
            raise ValueError(
                f"`self.log({name}, {value})` was called, but the tensor must have a single element."
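A minimal, standalone sketch of the rationale in the comment above (not part of the PR; it assumes a CUDA-capable machine and uses `torch.cuda.set_sync_debug_mode`, the same debug facility the new test below relies on): building the scalar on the CPU never touches the GPU, while building it directly on the GPU performs a host-to-device copy that the debug mode may flag.

```python
import torch

if torch.cuda.is_available():
    # In "error" mode, PyTorch raises on operations it considers synchronizing.
    torch.cuda.set_sync_debug_mode("error")
    try:
        cpu_metric = torch.tensor(0.5, device="cpu")  # never touches the GPU, cannot sync
        try:
            gpu_metric = torch.tensor(0.5, device="cuda")  # host-to-device copy of a Python scalar
        except RuntimeError as err:
            # Flagged here if the copy is treated as a synchronizing operation.
            print(f"sync debug mode flagged: {err}")
    finally:
        torch.cuda.set_sync_debug_mode("default")
```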
src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py

@@ -234,7 +234,9 @@ def metrics(self) -> _METRICS:
        """This function returns either batch or epoch metrics."""
        on_step = self._first_loop_iter is not None
        assert self.trainer._results is not None
        return self.trainer._results.metrics(on_step)
        # Only include progress bar metrics if a progress bar callback is present
        include_pbar_metrics = self.trainer.progress_bar_callback is not None
        return self.trainer._results.metrics(on_step, include_pbar_metrics=include_pbar_metrics)

    @property
    def callback_metrics(self) -> _OUT_DICT:
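A hedged aside, not part of the diff: the guard above works because disabling the progress bar leaves no progress bar callback registered, so `Trainer.progress_bar_callback` is `None`. A quick check of that precondition:

```python
from lightning.pytorch import Trainer

# With the progress bar disabled, no progress bar callback is registered, so
# `progress_bar_callback` is None and `include_pbar_metrics` evaluates to False.
trainer = Trainer(enable_progress_bar=False, logger=False, enable_checkpointing=False)
assert trainer.progress_bar_callback is None
```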
src/lightning/pytorch/trainer/connectors/logger_connector/result.py

@@ -468,7 +468,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
            forked_name += dataloader_suffix
        return name, forked_name

    def metrics(self, on_step: bool) -> _METRICS:
    def metrics(self, on_step: bool, include_pbar_metrics: bool = True) -> _METRICS:
        metrics = _METRICS(callback={}, log={}, pbar={})

        for _, result_metric in self.valid_items():
@@ -489,7 +489,7 @@ def metrics(self, on_step: bool) -> _METRICS:
                metrics["callback"][forked_name] = value

            # populate progress_bar metrics. convert tensors to numbers
            if result_metric.meta.prog_bar:
            if result_metric.meta.prog_bar and include_pbar_metrics:
                metrics["pbar"][forked_name] = convert_tensors_to_scalars(value)

        return metrics
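Why the new flag matters (a hedged sketch, not code from the PR): `convert_tensors_to_scalars` ends up calling `Tensor.item()` on each progress bar metric, and `.item()` on a CUDA tensor blocks until the device finishes its queued work, which is exactly the host-device sync this PR avoids when no progress bar is attached.

```python
import torch


def pbar_scalar(metric: torch.Tensor) -> float:
    # Illustrative stand-in (hypothetical helper) for the conversion done for
    # progress bar display: `.item()` copies the value to the host and, for CUDA
    # tensors, waits for the device to finish producing it.
    return metric.item()


if torch.cuda.is_available():
    value = torch.tensor(0.95, device="cuda")
    print(pbar_scalar(value))  # device-to-host sync; skipped when include_pbar_metrics=False
```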
tests/tests_pytorch/core/test_lightning_module.py (50 additions, 0 deletions)

@@ -594,3 +594,53 @@ def __init__(self):
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None)
    else:
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3)


@RunIf(min_cuda_gpus=1)
def test_log_no_cuda_sync():
    """Test logging scalars and tensors doesn't introduce CUDA sync."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Create tensors before enabling sync debug mode to avoid sync
            cuda_tensor = torch.tensor(0.7, device=self.device)
            cpu_tensor = torch.tensor(1.0, device="cpu")

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # Test scalar value (should be placed on CPU to avoid sync)
                self.log("scalar_loss", 0.5)

                # Test CUDA tensor (should stay on original device)
                self.log("cuda_tensor", cuda_tensor)

                # Test CPU tensor (should stay on CPU)
                self.log("cpu_tensor", cpu_tensor)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return super().training_step(batch, batch_idx)

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,
        enable_checkpointing=False,
    )
    trainer.fit(model)
tests/tests_pytorch/trainer/logging_/test_logger_connector.py (100 additions, 0 deletions)

@@ -660,3 +660,103 @@ def test_result_collection_changes_device():
    # same device as the new tensor
    results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean")
    assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device


@RunIf(min_cuda_gpus=1)
def test_logger_connector_no_sync_without_progress_bar():
    """Test logger connector doesn't sync when no progress bar."""
    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.boring_classes import BoringModel

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Log some metrics with progress bar enabled
            loss = super().training_step(batch, batch_idx)["loss"]

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # These logs have prog_bar=True but should not sync
                # when progress bar callback is not present
                self.log("train_loss", loss, prog_bar=True)
                self.log("train_acc", 0.95, prog_bar=True)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return loss

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,  # Key - no progress bar callback
        enable_checkpointing=False,
    )
    trainer.fit(model)


def test_result_collection_metrics_include_pbar_parameter():
    """Test metrics method handles include_pbar_metrics parameter."""
    from lightning.pytorch.trainer.connectors.logger_connector.result import (
        _ResultCollection,
    )

    results = _ResultCollection(training=True)

    # Log some metrics with different prog_bar settings
    results.log(
        "training_step",
        "regular_metric",
        torch.tensor(1.0),
        on_step=True,
        on_epoch=False,
        prog_bar=False,
    )
    results.log(
        "training_step",
        "pbar_metric",
        torch.tensor(2.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
    )
    results.log(
        "training_step",
        "both_metric",
        torch.tensor(3.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
        logger=True,
    )

    # Test with include_pbar_metrics=True (default behavior)
    metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True)
    assert "pbar_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["log"]

    # Test with include_pbar_metrics=False (optimization)
    metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False)
    # No progress bar metrics should be included
    assert len(metrics_without_pbar["pbar"]) == 0
    # Logger metrics should still be included
    assert "both_metric" in metrics_without_pbar["log"]

    # Verify callback metrics are not affected
    assert "regular_metric" in metrics_with_pbar["callback"]
    assert "regular_metric" in metrics_without_pbar["callback"]