Commit 293233c

Add unittest for metric sync-free changes.
Signed-off-by: Wil Kong <[email protected]>
1 parent 7c1a4b9 commit 293233c

2 files changed (+154 -0 lines)


tests/tests_pytorch/core/test_lightning_module.py

Lines changed: 50 additions & 0 deletions
@@ -594,3 +594,53 @@ def __init__(self):
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None)
    else:
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3)


@RunIf(min_cuda_gpus=1)
def test_log_no_cuda_sync():
    """Test logging scalars and tensors doesn't introduce CUDA sync."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Create tensors before enabling sync debug mode to avoid sync
            cuda_tensor = torch.tensor(0.7, device=self.device)
            cpu_tensor = torch.tensor(1.0, device="cpu")

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # Test scalar value (should be placed on CPU to avoid sync)
                self.log("scalar_loss", 0.5)

                # Test CUDA tensor (should stay on original device)
                self.log("cuda_tensor", cuda_tensor)

                # Test CPU tensor (should stay on CPU)
                self.log("cpu_tensor", cpu_tensor)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return super().training_step(batch, batch_idx)

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,
        enable_checkpointing=False,
    )
    trainer.fit(model)
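
For reference, a minimal standalone sketch (plain PyTorch, outside of Lightning, and not part of this commit) of the debug mode both new tests rely on: torch.cuda.set_sync_debug_mode("error") turns an implicitly synchronizing CUDA operation into a RuntimeError instead of a silent stall. The .item() call below is only an illustrative synchronizing operation.

import torch

t = torch.tensor(0.7, device="cuda")
torch.cuda.set_sync_debug_mode("error")
try:
    # .item() copies device -> host and therefore synchronizes; with the debug
    # mode set to "error", this is expected to raise a RuntimeError
    t.item()
except RuntimeError as e:
    print(f"caught expected synchronization: {e}")
finally:
    # always restore the default so later code is not affected
    torch.cuda.set_sync_debug_mode("default")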

tests/tests_pytorch/trainer/logging_/test_logger_connector.py

Lines changed: 104 additions & 0 deletions
@@ -660,3 +660,107 @@ def test_result_collection_changes_device():
    # same device as the new tensor
    results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean")
    assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device


@RunIf(min_cuda_gpus=1)
def test_logger_connector_no_sync_without_progress_bar():
    """Test logger connector doesn't sync when no progress bar."""
    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.boring_classes import BoringModel

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Log some metrics with progress bar enabled
            loss = super().training_step(batch, batch_idx)["loss"]

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # These logs have prog_bar=True but should not sync
                # when progress bar callback is not present
                self.log("train_loss", loss, prog_bar=True)
                self.log("train_acc", 0.95, prog_bar=True)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return loss

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,  # Key - no progress bar callback
        enable_checkpointing=False,
    )
    trainer.fit(model)


def test_result_collection_metrics_include_pbar_parameter():
    """Test metrics method handles include_pbar_metrics parameter."""
    from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection

    results = _ResultCollection(training=True)

    # Log some metrics with different prog_bar settings
    results.log(
        "training_step",
        "regular_metric",
        torch.tensor(1.0),
        on_step=True,
        on_epoch=False,
        prog_bar=False,
    )
    results.log(
        "training_step",
        "pbar_metric",
        torch.tensor(2.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
    )
    results.log(
        "training_step",
        "both_metric",
        torch.tensor(3.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
        logger=True,
    )

    # Test with include_pbar_metrics=True (default behavior)
    metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True)
    assert "pbar_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["log"]

    # Test with include_pbar_metrics=False (optimization)
    metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False)
    # No progress bar metrics should be included
    assert len(metrics_without_pbar["pbar"]) == 0
    # Logger metrics should still be included
    assert "both_metric" in metrics_without_pbar["log"]

    # Verify callback metrics are not affected
    assert "regular_metric" in metrics_with_pbar["callback"]
    assert "regular_metric" in metrics_without_pbar["callback"]
