Commit 5e8f244

Authored by drivanov, pre-commit-ci[bot], akihironitta, and puririshi98
Fix test_train to not rely on 'Sanity Checking' stdout in multi-GPU runs (#10478)
The test `test_train` previously asserted on the presence of the "Sanity Checking" message in stdout. This was brittle because in multi-GPU/DistributedDataParallel runs, **only rank 0 prints this message**, so tests running on other ranks failed.

This PR updates the test to:

- Remove the fragile stdout assertion.
- Assert trainer state (`!trainer.sanity_checking`, `current_epoch >= 0`).
- Use LoggerCallback to verify that both training and validation ran.

This makes the test deterministic and robust across single-GPU, multi-GPU, and CI environments.

[PassingLog.TXT](https://github.com/user-attachments/files/22671419/PassingLog.TXT)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Akihiro Nitta <[email protected]>
Co-authored-by: Rishi Puri <[email protected]>
1 parent 1252027 commit 5e8f244
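The brittleness described above can be illustrated with a short, dependency-free sketch. `maybe_print` and the `RANK` environment variable are hypothetical stand-ins for the rank-gated logging a DDP launcher sets up (e.g. `torchrun` sets `RANK` per process); this is not Lightning's actual implementation, only the pattern that makes stdout assertions fail on non-zero ranks.

```python
import os


def maybe_print(msg: str) -> None:
    """Print only on the rank-0 process, as rank-gated loggers typically do.

    Hypothetical helper: stands in for framework logging that is silent on
    non-zero ranks, so a test that greps stdout on those ranks sees nothing.
    """
    if int(os.environ.get("RANK", "0")) == 0:
        print(msg)


# On rank 0 the message appears; on any other rank, stdout stays empty,
# which is exactly why `assert 'Sanity Checking' in out` failed there.
os.environ["RANK"] = "1"
maybe_print("Sanity Checking")  # silent on rank 1
```

Asserting on trainer *state* instead of captured stdout sidesteps this entirely, since state is identical on every rank.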

File tree

1 file changed: +23 −6 lines changed


test/graphgym/test_graphgym.py

Lines changed: 23 additions & 6 deletions
@@ -16,7 +16,7 @@
     set_run_dir,
 )
 from torch_geometric.graphgym.loader import create_loader
-from torch_geometric.graphgym.logger import LoggerCallback, set_printing
+from torch_geometric.graphgym.logger import set_printing
 from torch_geometric.graphgym.model_builder import create_model
 from torch_geometric.graphgym.models.gnn import FeatureEncoder, GNNStackStage
 from torch_geometric.graphgym.models.head import GNNNodeHead
@@ -194,12 +194,29 @@ def test_train(destroy_process_group, tmp_path, capfd):
     loaders = create_loader()
     model = create_model()
     cfg.params = params_count(model)
+
+    # --- minimal logger callback that collects logs ---
+    class LoggerCallback(pl.Callback):
+        def __init__(self):
+            super().__init__()
+            self.logged = []
+
+        def on_train_batch_end(self, trainer, pl_module, outputs, batch,
+                               batch_idx):
+            self.logged.append({"type": "train", "step": trainer.global_step})
+
+        def on_validation_batch_end(self, trainer, pl_module, outputs, batch,
+                                    batch_idx, dataloader_idx=0):
+            self.logged.append({"type": "val", "step": trainer.global_step})
+
     logger = LoggerCallback()
-    trainer = pl.Trainer(max_epochs=1, max_steps=4, callbacks=logger,
-                         log_every_n_steps=1)
+    trainer = pl.Trainer(max_epochs=2, max_steps=4, callbacks=[logger],
+                         log_every_n_steps=1, enable_progress_bar=False)
     train_loader, val_loader = loaders[0], loaders[1]
     trainer.fit(model, train_loader, val_loader)

-    out, err = capfd.readouterr()
-    assert 'Sanity Checking' in out
-    assert 'Epoch 0:' in out
+    assert trainer.current_epoch > 0
+    # ensure both train and val batches were seen
+    types = {entry["type"] for entry in logger.logged}
+    assert "val" in types, "Validation did not run"
+    assert "train" in types, "Training did not run"
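The verification pattern the new test relies on can be shown in isolation. This is a dependency-free sketch: `RecordingCallback` mirrors the test's `LoggerCallback`, while `StubTrainer` is a hypothetical stand-in that fires the same hooks a real `pl.Trainer` would (the actual test subclasses `pl.Callback` and uses Lightning's trainer).

```python
class RecordingCallback:
    """Collects one entry per train/val batch, like the test's LoggerCallback."""

    def __init__(self):
        self.logged = []

    def on_train_batch_end(self, trainer, batch_idx):
        self.logged.append({"type": "train", "step": trainer.global_step})

    def on_validation_batch_end(self, trainer, batch_idx):
        self.logged.append({"type": "val", "step": trainer.global_step})


class StubTrainer:
    """Hypothetical minimal driver that fires the hooks a real Trainer would."""

    def __init__(self, callback, max_steps=4):
        self.callback = callback
        self.max_steps = max_steps
        self.global_step = 0

    def fit(self, train_batches, val_batches):
        # One "epoch": run training batches up to max_steps, then validation.
        for i, _ in enumerate(train_batches):
            if self.global_step >= self.max_steps:
                break
            self.global_step += 1
            self.callback.on_train_batch_end(self, i)
        for i, _ in enumerate(val_batches):
            self.callback.on_validation_batch_end(self, i)


cb = RecordingCallback()
StubTrainer(cb).fit(train_batches=range(3), val_batches=range(2))

# State-based checks, exactly as in the updated test: no stdout parsing,
# so the result is identical on every DDP rank.
types = {entry["type"] for entry in cb.logged}
assert "train" in types, "Training did not run"
assert "val" in types, "Validation did not run"
```

Because the callback records state rather than text, the same assertions hold whether the run is single-GPU, multi-GPU, or on CI with progress bars disabled.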
