Fix logging bug: collect CUDA graph capture timing/memory stats on all ranks, log only on rank 0

jiemingz · jiemingz · commit 2a1ccfa9fe4c · 2026-01-06T15:20:02.000-08:00
Signed-off-by: Jieming Zhang <jiemingz@nvidia.com>
diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py
@@ -281,7 +281,6 @@ def record_bwd_graph(cls, runner):
     def create_cudagraphs(cls):
         """Iterate through 'cudagraph_record' creating graphs in the order in which
         they were recorded."""
-
         # Cudagraphs have already been created, check that no cudagraphed modules ran in eager mode
         if cls.cudagraph_created:
             assert len(cls.cudagraph_record) == 0, (
@@ -303,11 +302,11 @@ def create_cudagraphs(cls):
                     [isinstance(m, TransformerEngineBaseModule) for m in base_module.modules()]
                 )
 
-        if torch.distributed.get_rank() == 0:
-            time_start = time.time()
-            mem_stats_start = torch.cuda.memory_stats()
+        progress_bar = enumerate(cls.cudagraph_record)
+        time_start = time.time()
+        mem_stats_start = torch.cuda.memory_stats()
 
-            progress_bar = enumerate(cls.cudagraph_record)
+        if torch.distributed.get_rank() == 0:
             if HAVE_TQDM:
                 progress_bar = tqdm(
                     progress_bar, "create cuda graphs", total=len(cls.cudagraph_record)
@@ -361,22 +360,22 @@ def format_mem_bytes(mem_bytes):
                 assert fwd_buffer_reuse_ref_count == 0
                 runner.create_bwd_graph()
 
-        if torch.distributed.get_rank() == 0:
-            # Memory usage.
-            time_end = time.time()
-            mem_stats_end = torch.cuda.memory_stats()
-            capture_stats = {
-                "time": time_end - time_start,
-                "allocated_bytes": (
-                    mem_stats_end["allocated_bytes.all.current"]
-                    - mem_stats_start["allocated_bytes.all.current"]
-                ),
-                "reserved_bytes": (
-                    mem_stats_end["reserved_bytes.all.current"]
-                    - mem_stats_start["reserved_bytes.all.current"]
-                ),
-            }
+        # Memory usage.
+        time_end = time.time()
+        mem_stats_end = torch.cuda.memory_stats()
+        capture_stats = {
+            "time": time_end - time_start,
+            "allocated_bytes": (
+                mem_stats_end["allocated_bytes.all.current"]
+                - mem_stats_start["allocated_bytes.all.current"]
+            ),
+            "reserved_bytes": (
+                mem_stats_end["reserved_bytes.all.current"]
+                - mem_stats_start["reserved_bytes.all.current"]
+            ),
+        }
 
+        if torch.distributed.get_rank() == 0:
             logger.info(
                 "> built %d cuda graph(s) in %.2f sec, with total memory usage: "
                 "allocated %s, reserved %s."