Skip to content

Commit e3d3ba8

Browse files
yiming0416 and mori360
authored and committed
[graph_trainer] Log transformed graph to tlparse via trace_structured (pytorch#2619)
This PR adds `tlparse_log_graph_pass` that logs post-transform forward/backward graphs to tlparse, replacing `logger.debug(gm.print_readable(...))` calls.
1 parent 44104d9 commit e3d3ba8

File tree

3 files changed

+56
-16
lines changed

3 files changed

+56
-16
lines changed

torchtitan/experiments/graph_trainer/cudagraph.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,13 @@ def __init__(
145145
# (debug only) whether check static input tensor addresses during runtime
146146
self._should_check_address = should_check_address
147147

148+
self._gm = runnable if isinstance(runnable, torch.fx.GraphModule) else None
149+
150+
def print_readable(self, *args, **kwargs):
151+
"""Delegate to the inner GraphModule's print_readable."""
152+
assert self._gm is not None, "print_readable requires a GraphModule runnable"
153+
return self._gm.print_readable(*args, **kwargs)
154+
148155
def _copy_non_static_inputs(self, *args):
149156
for i in self._input_indices_to_copy:
150157
self._args[i].copy_(args[i])

torchtitan/experiments/graph_trainer/graph_utils.py

Lines changed: 10 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -67,12 +67,6 @@ def export_joint(
6767
torch.fx.traceback.preserve_node_meta(),
6868
):
6969
gm = dynamo_graph_capture_for_export(model)(*args, **kwargs)
70-
logger.debug("Dynamo gm:")
71-
logger.debug(
72-
gm.print_readable(
73-
print_output=False, include_stride=True, include_device=True
74-
)
75-
)
7670
_dump_gm(dump_folder, gm, "dynamo_gm")
7771

7872
tracing_context = gm.meta["tracing_context"]
@@ -288,10 +282,6 @@ def compiler(
288282
if passes is None:
289283
passes = DEFAULT_COMPILER_PASSES
290284

291-
logger.debug(f"{name} before compiler:")
292-
logger.debug(
293-
gm.print_readable(print_output=False, include_stride=True, include_device=True)
294-
)
295285
_dump_gm(dump_folder, gm, f"{name}_before_compiler")
296286

297287
if end_with_pass(passes, ["cudagraph_pass"]):
@@ -317,14 +307,18 @@ def compiler(
317307
# Only try to print/dump if gm is still a GraphModule
318308
# (compile_fx_inner returns a CompiledFxGraph which doesn't have print_readable)
319309
if hasattr(gm, "print_readable"):
320-
logger.debug(f"{name} after compiler:")
321-
logger.debug(
322-
gm.print_readable(
323-
print_output=False, include_stride=True, include_device=True
324-
)
325-
)
326310
_dump_gm(dump_folder, gm, f"{name}_after_compiler")
327311

312+
# Log the final transformed graph to tlparse.
313+
from torchtitan.experiments.graph_trainer.passes import tlparse_log_graph_pass
314+
315+
graph_name = (
316+
"aot_forward_graph_transformed"
317+
if is_forward
318+
else "aot_backward_graph_transformed"
319+
)
320+
tlparse_log_graph_pass(gm, example_inputs, graph_name=graph_name)
321+
328322
return gm
329323

330324

torchtitan/experiments/graph_trainer/passes.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
)
2828
from torch._inductor.fx_passes.overlap_manual_scheduling import manual_overlap_bucketing
2929
from torch._inductor.fx_passes.overlap_scheduling import schedule_overlap_bucketing
30+
from torch._logging import trace_structured
3031
from torch.fx.passes.regional_inductor import regional_inductor
3132
from torch.utils.checkpoint import CheckpointPolicy
3233

@@ -407,6 +408,44 @@ def reassign_to_pg_pass(
407408
return gm
408409

409410

411+
def tlparse_log_graph_pass(
412+
gm: torch.fx.GraphModule,
413+
example_inputs: Sequence[Any],
414+
*,
415+
graph_name: str,
416+
) -> torch.fx.GraphModule:
417+
"""Log the transformed graph to tlparse via trace_structured.
418+
419+
This pass should be added as the last transform in fwd/bwd_transforms
420+
so that the logged graph reflects all prior transformations.
421+
422+
Args:
423+
gm: The graph module to log.
424+
example_inputs: The example inputs (unused, required by protocol).
425+
graph_name: The name for this graph artifact
426+
(e.g. "aot_forward_graph_transformed").
427+
428+
Returns:
429+
The graph module unchanged.
430+
"""
431+
trace_structured(
432+
"artifact",
433+
metadata_fn=lambda: {
434+
"name": graph_name,
435+
"encoding": "string",
436+
},
437+
payload_fn=lambda: gm.print_readable(
438+
print_output=False,
439+
include_stride=True,
440+
include_device=True,
441+
expanded_def=True,
442+
),
443+
expect_trace_id=False,
444+
)
445+
446+
return gm
447+
448+
410449
# Registry mapping pass names to pass functions (for AOT mode fwd/bwd passes)
411450
AVAILABLE_COMPILER_PASSES = {
412451
"auto_bucketing": autobucketing_reordering_pass,

0 commit comments

Comments (0)