Skip to content

Commit 3d4425e

Browse files
authored
[graph_trainer] Annotate ac region id for transformer blocks (#2609)
Without per-transformer-block AC region IDs, the min-cut partitioner sees the entire model as a single region. In practice, the partitioner can still rely on existing `MUST_SAVE` nodes as anchors to limit recomputation scope. But recomputation could trace all the way back to the beginning of the model when it doesn't hit a `MUST_SAVE` node. By assigning a unique `ac_graph_id` to each transformer block, the partitioner is forced to `MUST_SAVE` at region boundaries (i.e., between transformer blocks). This ensures recomputation during the backward pass is always contained within a single block. This PR: - Adds `annotate_ac_regions()` to tag each transformer block's forward with a unique `ac_region_id`. - Updates `apply_sac_pass` to read the `ac_region_id` from node custom metadata and set it as the `ac_graph_id`.
1 parent 87920ca commit 3d4425e

File tree

5 files changed

+124
-35
lines changed

5 files changed

+124
-35
lines changed

torchtitan/experiments/graph_trainer/common_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,30 @@
88

99
import torch
1010
import torch.distributed as dist
11+
import torch.nn as nn
1112
from torch.distributed.tensor import DTensor, Replicate
13+
from torch.fx.traceback import annotate_fn
1214
from torch.utils._pytree import register_pytree_node, tree_map
1315

1416
from torchtitan.config import CompileConfig
1517
from torchtitan.distributed import ParallelDims
1618
from torchtitan.tools.logging import logger
1719

20+
_AC_REGION_ID = "ac_region_id"
21+
22+
23+
def annotate_ac_regions(model: nn.Module) -> None:
24+
"""Annotate each transformer block with a unique AC region ID.
25+
26+
This enables apply_sac_pass to assign different ac_graph_id values
27+
per block, creating AC region boundaries between transformer blocks.
28+
"""
29+
layers = model.get_submodule("layers")
30+
for layer_id, transformer_block in layers.named_children():
31+
transformer_block.forward = annotate_fn({_AC_REGION_ID: int(layer_id)})(
32+
transformer_block.forward
33+
)
34+
1835

1936
def parallelize_inputs(parallel_dims, args, kwargs):
2037
if not parallel_dims.tp_enabled:

torchtitan/experiments/graph_trainer/deepseek_v3/parallelize.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
import torch.nn as nn
87
from torch.distributed.device_mesh import DeviceMesh
98
from torch.fx.traceback import annotate_fn
109

@@ -20,8 +19,14 @@
2019

2120
from torchtitan.distributed.activation_checkpoint import apply_ac
2221
from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
23-
from torchtitan.experiments.graph_trainer.common_utils import maybe_disable_eager_ac
22+
from torchtitan.experiments.graph_trainer.common_utils import (
23+
annotate_ac_regions,
24+
maybe_disable_eager_ac,
25+
)
2426
from torchtitan.experiments.graph_trainer.compile import apply_compile
27+
from torchtitan.experiments.graph_trainer.deepseek_v3.model import (
28+
GraphTrainerDeepSeekV3Model,
29+
)
2530
from torchtitan.experiments.graph_trainer.simple_fsdp import (
2631
data_parallel,
2732
MixedPrecisionPolicy,
@@ -31,7 +36,7 @@
3136
from torchtitan.tools.logging import logger
3237

3338

34-
def annotate_deepseekv3() -> None:
39+
def annotate_deepseekv3(model: GraphTrainerDeepSeekV3Model) -> None:
3540
"""Attach annotations to FX graph nodes with ``torch.fx.traceback.annotate_fn``
3641
3742
- Expert Parallel (EP) annotations: Tags "dispatch", "combine", and "compute"
@@ -40,6 +45,9 @@ def annotate_deepseekv3() -> None:
4045
{"compile_with_inductor": "flex_attention"} so the compiler can apply
4146
regional inductor pass based on the annotation. Regional inductor is now only
4247
supported in AOT mode.
48+
- AC region annotation: Tags each transformer block's forward with a unique
49+
ac_region_id so that apply_sac_pass can assign per-block ac_graph_id
50+
boundaries for the min-cut partitioner.
4351
4452
"""
4553
from torchtitan.distributed.expert_parallel import ExpertParallel
@@ -58,10 +66,12 @@ def annotate_deepseekv3() -> None:
5866
{"compile_with_inductor": "flex_attention"}
5967
)(FlexAttentionWrapper.forward)
6068

69+
annotate_ac_regions(model)
70+
6171

6272
# Adapted from llama4/infra/parallelize.py
6373
def parallelize_deepseekv3(
64-
model: nn.Module,
74+
model: GraphTrainerDeepSeekV3Model,
6575
*,
6676
parallel_dims: ParallelDims,
6777
training: TrainingConfig,
@@ -87,7 +97,7 @@ def parallelize_deepseekv3(
8797
):
8898
raise NotImplementedError("CP support is only supported for SDPA.")
8999

90-
annotate_deepseekv3()
100+
annotate_deepseekv3(model)
91101

92102
maybe_disable_eager_ac(compile_config, ac_config)
93103

torchtitan/experiments/graph_trainer/llama3/parallelize.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,20 @@
1818
from torchtitan.distributed import ParallelDims
1919
from torchtitan.distributed.activation_checkpoint import apply_ac
2020
from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
21-
from torchtitan.experiments.graph_trainer.common_utils import maybe_disable_eager_ac
21+
from torchtitan.experiments.graph_trainer.common_utils import (
22+
annotate_ac_regions,
23+
maybe_disable_eager_ac,
24+
)
2225
from torchtitan.experiments.graph_trainer.compile import apply_compile
26+
from torchtitan.experiments.graph_trainer.llama3.model import GraphTrainerLlama3Model
2327
from torchtitan.experiments.graph_trainer.simple_fsdp import (
2428
data_parallel,
2529
MixedPrecisionPolicy,
2630
)
27-
from torchtitan.models.llama3.model import Llama3Model
2831
from torchtitan.models.llama3.parallelize import apply_tp
2932
from torchtitan.protocols.model_converter import ModelConvertersContainer
3033
from torchtitan.tools.logging import logger
3134

32-
3335
# for selective op activation checkpointing
3436
_op_sac_save_list = {
3537
torch.ops.aten.mm.default,
@@ -50,24 +52,29 @@
5052
}
5153

5254

53-
def annotate_llama() -> None:
55+
def annotate_llama(model: GraphTrainerLlama3Model) -> None:
    """Attach annotations to FX graph nodes with ``torch.fx.traceback.annotate_fn``

    - Flex attention annotation: Tags FlexAttentionWrapper.forward with
      {"compile_with_inductor": "flex_attention"} so the compiler can apply
      regional inductor pass based on the annotation. Regional inductor is now only
      supported in AOT mode.
    - AC region annotation: Tags each transformer block's forward with a unique
      ac_region_id so that apply_sac_pass can assign per-block ac_graph_id
      boundaries for the min-cut partitioner.
    """
    from torchtitan.models.common.attention import FlexAttentionWrapper

    # Mark flex-attention nodes so the regional inductor pass can find them.
    flex_meta = {"compile_with_inductor": "flex_attention"}
    FlexAttentionWrapper.forward = annotate_fn(flex_meta)(
        FlexAttentionWrapper.forward
    )

    annotate_ac_regions(model)
74+
6875

6976
def parallelize_llama(
70-
model: Llama3Model,
77+
model: GraphTrainerLlama3Model,
7178
*,
7279
parallel_dims: ParallelDims,
7380
training: TrainingConfig,
@@ -94,7 +101,7 @@ def parallelize_llama(
94101
({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
95102
"""
96103

97-
annotate_llama()
104+
annotate_llama(model)
98105

99106
maybe_disable_eager_ac(compile_config, ac_config)
100107

torchtitan/experiments/graph_trainer/passes.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
- Compiler passes: Applied to the partitioned forward/backward graphs
1616
"""
1717
import operator
18+
from collections import defaultdict
1819
from collections.abc import Sequence
1920
from typing import Any
2021

@@ -29,6 +30,7 @@
2930
from torch.fx.passes.regional_inductor import regional_inductor
3031
from torch.utils.checkpoint import CheckpointPolicy
3132

33+
from torchtitan.experiments.graph_trainer.common_utils import _AC_REGION_ID
3234
from torchtitan.experiments.graph_trainer.reshard_after_forward import (
3335
annotate_fsdp_all_gather,
3436
)
@@ -182,6 +184,9 @@ def apply_sac_pass(
182184
op_list_to_save = _get_default_sac_save_ops()
183185

184186
mm_count = 0
187+
ac_region_stats: dict[int, dict[str, int]] = defaultdict(
188+
lambda: {"save": 0, "recompute": 0}
189+
)
185190

186191
for node in gm.graph.nodes:
187192
if node.op != "call_function":
@@ -205,25 +210,37 @@ def apply_sac_pass(
205210
node.meta["ac_graph_id"] = parent.meta.get("ac_graph_id", 0)
206211
continue
207212

208-
node.meta["ac_graph_id"] = 0
213+
custom_meta = node.meta.get("custom", {})
214+
ac_region_id = custom_meta.get(_AC_REGION_ID, 0)
215+
node.meta["ac_graph_id"] = ac_region_id
209216

210217
if node.target is torch.ops.aten.mm.default:
211218
mm_count += 1
212219
# Save every odd mm, recompute every even mm
213220
if mm_count % 2 == 0:
214-
node.meta["recompute"] = CheckpointPolicy.PREFER_RECOMPUTE
221+
policy = CheckpointPolicy.PREFER_RECOMPUTE
215222
else:
216-
node.meta["recompute"] = CheckpointPolicy.MUST_SAVE
223+
policy = CheckpointPolicy.MUST_SAVE
217224
elif node.target in op_list_to_save:
218-
node.meta["recompute"] = CheckpointPolicy.MUST_SAVE
225+
policy = CheckpointPolicy.MUST_SAVE
226+
else:
227+
policy = CheckpointPolicy.PREFER_RECOMPUTE
228+
229+
node.meta["recompute"] = policy
230+
if policy == CheckpointPolicy.MUST_SAVE:
231+
ac_region_stats[ac_region_id]["save"] += 1
219232
else:
220-
node.meta["recompute"] = CheckpointPolicy.PREFER_RECOMPUTE
233+
ac_region_stats[ac_region_id]["recompute"] += 1
221234

222235
gm.recompile()
223-
logger.info(
224-
"Applied selective activation checkpointing (SAC) graph pass "
225-
f"({mm_count} mm ops found, {mm_count - mm_count // 2} saved)"
226-
)
236+
logger.info("Applied selective activation checkpointing (SAC) graph pass.")
237+
for ac_region_id in sorted(ac_region_stats):
238+
stats = ac_region_stats[ac_region_id]
239+
logger.info(
240+
f" AC region {ac_region_id}: "
241+
f"{stats['save']} nodes annotated with MUST_SAVE, "
242+
f"{stats['recompute']} nodes annotated with PREFER_RECOMPUTE"
243+
)
227244
return gm
228245

229246

torchtitan/experiments/graph_trainer/tests/test_passes.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from torch.utils.checkpoint import checkpoint, CheckpointPolicy
1919

2020
from torchtitan.distributed import ParallelDims
21+
from torchtitan.experiments.graph_trainer.common_utils import _AC_REGION_ID
2122
from torchtitan.experiments.graph_trainer.graph_utils import export_joint
2223
from torchtitan.experiments.graph_trainer.passes import (
2324
apply_sac_pass,
@@ -215,11 +216,16 @@ def _build_gm(self, op_targets):
215216
x = graph.placeholder("x")
216217
y = graph.placeholder("y")
217218
last = x
218-
for target in op_targets:
219+
for i, target in enumerate(op_targets):
219220
if target is operator.getitem:
220221
last = graph.call_function(target, args=(last, 0))
221222
else:
222223
last = graph.call_function(target, args=(last, y))
224+
# If the next op is getitem, wrap in a tuple so getitem has
225+
# a proper tuple/list input.
226+
if i + 1 < len(op_targets) and op_targets[i + 1] is operator.getitem:
227+
_make_tuple = lambda x: (x, x)
228+
last = graph.call_function(_make_tuple, args=(last,))
223229
graph.output(last)
224230
return torch.fx.GraphModule(torch.nn.Module(), graph)
225231

@@ -248,41 +254,55 @@ def test_save_ops_marked_must_save(self):
248254
self.assertEqual(len(nodes), 1)
249255
self.assertEqual(nodes[0].meta["recompute"], CheckpointPolicy.MUST_SAVE)
250256

251-
def test_getitem_propagates_parent_tag(self):
252-
"""operator.getitem nodes should inherit the parent's recompute tag."""
257+
def test_getitem_propagates_parent_tags(self):
258+
"""operator.getitem nodes should inherit the parent's recompute tag and ac_graph_id."""
253259
gm = self._build_gm(
254260
[
255261
torch.ops.aten.add.Tensor,
256262
operator.getitem,
257263
torch.ops.aten.relu.default,
258264
]
259265
)
260-
apply_sac_pass(gm)
261266
nodes = self._get_call_function_nodes(gm)
262-
add_node = nodes[0]
263-
getitem_node = nodes[1]
264-
self.assertEqual(add_node.target, torch.ops.aten.add.Tensor)
265-
self.assertEqual(getitem_node.target, operator.getitem)
266-
self.assertEqual(getitem_node.meta["recompute"], add_node.meta["recompute"])
267-
268-
def test_wait_tensor_propagates_parent_tag(self):
269-
"""wait_tensor nodes should inherit the parent's recompute tag."""
267+
# nodes: [add, make_tuple, getitem, relu]
268+
# make_tuple is the tuple-returning parent of getitem
269+
self.assertEqual(nodes[0].target, torch.ops.aten.add.Tensor)
270+
self.assertEqual(nodes[2].target, operator.getitem)
271+
272+
# Set ac_region_id on the tuple-returning parent (the direct parent of getitem)
273+
nodes[1].meta["custom"] = {_AC_REGION_ID: 3}
274+
275+
apply_sac_pass(gm)
276+
277+
tuple_node = nodes[1]
278+
getitem_node = nodes[2]
279+
self.assertEqual(getitem_node.meta["recompute"], tuple_node.meta["recompute"])
280+
self.assertEqual(tuple_node.meta["ac_graph_id"], 3)
281+
self.assertEqual(getitem_node.meta["ac_graph_id"], 3)
282+
283+
def test_wait_tensor_propagates_parent_tags(self):
284+
"""wait_tensor nodes should inherit the parent's recompute tag and ac_graph_id."""
270285
custom_save = {torch.ops._c10d_functional.reduce_scatter_tensor.default}
271286
gm = self._build_gm(
272287
[
273288
torch.ops._c10d_functional.reduce_scatter_tensor.default,
274289
torch.ops._c10d_functional.wait_tensor.default,
275290
]
276291
)
277-
apply_sac_pass(gm, op_list_to_save=custom_save)
278292
nodes = self._get_call_function_nodes(gm)
293+
nodes[0].meta["custom"] = {_AC_REGION_ID: 3}
294+
295+
apply_sac_pass(gm, op_list_to_save=custom_save)
296+
279297
rs_node = nodes[0]
280298
wait_node = nodes[1]
281299
self.assertEqual(rs_node.meta["recompute"], CheckpointPolicy.MUST_SAVE)
282300
self.assertEqual(wait_node.meta["recompute"], CheckpointPolicy.MUST_SAVE)
301+
self.assertEqual(rs_node.meta["ac_graph_id"], 3)
302+
self.assertEqual(wait_node.meta["ac_graph_id"], 3)
283303

284-
def test_ac_graph_id_set(self):
285-
"""All annotated nodes should have ac_graph_id = 0."""
304+
def test_ac_graph_id_defaults_to_zero(self):
305+
"""Nodes without ac_region_id annotation should have ac_graph_id = 0."""
286306
gm = self._build_gm(
287307
[
288308
torch.ops.aten.add.Tensor,
@@ -295,6 +315,24 @@ def test_ac_graph_id_set(self):
295315
if node.target is not operator.getitem:
296316
self.assertEqual(node.meta["ac_graph_id"], 0)
297317

318+
def test_ac_graph_id_from_annotation(self):
319+
"""Nodes with _AC_REGION_ID_KEY in custom metadata should use that as ac_graph_id."""
320+
gm = self._build_gm(
321+
[
322+
torch.ops.aten.add.Tensor,
323+
torch.ops.aten.relu.default,
324+
]
325+
)
326+
nodes = self._get_call_function_nodes(gm)
327+
# Simulate annotate_fn setting custom metadata on different nodes
328+
nodes[0].meta["custom"] = {_AC_REGION_ID: 1}
329+
nodes[1].meta["custom"] = {_AC_REGION_ID: 2}
330+
331+
apply_sac_pass(gm)
332+
333+
self.assertEqual(nodes[0].meta["ac_graph_id"], 1)
334+
self.assertEqual(nodes[1].meta["ac_graph_id"], 2)
335+
298336
def test_custom_op_list_to_save(self):
299337
"""A custom op_list_to_save should override the defaults."""
300338
custom_save = {torch.ops.aten.relu.default}

0 commit comments

Comments
 (0)