set full to zero bubble

mori360 · mori360 · commit 674852ac1514 · 2026-03-11T18:58:32.000-07:00
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
@@ -147,6 +147,7 @@ def build_features_test_list() -> list[OverrideDefinitions]:
                 [
                     "--parallelism.pipeline_parallel_degree 4",
                     "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
+                    "--activation_checkpoint.mode full",
                 ],
             ],
             "PP looped zero bubble test",
@@ -158,6 +159,7 @@ def build_features_test_list() -> list[OverrideDefinitions]:
                 [
                     "--parallelism.pipeline_parallel_degree 2",
                     "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
+                    "--activation_checkpoint.mode full",
                 ],
             ],
             "PP zero bubble test (v shaped)",
diff --git a/tests/unit_tests/test_activation_checkpoint.py b/tests/unit_tests/test_activation_checkpoint.py
@@ -280,23 +280,50 @@ def run_fwd_bwd(model, batch):
             torch.testing.assert_close(g_ref, g_fl)
 
     def test_skip_mm_fqns(self):
-        """Test that per_op_sac_skip_mm_fqns excludes matched linears from alternation."""
-
-        def get_bw_flops(model_fn):
-            x = torch.randn(512, 512, requires_grad=True)
-            out = model_fn(x)
-            out.backward()
-
-            x = torch.randn(512, 512, requires_grad=True)
-            out = model_fn(x)
-            with FlopCounterMode(display=False) as mode:
+        """Test that per_op_sac_skip_mm_fqns controls exactly which matmuls
+        are recomputed vs stored during backward.
+
+        Approach: during backward, we count aten.mm calls per weight tensor.
+        Each Linear's weight participates in exactly one gradient mm (grad_input).
+        If the Linear's forward mm was recomputed, the weight also appears in the
+        recomputed forward mm, giving count=2. If stored, count=1.
+        """
+        from torch.utils._python_dispatch import TorchDispatchMode
+
+        class MmWeightTracker(TorchDispatchMode):
+            def __init__(self, weight_data_ptrs):
+                super().__init__()
+                self._ptrs = weight_data_ptrs
+                self.counts = {name: 0 for name in weight_data_ptrs.values()}
+
+            def __torch_dispatch__(self, func, types, args, kwargs=None):
+                if func == torch.ops.aten.mm.default:
+                    for arg in args:
+                        name = self._ptrs.get(arg.data_ptr())
+                        if name is not None:
+                            self.counts[name] += 1
+                            break
+                return func(*args, **(kwargs or {}))
+
+        def is_recomputed(model):
+            """Return {linear_short_name: bool} — True means recomputed."""
+            ptr_to_name = {}
+            for fqn, mod in model.named_modules():
+                if isinstance(mod, nn.Linear):
+                    ptr_to_name[mod.weight.data_ptr()] = fqn.rsplit(".", 1)[-1]
+
+            x = torch.randn(64, 512, requires_grad=True)
+            out = model(x)
+            tracker = MmWeightTracker(ptr_to_name)
+            with tracker:
                 out.backward()
-            return mode.get_total_flops() / (512**3 * 2)
+            return {name: count == 2 for name, count in tracker.counts.items()}
 
-        # Without skip: all 3 linears participate in the alternating counter.
-        model_no_skip = ToyModule()
+        # Baseline SAC — alternating "save every other mm":
+        # gate(1st→saved), wq(2nd→recomputed), output(3rd→saved)
+        m = ToyModule()
         apply_ac(
-            model_no_skip,
+            m,
             ACConfig(
                 mode="selective",
                 per_op_sac_force_recompute_mm_shapes_by_fqns=[],
@@ -305,13 +332,16 @@ def get_bw_flops(model_fn):
             ),
             model_compile_enabled=False,
         )
-        flops_no_skip = get_bw_flops(model_no_skip)
-
-        # With skip on "moe": moe.router.gate is excluded from the alternating
-        # counter and always recomputed.
-        model_with_skip = ToyModule()
+        r = is_recomputed(m)
+        self.assertFalse(r["gate"], "gate should be stored (1st in alternation)")
+        self.assertTrue(r["wq"], "wq should be recomputed (2nd in alternation)")
+        self.assertFalse(r["output"], "output should be stored (3rd in alternation)")
+
+        # skip="moe" — gate excluded from alternation (always recomputed).
+        # Remaining alternation: wq(1st→saved), output(2nd→recomputed)
+        m = ToyModule()
         apply_ac(
-            model_with_skip,
+            m,
             ACConfig(
                 mode="selective",
                 per_op_sac_force_recompute_mm_shapes_by_fqns=[],
@@ -320,45 +350,28 @@ def get_bw_flops(model_fn):
             ),
             model_compile_enabled=False,
         )
-        flops_with_skip = get_bw_flops(model_with_skip)
-
-        self.assertNotEqual(flops_no_skip, flops_with_skip)
-
-    def test_skip_mm_fqns_correctness(self):
-        """Test that skip_mm_fqns produces correct gradients."""
-        model_ref = ToyModule()
-
-        model_skip = ToyModule()
-        model_skip.load_state_dict(model_ref.state_dict())
+        r = is_recomputed(m)
+        self.assertTrue(r["gate"], "gate should be recomputed (skipped)")
+        self.assertFalse(r["wq"], "wq should be stored (1st in alternation)")
+        self.assertTrue(r["output"], "output should be recomputed (2nd in alternation)")
+
+        # skip="attention" — wq excluded from alternation (always recomputed).
+        # Remaining alternation: gate(1st→saved), output(2nd→recomputed)
+        m = ToyModule()
         apply_ac(
-            model_skip,
+            m,
             ACConfig(
                 mode="selective",
                 per_op_sac_force_recompute_mm_shapes_by_fqns=[],
-                per_op_sac_skip_mm_fqns=["moe"],
+                per_op_sac_skip_mm_fqns=["attention"],
+                early_stop=False,
             ),
             model_compile_enabled=False,
         )
-
-        batch = torch.randn(64, 512)
-
-        # Reference: no AC
-        model_ref.zero_grad(set_to_none=True)
-        x_ref = batch.clone().detach().requires_grad_(True)
-        out_ref = model_ref(x_ref)
-        out_ref.backward()
-
-        # With skip AC
-        model_skip.zero_grad(set_to_none=True)
-        x_skip = batch.clone().detach().requires_grad_(True)
-        out_skip = model_skip(x_skip)
-        out_skip.backward()
-
-        torch.testing.assert_close(out_ref.detach(), out_skip.detach())
-        torch.testing.assert_close(x_ref.grad, x_skip.grad)
-        for p_ref, p_skip in zip(model_ref.parameters(), model_skip.parameters()):
-            if p_ref.grad is not None and p_skip.grad is not None:
-                torch.testing.assert_close(p_ref.grad, p_skip.grad)
+        r = is_recomputed(m)
+        self.assertFalse(r["gate"], "gate should be stored (1st in alternation)")
+        self.assertTrue(r["wq"], "wq should be recomputed (skipped)")
+        self.assertTrue(r["output"], "output should be recomputed (2nd in alternation)")
 
 
 if __name__ == "__main__":
diff --git a/torchtitan/distributed/activation_checkpoint.py b/torchtitan/distributed/activation_checkpoint.py
@@ -90,6 +90,9 @@ def _resolve_ops(op_specs: list) -> dict:
     # DeepEP (available when deepep is installed)
     (torch.ops, "deepep.dispatch.default"),
     (torch.ops, "deepep.combine.default"),
+    # HybridEP (available when hybridep is installed)
+    (torch.ops, "hybridep.dispatch.default"),
+    (torch.ops, "hybridep.combine.default"),
 ]
 
 
diff --git a/torchtitan/experiments/graph_trainer/tests/integration_tests.py b/torchtitan/experiments/graph_trainer/tests/integration_tests.py
@@ -48,7 +48,6 @@ def _build_llama3_tests() -> list[OverrideDefinitions]:
                     "--config graph_trainer_llama3_debugmodel",
                     "--compile.mode jit",
                     "--activation_checkpoint.mode selective",
-                    "--activation_checkpoint.selective_ac_option op",
                 ],
             ],
             "JIT 1D with selective op AC",
diff --git a/torchtitan/models/deepseek_v3/parallelize.py b/torchtitan/models/deepseek_v3/parallelize.py
@@ -104,8 +104,6 @@ def parallelize_deepseekv3(
 
         else:
             import torchtitan.distributed.deepep  # noqa: F401
-    else:
-        use_deepep = False
 
     if parallel_dims.tp_enabled or parallel_dims.ep_enabled:
         dual_pipe_v = get_dual_pipe_v_flag(
diff --git a/torchtitan/models/llama4/parallelize.py b/torchtitan/models/llama4/parallelize.py
@@ -133,9 +133,6 @@ def parallelize_llama(
         else:
             import torchtitan.distributed.deepep  # noqa: F401
 
-    else:
-        use_deepep = False
-
     if parallel_dims.tp_enabled or parallel_dims.ep_enabled:
         dual_pipe_v = get_dual_pipe_v_flag(
             parallelism=parallelism, ac_config=ac_config, parallel_dims=parallel_dims

Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,9 @@ def _resolve_ops(op_specs: list) -> dict:`
`90`	`90`	`# DeepEP (available when deepep is installed)`
`91`	`91`	`(torch.ops, "deepep.dispatch.default"),`
`92`	`92`	`(torch.ops, "deepep.combine.default"),`
	`93`	`+ # HybridEP (available when hybridep is installed)`
	`94`	`+ (torch.ops, "hybridep.dispatch.default"),`
	`95`	`+ (torch.ops, "hybridep.combine.default"),`
`93`	`96`	`]`
`94`	`97`
`95`	`98`