Skip to content

Commit b516c04

Browse files
committed
Make MoE models non-strict tracing friendly
1 parent 0187d5f commit b516c04

File tree

2 files changed: +261 −2 lines changed

torchtitan/experiments/graph_trainer/tests/test_trace_module.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import contextlib
78
import unittest
89
from collections import Counter
910

@@ -352,5 +353,224 @@ def test_patch_engine_restores_original(self):
352353
self.assertIs(torch.autograd._engine_run_backward, orig_fn)
353354

354355

@contextlib.contextmanager
def _use_raw_flex_attn():
    """Temporarily route FlexAttentionWrapper through uncompiled flex_attention.

    FlexAttentionWrapper normally calls a torch.compile'd flex_attention,
    and torch.compile cannot run under make_fx symbolic tracing (it raises
    "Detected that you are using FX to symbolically trace a
    dynamo-optimized function."). With the raw kernel swapped in, make_fx
    decomposes flex_attention into plain aten ops (bmm, softmax, etc.) and
    tracing succeeds.

    Note: make_fx(..., pre_dispatch=True) with raw flex_attention keeps it
    as a FlexAttentionHOP higher-order op in the graph (matching what
    torch.export produces) rather than decomposing it.
    """
    from torch.nn.attention.flex_attention import flex_attention as raw_flex_attention

    from torchtitan.models.common.attention import FlexAttentionWrapper

    saved_attn = FlexAttentionWrapper._compiled_flex_attn
    FlexAttentionWrapper._compiled_flex_attn = staticmethod(raw_flex_attention)
    try:
        yield
    finally:
        # Always restore the compiled kernel, even if the body raised.
        FlexAttentionWrapper._compiled_flex_attn = saved_attn
@unittest.skipUnless(torch.cuda.is_available(), "CUDA required")
class TestTraceModels(unittest.TestCase):
    """Bitwise-equivalence tests: eager training vs. replaying a traced graph.

    For each model family, a reference model is trained eagerly while an
    identical copy is trained by executing the make_fx-traced train step;
    losses and per-parameter gradients must match bit-for-bit every step.
    """

    # Shared hyperparameters for all model tests.
    DEVICE = "cuda"
    DTYPE = torch.float32
    BATCH_SIZE = 2
    SEQ_LEN = 128
    NUM_STEPS = 5
    LR = 1e-3

    def setUp(self):
        # Fixed seed + deterministic kernels so the eager and traced runs
        # can be compared with torch.equal (exact, bitwise).
        torch.manual_seed(42)
        torch.use_deterministic_algorithms(True)

    def tearDown(self):
        torch.use_deterministic_algorithms(False)

    def _run_bitwise_test(
        self,
        model_ref,
        model_copy,
        fwd_args,
        labels,
        check_collective_ops=False,
        num_steps=5,
        lr=1e-3,
    ):
        """Trace model_ref's train step once, then run num_steps optimizer
        steps eagerly (model_ref) and via the traced graph (model_copy),
        asserting losses and gradients are bitwise equal each step.

        fwd_args: tuple of positional forward inputs (tokens, and optionally
        attention masks); labels: target token ids for the loss.
        """
        train_step_ref = TrainStepModule(model_ref, get_loss)

        # Tracing must see the raw (uncompiled) flex_attention; see
        # _use_raw_flex_attn for why.
        with _use_raw_flex_attn():
            traced_result = trace_module(train_step_ref, (*fwd_args, labels))

        if check_collective_ops:
            # Sanity check that FSDP comms were captured into the graph.
            ag = sum(
                1
                for n in traced_result.gm.graph.nodes
                if "all_gather_into_tensor" in str(n.target)
            )
            rs = sum(
                1
                for n in traced_result.gm.graph.nodes
                if "reduce_scatter_tensor" in str(n.target)
            )
            self.assertTrue(
                ag > 0 and rs > 0,
                f"Expected collective ops in FSDP graph (ag={ag}, rs={rs})",
            )

        # Independent optimizers so the two models evolve separately.
        opt_ref = torch.optim.Adam(model_ref.parameters(), lr=lr)
        opt_copy = torch.optim.Adam(model_copy.parameters(), lr=lr)

        for step in range(1, num_steps + 1):
            # Eager reference step (raw flex_attn for parity with the trace).
            # NOTE(review): the exact scope of this context manager was
            # reconstructed from a mangled diff — confirm against the repo.
            with _use_raw_flex_attn():
                logits_ref = model_ref(*fwd_args)
            loss_ref = get_loss(logits_ref, labels)
            loss_ref.backward()
            grads_ref = [p.grad.clone() for p in model_ref.parameters()]
            opt_ref.step()
            opt_ref.zero_grad()

            # Traced step: run the captured graph against model_copy's
            # current parameters/buffers; it returns (loss, *grads).
            train_step_copy = TrainStepModule(model_copy, get_loss)
            pab = _get_params_and_buffers(train_step_copy)
            wrapped = run_traced_module(traced_result, pab, (*fwd_args, labels))
            loss_tr = wrapped[0]
            grads_tr = wrapped[1:]
            # Install the graph-produced grads so the optimizer applies them.
            for p, g in zip(model_copy.parameters(), grads_tr, strict=True):
                p.grad = g
            opt_copy.step()
            opt_copy.zero_grad()

            # Bitwise comparison — deterministic mode makes this valid.
            self.assertTrue(
                torch.equal(loss_ref, loss_tr), f"Step {step}: loss mismatch"
            )
            for gr, gt in zip(grads_ref, grads_tr, strict=True):
                self.assertTrue(torch.equal(gr, gt), f"Step {step}: grad mismatch")

    def _run_model_test(self, config_cls, model_config, use_attn_masks=False):
        """Build two identical models from (config_cls, model_config), random
        token/label batches, and run the bitwise train-step comparison.

        use_attn_masks: pass a causal flex-attention mask as a second forward
        argument (for models whose forward requires explicit masks).
        """
        vocab_size = model_config.vocab_size
        model_ref = create_model(config_cls, model_config, self.DEVICE, self.DTYPE)
        model_copy = create_model(config_cls, model_config, self.DEVICE, self.DTYPE)
        # Identical starting weights so divergence can only come from tracing.
        model_copy.load_state_dict(model_ref.state_dict())
        tokens = torch.randint(
            0, vocab_size, (self.BATCH_SIZE, self.SEQ_LEN), device=self.DEVICE
        )
        labels = torch.randint(
            0, vocab_size, (self.BATCH_SIZE, self.SEQ_LEN), device=self.DEVICE
        )

        if use_attn_masks:
            from torchtitan.models.common.attention import (
                create_attention_mask,
                get_causal_mask_mod,
            )

            attn_masks = create_attention_mask(
                get_causal_mask_mod(), 1, None, self.SEQ_LEN, self.SEQ_LEN
            )
            self._run_bitwise_test(
                model_ref,
                model_copy,
                (tokens, attn_masks),
                labels,
                num_steps=self.NUM_STEPS,
                lr=self.LR,
            )
            return

        self._run_bitwise_test(
            model_ref,
            model_copy,
            (tokens,),
            labels,
            num_steps=self.NUM_STEPS,
            lr=self.LR,
        )

    def test_llama3(self):
        from torchtitan.models.llama3 import llama3_configs, Llama3Model

        self._run_model_test(Llama3Model, llama3_configs["debugmodel"])

    def test_qwen3(self):
        from torchtitan.models.qwen3 import qwen3_configs
        from torchtitan.models.qwen3.model import Qwen3Model

        self._run_model_test(Qwen3Model, qwen3_configs["debugmodel"])

    def test_qwen3_moe(self):
        # MoE variant — exercises the custom-op path in moe/kernels.py.
        from torchtitan.models.qwen3 import qwen3_configs
        from torchtitan.models.qwen3.model import Qwen3Model

        self._run_model_test(Qwen3Model, qwen3_configs["debugmodel_moe"])

    def test_deepseek_v3(self):
        from torchtitan.models.deepseek_v3 import deepseekv3_configs
        from torchtitan.models.deepseek_v3.model import DeepSeekV3Model

        self._run_model_test(DeepSeekV3Model, deepseekv3_configs["debugmodel"])

    def test_llama4(self):
        from torchtitan.models.llama4 import llama4_configs
        from torchtitan.models.llama4.model import Llama4Model

        # Llama4's forward requires an explicit attention mask argument.
        self._run_model_test(
            Llama4Model, llama4_configs["debugmodel"], use_attn_masks=True
        )

    def test_gpt_oss(self):
        """GPT-OSS needs two masks (full causal + sliding-window causal),
        so it builds its inputs by hand instead of using _run_model_test."""
        from torch.nn.attention.flex_attention import and_masks

        from torchtitan.models.common.attention import (
            create_attention_mask,
            get_causal_mask_mod,
            get_sliding_window_mask_mod,
        )
        from torchtitan.models.gpt_oss import gptoss_configs
        from torchtitan.models.gpt_oss.model import GptOssModel

        config = gptoss_configs["debugmodel"]
        vocab_size = config.vocab_size
        model_ref = create_model(GptOssModel, config, self.DEVICE, self.DTYPE)
        model_copy = create_model(GptOssModel, config, self.DEVICE, self.DTYPE)
        model_copy.load_state_dict(model_ref.state_dict())
        tokens = torch.randint(
            0, vocab_size, (self.BATCH_SIZE, self.SEQ_LEN), device=self.DEVICE
        )
        labels = torch.randint(
            0, vocab_size, (self.BATCH_SIZE, self.SEQ_LEN), device=self.DEVICE
        )
        causal = get_causal_mask_mod()
        sw_size = config.layer.attention.sliding_window_size
        basic_mask = create_attention_mask(causal, 1, None, self.SEQ_LEN, self.SEQ_LEN)
        # Sliding-window mask = causal AND within-window.
        sliding_window_mask = create_attention_mask(
            and_masks(causal, get_sliding_window_mask_mod(sw_size)),
            1,
            None,
            self.SEQ_LEN,
            self.SEQ_LEN,
        )
        attn_masks = {
            "basic_mask": basic_mask,
            "sliding_window_mask": sliding_window_mask,
        }
        self._run_bitwise_test(
            model_ref,
            model_copy,
            (tokens, attn_masks),
            labels,
            num_steps=self.NUM_STEPS,
            lr=self.LR,
        )
355575
if __name__ == "__main__":
356576
unittest.main()

torchtitan/models/common/moe/kernels.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def _fill_indices_kernel(
6868
# ==============
6969

7070

71-
def fill_indices_wrapper(
71+
def _fill_indices_impl(
7272
tokens_per_expert_group: torch.Tensor,
7373
start_index_values: torch.Tensor,
7474
write_offsets: torch.Tensor,
@@ -77,7 +77,7 @@ def fill_indices_wrapper(
7777
max_len: int,
7878
block_size: int = 128,
7979
max_blocks: int = 1024, # cap on total number of blocks to launch
80-
):
80+
) -> torch.Tensor:
8181
# preallocate output
8282
permuted_indices = torch.full(
8383
(max_len,), -1, dtype=torch.int64, device=tokens_per_expert_group.device
@@ -104,6 +104,45 @@ def fill_indices_wrapper(
104104
return permuted_indices
105105

106106

@torch.library.custom_op("torchtitan::fill_indices", mutates_args=())
def fill_indices_wrapper(
    tokens_per_expert_group: torch.Tensor,
    start_index_values: torch.Tensor,
    write_offsets: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
    block_size: int = 128,
    max_blocks: int = 1024,
) -> torch.Tensor:
    """Custom-op wrapper around the Triton index-fill launch.

    Registering the kernel launch as a torch.library custom op makes it
    opaque to tracing, which (per this commit's intent) is what makes the
    MoE models non-strict-tracing friendly. mutates_args=() declares the
    op as functional: it only returns a fresh tensor.

    Returns a 1-D int64 tensor of length max_len with permuted token
    indices (the eager impl pre-fills it with -1 for unused slots).
    """
    return _fill_indices_impl(
        tokens_per_expert_group,
        start_index_values,
        write_offsets,
        experts_per_rank,
        num_ranks,
        max_len,
        block_size,
        max_blocks,
    )


@fill_indices_wrapper.register_fake
def _fill_indices_fake(
    tokens_per_expert_group: torch.Tensor,
    start_index_values: torch.Tensor,
    write_offsets: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
    block_size: int = 128,
    max_blocks: int = 1024,
) -> torch.Tensor:
    # Fake (meta) kernel used under FakeTensor tracing/compile: only output
    # metadata matters, so torch.empty is fine even though the real impl
    # fills the buffer with -1 before writing.
    return torch.empty(
        max_len, dtype=torch.int64, device=tokens_per_expert_group.device
    )
107146
# reference
108147
def fill_indices_cpu(
109148
tokens_per_expert_group: torch.Tensor,

0 commit comments

Comments
 (0)