Commit 9233d83

[EP] bug fixes (#1586)
Fixes a bug introduced in #1555.
1 parent 72b16b1 commit 9233d83

4 files changed: +12 −4 lines changed

torchtitan/distributed/expert_parallel.py

Lines changed: 9 additions & 3 deletions
@@ -365,13 +365,18 @@ def wrapper(
 # This class is to support Sequence Parallel for ETP=1
 # when EP borrows from all TP and part of DP
 class ReordererSequenceParallel(ParallelStyle):
+    def __init__(self):
+        super().__init__()
+        self.num_tokens = None
+
     def _prepare_inputput_fn(self, mod, inputs, device_mesh):
         top_scores, selected_experts_indices = inputs
 
         top_scores = DTensor.from_local(top_scores, device_mesh, (Replicate(),))
         selected_experts_indices = DTensor.from_local(
             selected_experts_indices, device_mesh, (Replicate(),)
         )
+        self.num_tokens = top_scores.shape[0]
 
         # TODO: If needed, we can pad tokens in case bs*slen is not divisible by TP degree
         # if top_scores.shape[0] % device_mesh.size() != 0:
@@ -380,7 +385,7 @@ def _prepare_inputput_fn(self, mod, inputs, device_mesh):
         #     n_pad = (num_tokens // tp_size + 1) * tp_size - num_tokens
         #     selected_experts_indices = F.pad(selected_experts_indices, [0, 0, 0, n_pad])
         #     top_scores = F.pad(top_scores, [0, 0, 0, n_pad])
-        assert top_scores.shape[0] % device_mesh.size() == 0
+        assert self.num_tokens % device_mesh.size() == 0
 
         # split on the bs*slen dimension
         top_scores = top_scores.redistribute(device_mesh, (Shard(0),)).to_local()
@@ -395,9 +400,10 @@ def _prepare_output_fn(self, mod, outputs, device_mesh):
 
         # NOTE: As we shard routed tokens along bs*slen dim across the TP ranks,
         # the MoE gather and scatter still require global token indices.
-        num_tokens = top_scores.shape[0]
         local_rank = device_mesh.get_local_rank()
-        token_indices_experts_sorted += num_tokens // device_mesh.size() * local_rank
+        token_indices_experts_sorted += (
+            self.num_tokens // device_mesh.size() * local_rank
+        )
 
         return top_scores, token_indices_experts_sorted, num_tokens_per_expert
 
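
The substantive change above is that the global token count along the folded bs*slen dimension is captured once in the input hook (before the redistribute to Shard(0)) and cached on self.num_tokens, so the output hook no longer derives it from top_scores.shape[0], which by that point reflects a locally sharded tensor. Below is a minimal, standalone sketch of the offset arithmetic involved; the numbers and the global_token_offset helper are illustrative stand-ins, not torchtitan code.

def global_token_offset(global_num_tokens: int, tp_degree: int, local_rank: int) -> int:
    # Each TP rank owns a contiguous slice of the folded bs*slen dimension,
    # so rank r's slice starts at r * (global_num_tokens // tp_degree).
    assert global_num_tokens % tp_degree == 0
    return global_num_tokens // tp_degree * local_rank


if __name__ == "__main__":
    global_num_tokens = 16  # bs * slen before sharding (what self.num_tokens caches)
    tp_degree = 4           # TP ranks borrowed for EP when ETP=1
    for rank in range(tp_degree):
        # Token indices produced after the shard are relative to the rank's slice;
        # adding this offset maps them back to global positions for the MoE
        # gather/scatter. Deriving the offset from a post-shard shape instead of
        # the cached global count would misplace every rank's tokens.
        print(f"rank {rank}: offset = {global_token_offset(global_num_tokens, tp_degree, rank)}")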

torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 1 addition & 1 deletion
@@ -401,7 +401,7 @@ def apply_moe_ep_tp(
         # replicate computation for the router
         "moe.router.gate": NoParallel(),
     }
-    if not etp_enabled:
+    if ep_mesh is not None and not etp_enabled:
         # If TP is borrowed for EP, then split the tokens across TP ranks so that
         # the reorderer, the all-to-all comms, and routed experts computation
         # are effectively running Sequence Parallel (split along the folded bs*slen dim)
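
The added ep_mesh is not None check means the Sequence Parallel reorderer plan is only applied when an EP mesh actually exists, instead of whenever ETP happens to be disabled. A small sketch of the guard's truth table, using hypothetical stand-in names (maybe_apply_reorderer_sp, ep_mesh, etp_enabled) rather than the real apply_moe_ep_tp signature:

from typing import Optional


def maybe_apply_reorderer_sp(ep_mesh: Optional[object], etp_enabled: bool) -> bool:
    # Shard the reorderer along the folded bs*slen dim only when EP is in use
    # (an EP mesh exists) and expert tensor parallelism is not, i.e. ETP=1
    # where EP borrows the TP ranks.
    return ep_mesh is not None and not etp_enabled


# No EP mesh: skip, even though ETP is disabled (the case this guard handles).
assert not maybe_apply_reorderer_sp(None, etp_enabled=False)
# EP in use and ETP disabled: apply the ReordererSequenceParallel plan.
assert maybe_apply_reorderer_sp(object(), etp_enabled=False)
# ETP enabled: skip (this path is only for ETP=1).
assert not maybe_apply_reorderer_sp(object(), etp_enabled=True)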

torchtitan/experiments/llama4/train_configs/debug_model.toml

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ fsdp_reshard_after_forward = "default" # default / never / always
 tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
 pipeline_parallel_degree = 1
+pipeline_parallel_schedule = "1F1B"
 context_parallel_degree = 1
 expert_parallel_degree = 1
 expert_tensor_parallel_degree = 1

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
 pipeline_parallel_degree = 1
 pipeline_parallel_schedule = "1F1B"
+context_parallel_degree = 1
 expert_parallel_degree = 1
 expert_tensor_parallel_degree = 1
 