Commit d008d64
feat:[AutoDeploy] Update MoE pattern matcher to drop expert selection logic (#3283)
* update matcher to match expert compute first, then extract other args with LCA (Signed-off-by: Frida Hou <[email protected]>)
* support 3D and 2D input in torch.ops.moe.trtllm_fused_moe (Signed-off-by: Frida Hou <[email protected]>)
* update custom ops to support 3D and 2D inputs (Signed-off-by: Ubuntu <[email protected]>)
* update deepseek patch (Signed-off-by: Ubuntu <[email protected]>)

Signed-off-by: Frida Hou <[email protected]>
1 parent b0ce137 commit d008d64
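A minimal, hypothetical sketch of the LCA idea referenced in the first bullet (the actual matcher change lives in the third changed file, which is not shown on this page): after matching the expert-compute subgraph, the remaining arguments can be recovered by walking back from the matched boundary nodes to their lowest common ancestor in the torch.fx graph. The helpers `ancestors_of` and `lowest_common_ancestor` below are illustrative names, not AutoDeploy APIs.

import torch.fx as fx


def ancestors_of(node: fx.Node) -> set:
    """Collect all transitive producers of `node` (illustrative helper)."""
    seen, stack = set(), [node]
    while stack:
        for inp in stack.pop().all_input_nodes:
            if inp not in seen:
                seen.add(inp)
                stack.append(inp)
    return seen


def lowest_common_ancestor(graph: fx.Graph, nodes: list) -> fx.Node:
    """Return the topologically latest node that feeds every node in `nodes`."""
    common = set.intersection(*(ancestors_of(n) for n in nodes))
    lca = None
    for n in graph.nodes:  # graph.nodes iterates in topological order
        if n in common:
            lca = n
    return lca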

File tree

3 files changed (+269, -188 lines)

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe.py (11 additions & 4 deletions)

@@ -35,7 +35,9 @@ def torch_moe(
         torch.Tensor: Output tensor with the same shape as the input x.
     """
 
-    hidden_dim = x.shape[-1]
+    x_shape = x.shape
+    hidden_dim = x_shape[-1]
+    x = x.view(-1, hidden_dim)
     num_experts = len(w1_weight)
 
     final_hidden_states = torch.zeros_like(x)
@@ -63,7 +65,7 @@ def torch_moe(
             0, top_x, current_hidden_states.to(final_hidden_states.dtype)
         )
 
-    return final_hidden_states.view_as(x)
+    return final_hidden_states.view(x_shape)
 
 
 @torch_moe.register_fake
@@ -104,6 +106,8 @@ def torch_fused_moe(
     Returns:
         torch.Tensor: Output tensor with the same shape as the input x.
     """
+    x_shape = x.shape
+    x = x.view(-1, x_shape[-1])
     num_experts = w2_stacked_weight.shape[0]
     intermediate_size = w3_w1_stacked_weight.shape[1] // 2
     results = torch.zeros_like(x)
@@ -129,7 +133,7 @@ def torch_fused_moe(
         scaling = routing_weights[batch_idx, nth_expert].unsqueeze(-1)
         results[batch_idx] += scaling * expert_out
 
-    return results.view_as(x)
+    return results.view(x_shape)
 
 
 @torch_fused_moe.register_fake
@@ -151,6 +155,9 @@ def trtllm_fused_moe(
     w3_w1_stacked_weight: torch.Tensor,
     w2_stacked_weight: torch.Tensor,
 ) -> torch.Tensor:
+    x_shape = x.shape
+    x = x.view(-1, x_shape[-1])
+
     routing_weights = routing_weights.to(torch.float32)
     selected_experts = selected_experts.to(torch.int32)
     quant_scales = []
@@ -167,7 +174,7 @@ def trtllm_fused_moe(
         tp_rank=0,
         ep_size=1,
         ep_rank=0,
-    )[0]
+    )[0].view(x_shape)
 
 
 @trtllm_fused_moe.register_fake
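The shape handling added above is a flatten-to-2D / restore idiom, so the same op body serves both [num_tokens, hidden] and [batch, seq, hidden] callers. A runnable toy sketch of just that idiom; `toy_moe` and its pass-through "expert" are made-up stand-ins, not the real torch.ops.moe kernels:

import torch


def toy_moe(x: torch.Tensor) -> torch.Tensor:
    x_shape = x.shape                # remember the caller's layout (2D or 3D)
    x = x.view(-1, x_shape[-1])      # flatten to [num_tokens, hidden_dim]
    out = torch.zeros_like(x)
    out += 2 * x                     # stand-in for the per-token expert compute
    return out.view(x_shape)         # restore the original layout


x2d = torch.randn(8, 16)             # [num_tokens, hidden]
x3d = torch.randn(2, 4, 16)          # [batch, seq, hidden]
assert toy_moe(x2d).shape == (8, 16)
assert toy_moe(x3d).shape == (2, 4, 16)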

tensorrt_llm/_torch/auto_deploy/models/deepseek.py (1 addition & 7 deletions)

@@ -129,12 +129,8 @@ def deepseek_v3_moe_exact(self, hidden_states):
 @torch.inference_mode()
 def deepseek_v3_moe(self, hidden_states):
     """DeepSeekV3MoE forward function rewritten in Mixtral style to enable torch export."""
-    identity = hidden_states
-    batch_size, sequence_length, hidden_dim = hidden_states.shape
 
     selected_experts, routing_weights, *_ = self.gate(hidden_states)
-    hidden_states = hidden_states.view(-1, hidden_dim)
-
     final_hidden_states = torch.ops.moe.torch_moe(
         hidden_states,
         selected_experts,
@@ -144,10 +140,8 @@ def deepseek_v3_moe(self, hidden_states):
         w3_weight=[expert.up_proj.weight for expert in self.experts],
     )
 
-    final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
-
     if self.config.n_shared_experts is not None:
-        final_hidden_states = final_hidden_states + self.shared_experts(identity)
+        final_hidden_states = final_hidden_states + self.shared_experts(hidden_states)
 
     return final_hidden_states.to(hidden_states.dtype)
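With the reshape handled inside the custom op, the patched forward keeps hidden_states in its original [batch, seq, hidden] layout end to end, which is why the identity/view/reshape bookkeeping above can be dropped and the shared-expert residual now adds hidden_states directly. A shape-only sketch of that contract, using hypothetical stand-ins `fake_gate` and `fake_moe` in place of self.gate and torch.ops.moe.torch_moe:

import torch

batch, seq, hidden, top_k = 2, 4, 16, 2
hidden_states = torch.randn(batch, seq, hidden)


def fake_gate(x):
    # like the gate: per-token expert ids/weights, flattened over batch*seq
    n_tokens = x.shape[0] * x.shape[1]
    ids = torch.zeros(n_tokens, top_k, dtype=torch.long)
    weights = torch.full((n_tokens, top_k), 1.0 / top_k)
    return ids, weights


def fake_moe(x, selected_experts, routing_weights):
    # the real op flattens x internally and returns it in the caller's shape
    return x


selected_experts, routing_weights = fake_gate(hidden_states)
out = fake_moe(hidden_states, selected_experts, routing_weights)
out = out + 0.1 * hidden_states      # stand-in for self.shared_experts(hidden_states)
assert out.shape == hidden_states.shape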
