
Commit 78f2cbb

achartier authored and dominicshanshan committed
[None][fix] Disable DeepGEMM for Qwen3 MoE Attention layers (NVIDIA#8087)
Signed-off-by: Aurelien Chartier <[email protected]>
1 parent 19421f4 commit 78f2cbb

File tree

2 files changed: +3 -0 lines changed


tensorrt_llm/_torch/models/modeling_qwen3.py

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,7 @@ def __init__(
         fuse_qk_norm_rope: bool = True,
         attn_output_gate: bool = False,
         use_gemma_rms_norm: bool = False,
+        disable_deep_gemm: bool = False,
     ):
         config = model_config.pretrained_config
         self.pretrained_config = config
@@ -71,6 +72,7 @@ def __init__(
             config=model_config,
             attn_output_gate=self.attn_output_gate,
             use_gemma_rms_norm=use_gemma_rms_norm,
+            disable_deep_gemm=disable_deep_gemm,
         )


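For context, this diff follows a common flag-threading pattern: the attention module accepts a boolean in its constructor and forwards it down to wherever the GEMM backend is actually chosen. The consumer side is not shown in this commit, so the sketch below is only a minimal illustration of the pattern; apart from `disable_deep_gemm` itself, every name (`pick_gemm_backend`, `deep_gemm_matmul`, `AttentionSketch`, `MoeDecoderLayerSketch`) is hypothetical and not TensorRT-LLM's real API.

import torch


def deep_gemm_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Stand-in for a DeepGEMM kernel call; the real library would run a
    # specialized GEMM here. Hypothetical, for illustration only.
    return torch.matmul(a, b)


def pick_gemm_backend(disable_deep_gemm: bool):
    # The flag decides which GEMM implementation the layer will use.
    if disable_deep_gemm:
        return torch.matmul  # plain ATen/cuBLAS fallback path
    return deep_gemm_matmul


class AttentionSketch(torch.nn.Module):
    def __init__(self, hidden: int, disable_deep_gemm: bool = False):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(hidden, hidden))
        # As in the diff above, the constructor merely forwards the flag
        # to the point where the backend is selected.
        self.gemm = pick_gemm_backend(disable_deep_gemm)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.gemm(x, self.weight)


class MoeDecoderLayerSketch(torch.nn.Module):
    def __init__(self, hidden: int):
        super().__init__()
        # Mirrors modeling_qwen3_moe.py below: the MoE decoder layer
        # hard-codes the flag to True so its attention never routes
        # through DeepGEMM.
        self.self_attn = AttentionSketch(hidden, disable_deep_gemm=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.self_attn(x)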

tensorrt_llm/_torch/models/modeling_qwen3_moe.py

Lines changed: 1 addition & 0 deletions
@@ -167,6 +167,7 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig],
         self.self_attn = Qwen3Attention(
             model_config,
             layer_idx=layer_idx,
+            disable_deep_gemm=True,
         )
         self.mapping = model_config.mapping
         self.enable_attention_dp = self.mapping.enable_attention_dp
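A quick sanity check of the sketch shown after the first file's diff (same hypothetical names): the MoE layer's hard-coded flag should force its attention onto the fallback GEMM path.

# Exercise the sketch: the hard-coded flag forces the fallback backend.
layer = MoeDecoderLayerSketch(hidden=64)
x = torch.randn(2, 64)
out = layer(x)
assert out.shape == (2, 64)
assert layer.self_attn.gemm is torch.matmul  # DeepGEMM path disabled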
