Skip to content

Commit 8f624e9

Browse files
peaceh-nv and claude committed
[TRTLLM-11289][feat] Replace DeepSeek router GEMM with CuTe DSL BF16 GEMM (FP32 output)
Enable CuTe DSL BF16 GEMM kernel for DeepseekV3Gate router GEMM on Blackwell. The router computes BF16 input @ BF16 weight -> FP32 logits, which our persistent GEMM kernel already supports via FP32 accumulator and FP32 output. Key changes: - Support FP32 output dtype in CuteDSLBf16BlackwellGemmRunner (detect from output tensor instead of hardcoding BF16, add c_dtype to kernel cache key) - Relax cute_dsl_bf16_gemm_blackwell custom op to accept BF16 or FP32 output - Add CuTe DSL dispatch in DeepseekV3Gate.forward() gated by use_cute_dsl_bf16_gemm flag, with fallback to dsv3_router_gemm_op Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: peaceh <103117813+peaceh-nv@users.noreply.github.com>
1 parent b7a5e72 commit 8f624e9

File tree

2 files changed

+35
-10
lines changed

2 files changed

+35
-10
lines changed

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4052,11 +4052,14 @@ def get_valid_tactics(
40524052
)
40534053
return []
40544054

4055-
# input: [M, K], weight: [N, K]
4055+
# input: [M, K], weight: [N, K], output: [M, N]
40564056
m, k = inputs[0].shape[0], inputs[0].shape[1]
40574057
n = inputs[1].shape[0]
40584058
batch_size = 1
40594059

4060+
# Detect output dtype from the output tensor (supports BF16 and FP32)
4061+
c_dtype_cutlass = _TORCH_TO_CUTLASS_DTYPE[inputs[2].dtype]
4062+
40604063
# Layouts: A is [M, K] K-major, B is [N, K] K-major
40614064
a_major = "k"
40624065
b_major = "k"
@@ -4083,7 +4086,7 @@ def get_valid_tactics(
40834086
if self.__class__.kernel_class.can_implement(
40844087
cutlass.BFloat16, # ab_dtype
40854088
cutlass.Float32, # acc_dtype
4086-
cutlass.BFloat16, # c_dtype
4089+
c_dtype_cutlass, # c_dtype
40874090
use_2cta_instrs,
40884091
mma_tiler_mn,
40894092
cluster_shape_mn,
@@ -4109,7 +4112,7 @@ def forward(
41094112
inputs (List[torch.Tensor]):
41104113
inputs[0]: Input tensor of shape (m, k), dtype: bf16.
41114114
inputs[1]: Weight tensor of shape (n, k), dtype: bf16.
4112-
inputs[2]: Output tensor of shape (m, n), dtype: bf16.
4115+
inputs[2]: Output tensor of shape (m, n), dtype: bf16 or fp32.
41134116
tactic: Tiling and cluster strategy, typically a tuple
41144117
(use_2cta_instrs, mma_tiler_mn, cluster_shape_mn).
41154118
"""
@@ -4146,6 +4149,9 @@ def forward(
41464149
# c_buf is [M, N], permute to [M, N, 1] for cute layout
41474150
c_tmp = c_buf.unsqueeze(-1) # [M, N, 1]
41484151

4152+
# Detect output dtype (supports BF16 and FP32)
4153+
c_dtype_cutlass = _TORCH_TO_CUTLASS_DTYPE[c_tensor.dtype]
4154+
41494155
if not self.use_tvm_ffi:
41504156
a_ptr = make_ptr(
41514157
cutlass.BFloat16,
@@ -4169,6 +4175,7 @@ def forward(
41694175
mma_tiler_mn,
41704176
cluster_shape_mn,
41714177
self.use_tvm_ffi,
4178+
c_dtype_cutlass,
41724179
)
41734180
if cache_key not in self.__class__.kernel_cache:
41744181
if self.use_tvm_ffi:
@@ -4290,5 +4297,6 @@ def _(
42904297
) -> None:
42914298
m, k = mat_a.shape[0], mat_a.shape[1]
42924299
n = mat_b.shape[0]
4293-
assert output.dtype == torch.bfloat16, "CuTe DSL bf16 gemm output dtype must be bf16"
4300+
assert output.dtype in (torch.bfloat16, torch.float32), \
4301+
"CuTe DSL bf16 gemm output dtype must be bf16 or fp32"
42944302
assert output.shape == (m, n), "CuTe DSL bf16 gemm output shape is incorrect"

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from tensorrt_llm._ipc_utils import can_access_peer
4242
from tensorrt_llm._torch.models.checkpoints.base_weight_loader import \
4343
ConsumableWeightsDict
44-
from tensorrt_llm._utils import get_sm_version
44+
from tensorrt_llm._utils import get_sm_version, is_sm_100f
4545
from tensorrt_llm.functional import PositionEmbeddingType
4646
from tensorrt_llm.mapping import Mapping
4747
from tensorrt_llm.models.modeling_utils import QuantConfig
@@ -852,8 +852,10 @@ def __init__(
852852
fuse_routing_kernel: bool = True,
853853
apply_routing: bool = False,
854854
moe_backend: str = 'CUTLASS',
855+
use_cute_dsl_bf16_gemm: bool = False,
855856
):
856857
super().__init__()
858+
self.use_cute_dsl_bf16_gemm = use_cute_dsl_bf16_gemm
857859
self.weight = nn.Parameter(torch.empty((num_experts, hidden_size),
858860
dtype=dtype),
859861
requires_grad=False)
@@ -878,10 +880,24 @@ def __init__(
878880
is_fused=fuse_routing_kernel)
879881

880882
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
881-
logits = torch.ops.trtllm.dsv3_router_gemm_op(hidden_states,
882-
self.weight.t(),
883-
bias=None,
884-
out_dtype=torch.float32)
883+
if (self.use_cute_dsl_bf16_gemm and is_sm_100f()
884+
and self.weight.dtype == torch.bfloat16):
885+
input_2d = hidden_states.view(-1, hidden_states.shape[-1])
886+
m, k = input_2d.shape
887+
n = self.weight.shape[0]
888+
output = torch.empty(m,
889+
n,
890+
dtype=torch.float32,
891+
device=hidden_states.device)
892+
torch.ops.trtllm.cute_dsl_bf16_gemm_blackwell(
893+
input_2d.contiguous(), self.weight, output)
894+
logits = output.view(*hidden_states.shape[:-1], n)
895+
else:
896+
logits = torch.ops.trtllm.dsv3_router_gemm_op(
897+
hidden_states,
898+
self.weight.t(),
899+
bias=None,
900+
out_dtype=torch.float32)
885901
return logits
886902

887903
def load_weights(self, weights: List[Dict]):
@@ -947,7 +963,8 @@ def __init__(self,
947963
dtype=dtype,
948964
fuse_routing_kernel=True,
949965
apply_routing=False,
950-
moe_backend=model_config.moe_backend)
966+
moe_backend=model_config.moe_backend,
967+
use_cute_dsl_bf16_gemm=model_config.use_cute_dsl_bf16_gemm)
951968
self.experts = create_moe(
952969
num_experts=num_experts,
953970
routing_method=self.gate.routing_method,

0 commit comments

Comments (0)