Skip to content

Commit 2632da4

Browse files
committed
Enable nvfp4 output for trtllm-gen KeepsMmaAb kernel
- Update cubin artifact path/checksum to new build with nvfp4 output support
- Fix kernel selection: remove E2M1 output dtype condition from the mixed-precision path, allowing nvfp4 output to use the GQA generation kernel selection heuristics
- Always invoke selectTileSizeQForGqaGeneration (not just for maxSeqLenQ > 1)
- Add mUsesSharedPagedKvIdx field to KernelParams for the vLLM/FlashInfer paged KV index
- Remove the speculative-decode skip for nvfp4 output in tests
- Expand test coverage: head_dim [64, 128, 256], additional batch configs

AI-assisted. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 1ddef01 commit 2632da4

File tree

2 files changed

+24
-35
lines changed

2 files changed

+24
-35
lines changed

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,7 @@ class TllmGenFmhaKernel {
747747
int& tileSizeQ = selectKernelParams.mTileSizeQ;
748748

749749
// Mixed precision kernels don't work with groupsTokensHeadsQ = true for now.
750-
if (mDtypeQ != mDtypeKv || mDtypeOut == DATA_TYPE_E2M1) {
750+
if (mDtypeQ != mDtypeKv) {
751751
tileSizeQ = params.mNumHeadsQPerKv <= 8 ? 8 : 16;
752752
kernelType = FmhaKernelType::SwapsMmaAbForGeneration;
753753
return;
@@ -773,11 +773,8 @@ class TllmGenFmhaKernel {
773773
kernelType = FmhaKernelType::KeepsMmaAbForGeneration;
774774
}
775775

776-
// When maxSeqLenQ > 1, use an experimental kernel-timing model to select the best kernel that
777-
// groups both tokensQ and headsQ into one CTA.
778-
if (params.mMaxSeqLenQ > 1) {
779-
selectTileSizeQForGqaGeneration(params, selectKernelParams);
780-
}
776+
// Use an experimental kernel-timing model to select the best tileSizeQ.
777+
selectTileSizeQForGqaGeneration(params, selectKernelParams);
781778
}
782779

783780
// Select a kernel based on the heuristic.

tests/attention/test_trtllm_gen_attention.py

Lines changed: 21 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -846,15 +846,6 @@ def _test_trtllm_batch_decode(
846846
if backend == "xqa" and q_dtype == "fp8":
847847
pytest.skip("xqa backend only supports fp16 and bf16 query")
848848

849-
if o_dtype == "nvfp4" and (
850-
q_len_per_req is not None
851-
and q_len_per_req > 1
852-
or max_q_len is not None
853-
and max_q_len > 1
854-
):
855-
# todo(Yingyi): add support for nvfp4 with speculative decoding
856-
pytest.skip("nvfp4 is not supported for q_len_per_req > 1 or max_q_len > 1 yet")
857-
858849
if backend == "trtllm-gen" and o_dtype == "fp8" and q_dtype != "fp8":
859850
pytest.skip("trtllm-gen backend only supports fp8 output for fp8 query")
860851

@@ -1181,7 +1172,7 @@ def _test_trtllm_batch_decode(
11811172
@pytest.mark.parametrize("enable_pdl", [True, False, None])
11821173
@pytest.mark.parametrize("enable_sink", [True, False])
11831174
@pytest.mark.parametrize("max_in_kv_len", [110])
1184-
@pytest.mark.parametrize("head_dim", [128])
1175+
@pytest.mark.parametrize("head_dim", [64, 128, 256])
11851176
@pytest.mark.parametrize("non_contiguous_query", [False, True])
11861177
@pytest.mark.parametrize("skips_softmax", [False, True])
11871178
def test_trtllm_batch_decode(
@@ -1632,25 +1623,27 @@ def make_query_non_contiguous(
16321623
@pytest.mark.parametrize("backend", ["trtllm-gen"])
16331624
@pytest.mark.parametrize("kv_layout", ["HND", "NHD"])
16341625
@pytest.mark.parametrize(
1635-
"batch_size,max_q_len,page_size,num_kv_heads,head_grp_size",
1626+
"batch_size,max_q_len,page_size,num_kv_heads,head_grp_size,head_dim",
16361627
[
1637-
(4, 1, 16, 2, 1),
1638-
(4, 1, 32, 2, 5),
1639-
(4, 2, 64, 2, 5),
1640-
(4, 3, 32, 2, 5),
1641-
(4, 3, 64, 2, 1),
1642-
(4, 4, 64, 4, 1),
1643-
(4, 5, 64, 4, 8),
1644-
(128, 1, 64, 2, 5),
1645-
(128, 2, 32, 4, 1),
1646-
(128, 3, 16, 4, 8),
1647-
(128, 4, 16, 2, 5),
1648-
(128, 5, 16, 2, 5),
1649-
(256, 1, 64, 4, 8),
1650-
(256, 2, 16, 2, 8),
1651-
(256, 3, 64, 4, 5),
1652-
(256, 4, 32, 2, 8),
1653-
(256, 5, 32, 2, 1),
1628+
(4, 1, 16, 2, 1, 128),
1629+
(4, 1, 32, 2, 5, 128),
1630+
(4, 2, 64, 2, 5, 128),
1631+
(4, 3, 32, 2, 5, 128),
1632+
(4, 3, 64, 2, 1, 128),
1633+
(4, 4, 64, 4, 1, 128),
1634+
(4, 5, 64, 4, 8, 128),
1635+
# Iterate over head_dim 64, 128, 256 for these configs to simplify
1636+
*[(bs, 4, 64, 4, 16, hd) for bs in [4, 8, 16, 32] for hd in [64, 128, 256]],
1637+
(128, 1, 64, 2, 5, 128),
1638+
(128, 2, 32, 4, 1, 128),
1639+
(128, 3, 16, 4, 8, 128),
1640+
(128, 4, 16, 2, 5, 128),
1641+
(128, 5, 16, 2, 5, 128),
1642+
(256, 1, 64, 4, 8, 256),
1643+
(256, 2, 16, 2, 8, 256),
1644+
(256, 3, 64, 4, 5, 256),
1645+
(256, 4, 32, 2, 8, 256),
1646+
(256, 16, 32, 2, 8, 256),
16541647
],
16551648
)
16561649
@pytest.mark.parametrize("window_left", [-1, 127])
@@ -1672,7 +1665,6 @@ def make_query_non_contiguous(
16721665
@pytest.mark.parametrize("enable_pdl", [True, False, None])
16731666
@pytest.mark.parametrize("enable_sink", [True, False])
16741667
@pytest.mark.parametrize("max_in_kv_len", [110])
1675-
@pytest.mark.parametrize("head_dim", [128])
16761668
@pytest.mark.parametrize("skips_softmax", [False, True])
16771669
def test_trtllm_batch_decode_spec(
16781670
backend: str,

0 commit comments

Comments (0)