bugfix: Fix the kernel-selection heuristic bug in trtllm-gen (#1307)

PerkzZheng · web-flow · commit 18fd91a74ad6 · 2025-07-23T01:33:23.000-07:00
## 📌 Description This fixes a bug where the low-latency (swapsMmaAb) MLA kernels were still selected when the batch size is large in the high-throughput case (i.e., when attention DP is used). Accuracy is not affected, but performance can be much worse without the fix. ## 🔍 Related Issues  ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes  --------- Signed-off-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com>
diff --git a/include/flashinfer/trtllm/fmha/fmhaKernels.cuh b/include/flashinfer/trtllm/fmha/fmhaKernels.cuh
@@ -397,11 +397,19 @@ class TllmGenFmhaKernel {
                            clusterDimX);
   }
 
-  // Compute the seqLenPerCtaKv for selecting the MLA generation kernel.
-  int computeSeqLenPerCtaKv(RunnerParams const& params) const {
+  // Determine if we should use the SwapsMmaAbForGeneration kernel for MLA generation.
+  bool useSwapsMmaAbMlaGenKernel(RunnerParams const& params) const {
+    // Use the SwapsMmaAbForGeneration kernel for MLA generation when the following conditions are
+    // met:
+    // 1. The seqLenPerCtaKv <= 1024 based on the benchmark results (this might be fine-tuned
+    // later).
+    // 2. The numCtas (after splitting the heads across multiple CTAs) <=
+    // params.mMultiProcessorCount.
+
     // The maximum number Ctas per Kv sequence, which makes sure that each CtaKv has work to do.
     // Here we assume the stepKv is 256.
     int const maxNumCtasPerSeqKv = flashinfer::ceil_div(params.mMaxSeqLenKv, 256);
+    ;
     // The number of Ctas.
     int const numCtas = static_cast<int32_t>(params.mBatchSize * params.mMaxSeqLenQ *
                                              divUp(params.mNumHeadsQPerKv, 16));
@@ -410,8 +418,8 @@ class TllmGenFmhaKernel {
         std::min(maxNumCtasPerSeqKv, std::max(1, int32_t(params.mMultiProcessorCount / numCtas)));
     // Compute the seqLenPerCtaKv.
     int const seqLenPerCtaKv = flashinfer::ceil_div(params.mMaxSeqLenKv, numCtasPerSeqKv);
-    // Return the seqLenPerCtaKv.
-    return seqLenPerCtaKv;
+    // Whether we should use the SwapsMmaAbForGeneration kernel for MLA generation.
+    return seqLenPerCtaKv <= 1024 && numCtas <= params.mMultiProcessorCount;
   }
 
   std::pair<uint64_t, std::string> hashFromRunnerParams(
@@ -424,10 +432,12 @@ class TllmGenFmhaKernel {
       // following conditions are met:
       // 1. The number of headsQPerKv is <= 32.
       // 2. The seqLenPerCtaKv <= 1024 based on the benchmark results (this might be fine-tuned
-      // later).
+      // later) and
+      //    the numCtas (after splitting the heads across multiple CTAs) <=
+      //    params.mMultiProcessorCount.
 
       // Check the conditions.
-      if (params.mNumHeadsQPerKv <= 32 || computeSeqLenPerCtaKv(params) <= 1024) {
+      if (params.mNumHeadsQPerKv <= 32 || useSwapsMmaAbMlaGenKernel(params)) {
         kernelType = FmhaKernelType::SwapsMmaAbForGeneration;
       } else {
         // Otherwise, we use the high-throughput kernel.