Commit 9b861cd

bugfix: fix merge_attention_state in BatchAttention w/ gqa-group-size in Qwen family (#1614)
## 📌 Description

This PR fixes precision issues in BatchAttention (the Persistent FA2 kernel from #1137) when `CTA_TILE_Q` is not a multiple of `gqa_group_size` (e.g., Qwen-family models). The prior implementation assumed that, for a given token, all `qo_heads` belonging to one `kv_head` are either all split-KV or all non-split-KV. However, when `gqa_group_size == 7`, some of those `qo_heads` can be non-split while the remaining ones are split.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

cc @Edenzzzz
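To make the failure mode concrete, here is a small standalone sketch (toy values, not FlashInfer code): with a hypothetical tile of 16 packed `(token, qo_head)` rows and `gqa_group_size = 7`, the tile boundary cuts through one token's head group, so heads of the same token/`kv_head` can land in tiles with different split-KV decisions.

```cpp
// Toy illustration of the misalignment (assumed tile size of 16; not the real kernel).
#include <cstdio>

int main() {
  const int gqa_group_size = 7;  // e.g., 28 qo heads shared by 4 kv heads
  const int cta_tile_q = 16;     // hypothetical CTA_TILE_Q, not a multiple of 7
  for (int packed = 0; packed < 2 * cta_tile_q; ++packed) {
    int token = packed / gqa_group_size;  // query token index
    int head = packed % gqa_group_size;   // qo head within the kv-head group
    int tile = packed / cta_tile_q;       // CTA tile that owns this packed row
    if (token == 2) {  // token 2's 7 heads straddle the tile boundary at row 16
      std::printf("packed=%2d -> token=%d head=%d tile=%d\n", packed, token, head, tile);
    }
  }
  // Heads 0-1 of token 2 fall in tile 0 and heads 2-6 in tile 1, so one tile may be
  // scheduled as split-KV while the other is not, breaking the old assumption.
  return 0;
}
```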
1 parent 8e926de commit 9b861cd

File tree

3 files changed (+30, −16 lines)

include/flashinfer/attention/persistent.cuh

Lines changed: 5 additions & 6 deletions
```diff
@@ -269,7 +269,8 @@ struct BlockBatchPagedAttentionPersistent {
     const uint32_t kv_chunk_idx = kv_start / len_kv_chunk;
     const uint32_t num_kv_chunks = ceil_div(
         CAUSAL
-            ? min((kv_len - q_len) + (packed_qo_start + cluster_tile_q) / gqa_group_size, kv_len)
+            ? min((kv_len - q_len) + ceil_div(packed_qo_start + cluster_tile_q, gqa_group_size),
+                  kv_len)
             : kv_len,
         len_kv_chunk);
     const uint32_t qo_packed_idx_base = packed_qo_start + blockIdx.x * CTA_TILE_Q +
```
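A sketch of the arithmetic behind this change, under assumed toy lengths (the real kernel takes `kv_len`, `q_len`, etc. from the plan): the exclusive upper bound on query tokens covered by the tile is `ceil_div(packed_qo_start + cluster_tile_q, gqa_group_size)`, and plain integer division drops the partially covered last token whenever the tile end is not a multiple of the group size.

```cpp
// Floor vs. ceiling division for the causal KV upper bound (assumed toy values).
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

int main() {
  const uint32_t gqa_group_size = 7;
  const uint32_t packed_qo_start = 0, cluster_tile_q = 16;  // tile covers tokens 0, 1 and part of 2
  const uint32_t q_len = 8, kv_len = 1024;                  // hypothetical sequence lengths
  const uint32_t packed_end = packed_qo_start + cluster_tile_q;
  uint32_t old_bound = std::min((kv_len - q_len) + packed_end / gqa_group_size, kv_len);
  uint32_t new_bound = std::min((kv_len - q_len) + ceil_div(packed_end, gqa_group_size), kv_len);
  std::printf("old bound = %u, new bound = %u\n", old_bound, new_bound);  // 1018 vs 1019
  // The old bound misses the last KV position that token 2 is causally allowed to see,
  // which can under-count num_kv_chunks for this tile.
  return 0;
}
```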
```diff
@@ -517,23 +518,21 @@ struct BlockBatchReductionPersistent {

       // remap workload
       uint32_t packed_qo_idx = i / num_kv_heads;
+      uint32_t kv_head_idx = i % num_kv_heads;
       const uint32_t num_index_sets = indptr[packed_qo_idx + 1] - indptr[packed_qo_idx];
       if (num_index_sets == 0 || num_index_sets == 1) {
         // already write through, bypass
         PROFILER_EVENT_END(profiler_closure, PersistentProfileEventType::kReduction);
         continue;
       }

-      uint32_t kv_head_idx = i % num_kv_heads;
-      uint32_t qo_head_idx = packed_qo_idx % gqa_group_size;
-
       // index calculation
       auto partial_idx_to_offset = [&](uint32_t off) {
         return (indptr[packed_qo_idx] + off) * num_kv_heads + kv_head_idx;
       };
       auto merge_idx_to_offset = [&]() {
-        return (o_indices[packed_qo_idx] * num_kv_heads + kv_head_idx) * gqa_group_size +
-               qo_head_idx;
+        // NOTE (Yilong): qo_head_idx has been calculated in schedule.plan
+        return o_indices[packed_qo_idx] + kv_head_idx * gqa_group_size;
       };

       state_t<vec_size> st;
```
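A standalone consistency check of the new indexing (toy sizes; the token and head values are made up): the scheduler-side `o_indices` entry already folds in the query token and the qo head within the group, so adding only `kv_head_idx * gqa_group_size` in the kernel yields the row-major row index into a `[qo_len, num_kv_heads, gqa_group_size]` output, matching the layout noted in the scheduler change below.

```cpp
// Sanity-check sketch of the new merged-output offset (toy values, not FlashInfer code).
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t num_kv_heads = 4, gqa_group_size = 7;
  const uint32_t token = 5, qo_head_in_group = 3;  // hypothetical query row
  // Host side (scheduler): token base and qo head are pre-folded into o_indices.
  const uint32_t o_index = token * num_kv_heads * gqa_group_size + qo_head_in_group;
  // Device side (reduction kernel): only the kv-head stride is added.
  for (uint32_t kv_head_idx = 0; kv_head_idx < num_kv_heads; ++kv_head_idx) {
    const uint32_t offset = o_index + kv_head_idx * gqa_group_size;
    // Reference: full row-major index into [qo_len, num_kv_heads, gqa_group_size].
    const uint32_t expected =
        (token * num_kv_heads + kv_head_idx) * gqa_group_size + qo_head_in_group;
    assert(offset == expected);
  }
  return 0;
}
```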

include/flashinfer/attention/scheduler.cuh

Lines changed: 5 additions & 2 deletions
```diff
@@ -1235,8 +1235,11 @@ inline cudaError_t TwoStageHolisticPlan(void* float_buffer, size_t float_workspa
         // non-split kv is directly written through
         for (int row = 0; row < row_tile_size; ++row) {
           merge_indptr.push_back(merge_indptr.back() + num_kv_tiles);
-          merge_o_indices.push_back(qo_indptr_h[i] +
-                                    (qo_tile_idx * cluster_tile_q + row) / gqa_group_size);
+          // output layout: [qo_len, num_kv_heads, gqa_group_size, head_dim]
+          // merge_o_indices is the indices of `gqa_group_size` dimension
+          auto q = (qo_tile_idx * cluster_tile_q + row) / gqa_group_size,
+               r = (qo_tile_idx * cluster_tile_q + row) % gqa_group_size;
+          merge_o_indices.push_back((qo_indptr_h[i] + q) * num_kv_heads * gqa_group_size + r);
         }
         partial_o_nnz += row_tile_size * num_kv_tiles;
       }
```
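For concreteness, a toy recomputation of the values this loop pushes (made-up sizes; `qo_indptr_base` is a stand-in for `qo_indptr_h[i]`, and the real plan iterates tiles per request): each row is decomposed into its query token `q` and its qo head `r` within the kv-head group, so rows of one token keep distinct output slots even when they straddle a tile boundary.

```cpp
// Toy recomputation of the new merge_o_indices values (hypothetical sizes).
#include <cstdio>
#include <vector>

int main() {
  const int num_kv_heads = 4, gqa_group_size = 7;
  const int cluster_tile_q = 16;  // hypothetical tile size, not a multiple of 7
  const int qo_indptr_base = 0;   // stand-in for qo_indptr_h[i]
  std::vector<int> merge_o_indices;
  for (int qo_tile_idx = 0; qo_tile_idx < 2; ++qo_tile_idx) {
    for (int row = 0; row < cluster_tile_q; ++row) {
      const int packed = qo_tile_idx * cluster_tile_q + row;
      const int q = packed / gqa_group_size;  // query token within the request
      const int r = packed % gqa_group_size;  // qo head within the kv-head group
      merge_o_indices.push_back((qo_indptr_base + q) * num_kv_heads * gqa_group_size + r);
    }
  }
  // Token 2 occupies packed rows 14..20, which straddle the tile boundary at row 16,
  // yet each row still maps to its own (token, qo_head) slot: indices 56..62 here.
  for (int packed = 14; packed <= 20; ++packed) {
    std::printf("packed=%2d -> merge_o_index=%d\n", packed, merge_o_indices[packed]);
  }
  return 0;
}
```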

tests/test_batch_attention.py

Lines changed: 20 additions & 8 deletions
```diff
@@ -57,17 +57,15 @@ def _build_seq_len_configs():
     torch.manual_seed(42)
 
     seq_len_configs = [
+        [(146, 146)],
+        [(67, 67)],
         [(8190, 7939)],
-        [(2, 235)]
-        + [(1, 13353)],  # corner case with a large number of masked out tokens
-        [(67, 1)],
-        [(182, 1)],
-        [(2011, 1)],
         [(2048, 1)] * 77,  # decode-only
         [(4099, 129)] * 2,  # prefill-only
         [(600, 1)] * 132 * 2 + [(5000, 3)] * 128,
         [(1024, 1)] * 100 + [(8192, 17)] * 8,  # speculative decode
         [(766, 2)] * 99 + [(1024, 512)] * 1,  # chunked prefill
+        [(2, 235)] + [(1, 13353)],  # real workload
     ]
 
     # Construct random seqlen tests
@@ -142,7 +140,7 @@ def _run_attention(
 
     # --------- old scheduler --------- #
     wrapper_old = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
-        torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device=dev),
+        torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=dev),
         kv_layout=layout,
         backend="fa2",
     )
@@ -190,8 +188,8 @@ def _run_attention(
 # ------------------------- PyTest test case ----------------------------- #
 @pytest.mark.parametrize("seq_len_pairs", _build_seq_len_configs())
 @pytest.mark.parametrize("page_block_size", [1, 8, 16])
-@pytest.mark.parametrize("num_kv_heads", [8, 1, 4])
-@pytest.mark.parametrize("gqa_group_size", [1, 4, 7])
+@pytest.mark.parametrize("num_kv_heads", [1, 4])
+@pytest.mark.parametrize("gqa_group_size", [1, 4, 7, 8])
 @pytest.mark.parametrize("head_dim", [64, 128, 256])
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("layout", ["HND", "NHD"])
@@ -225,3 +223,17 @@ def test_batch_attention_correctness(
         logits_soft_cap=logits_soft_cap,
         device="cuda",
     )
+
+
+if __name__ == "__main__":
+    test_batch_attention_correctness(
+        seq_len_pairs=[(1000, 1000)],
+        page_block_size=1,
+        num_kv_heads=4,
+        gqa_group_size=7,
+        head_dim=128,
+        causal=True,
+        layout="NHD",
+        test_dtype=torch.bfloat16,
+        logits_soft_cap=0.0,
+    )
```
