
Commit 0b1d8a6

Edenzzzz and happierpig authored
Bugfix: fix o_strides in persistent kernel (#1865)
## πŸ“Œ Description

The kernel produces incorrect results when `q` is non-contiguous (e.g. a view from `torch.split`), because the write-through path reused `q`'s strides for the output tensor.

## πŸ” Related Issues

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Co-authored-by: happierpig <[email protected]>
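A minimal sketch (with illustrative shapes, not the test's actual configuration) of why a view produced by `torch.split`/`torch.chunk` breaks stride reuse: the view keeps its parent's strides, so its row stride no longer equals `num_qo_heads * head_dim`.

```python
import torch

# Illustrative shapes only; the regression test below uses num_qo_heads = 1, head_dim = 64.
num_qo_heads, head_dim = 8, 64
q_base = torch.rand(16, num_qo_heads, head_dim * 2)

# torch.chunk / torch.split return views that inherit the parent's strides.
q = torch.chunk(q_base, 2, dim=-1)[0]  # shape [16, 8, 64], but not contiguous

print(q.is_contiguous())        # False
print(q.stride())               # (1024, 128, 1): the row stride still reflects head_dim * 2
print(num_qo_heads * head_dim)  # 512: the row stride a contiguous [16, 8, 64] tensor would have
```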
1 parent d3e9b44 commit 0b1d8a6

2 files changed (+58, -5 lines)

include/flashinfer/attention/persistent.cuh

Lines changed: 5 additions & 2 deletions
@@ -408,10 +408,13 @@ struct BlockBatchPagedAttentionPersistent {
                          warp_idx, lane_idx, tid);
     } else {
       // write through
+      // o_stride_n = num_qo_heads * head_dim
+      const uint32_t o_stride_n = num_kv_heads * gqa_group_size * HEAD_DIM_VO,
+                     o_stride_h = HEAD_DIM_VO;
       DTypeO* o_ptr_base =
-          params.final_o + q_indptr * q_stride_n + (kv_head_idx * gqa_group_size) * q_stride_h;
+          params.final_o + q_indptr * o_stride_n + (kv_head_idx * gqa_group_size) * o_stride_h;
       write_o_reg_gmem<KTraits>(o_frag, &q_smem, o_ptr_base, qo_packed_idx_base, q_len,
-                                q_stride_n, q_stride_h, gqa_group_size, tid);
+                                o_stride_n, o_stride_h, gqa_group_size, tid);
     }

     if constexpr (variant.use_softmax) {
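The fix derives the output strides from the shape of `final_o` instead of reusing `q_stride_n`/`q_stride_h`, which only match when `q` itself is contiguous. A small sketch of that stride arithmetic (illustrative sizes, assuming a contiguous `[total_q, num_qo_heads, HEAD_DIM_VO]` output buffer, which is what the new code implies):

```python
import torch

# Illustrative sizes, not the kernel's actual configuration.
total_q, num_kv_heads, gqa_group_size, HEAD_DIM_VO = 32, 2, 4, 64
num_qo_heads = num_kv_heads * gqa_group_size

# A contiguous output buffer's strides follow from its shape alone.
final_o = torch.empty(total_q, num_qo_heads, HEAD_DIM_VO)
o_stride_n, o_stride_h, _ = final_o.stride()

assert o_stride_n == num_kv_heads * gqa_group_size * HEAD_DIM_VO  # 512, matching the kernel's formula
assert o_stride_h == HEAD_DIM_VO                                  # 64
```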

tests/attention/test_batch_attention.py

Lines changed: 53 additions & 3 deletions
@@ -97,6 +97,7 @@ def _run_attention(
     logits_soft_cap=0.0,
     device="cuda",
     causal=True,
+    is_chunked_q=False,
 ):
     """
     Run both implementations and return (output_old, lse_old, output_new, lse_new)
@@ -116,9 +117,19 @@ def _run_attention(

     num_blocks = kv_indptr[-1].item()

-    q = torch.rand(
-        q_indptr[-1].item(), num_qo_heads, head_dim, dtype=test_dtype, device=dev
-    )
+    if is_chunked_q:
+        q_base = torch.rand(
+            q_indptr[-1].item(),
+            num_qo_heads,
+            head_dim * 2,
+            dtype=test_dtype,
+            device=dev,
+        )
+        q = torch.chunk(q_base, 2, dim=-1)[0]
+    else:
+        q = torch.rand(
+            q_indptr[-1].item(), num_qo_heads, head_dim, dtype=test_dtype, device=dev
+        )
     if layout == "NHD":
         kv_data = torch.randn(
             num_blocks,
@@ -190,6 +201,45 @@ def _run_attention(


 # ------------------------- PyTest test case ----------------------------- #
+@pytest.mark.xfail(
+    get_compute_capability(torch.device(device="cuda"))[0] == 12,
+    reason="Expected failure for SM120/121 for now since the tile size/number of stages is too large.",
+)
+def test_batch_attention_with_noncontiguous_q():
+    # Use the sequence-length pairs from the first config
+    seq_len_pairs = _build_seq_len_configs()[0]
+    kv_lens = [p[0] for p in seq_len_pairs]
+    qo_lens = [p[1] for p in seq_len_pairs]
+
+    # Fixed single-case parameters
+    page_block_size = 1
+    num_kv_heads = 1
+    gqa_group_size = 1
+    num_qo_heads = num_kv_heads * gqa_group_size
+    head_dim = 64
+    test_dtype = torch.bfloat16
+    layout = "NHD"
+    logits_soft_cap = 0.0
+    v_scale = None
+    causal = True
+
+    _run_attention(
+        kv_lens=kv_lens,
+        qo_lens=qo_lens,
+        page_block_size=page_block_size,
+        num_kv_heads=num_kv_heads,
+        num_qo_heads=num_qo_heads,
+        head_dim=head_dim,
+        v_scale=v_scale,
+        causal=causal,
+        layout=layout,
+        test_dtype=test_dtype,
+        logits_soft_cap=logits_soft_cap,
+        device="cuda",
+        is_chunked_q=True,
+    )
+
+
 @pytest.mark.xfail(
     get_compute_capability(torch.device(device="cuda"))[0] == 12,
     reason="Expected failure for SM120/121 for now since the tile size/number of stages is too large.",
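Assuming a CUDA-capable device, the new regression test should be runnable on its own with `pytest tests/attention/test_batch_attention.py::test_batch_attention_with_noncontiguous_q`.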
