Skip to content

Commit 7b3cb4c

Browse files
committed
Fixes to the benchmark attention wrapper
1 parent 49d744d commit 7b3cb4c

File tree

1 file changed

+13
-2
lines changed

1 file changed

+13
-2
lines changed

benchmarks/routines/attention.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -814,6 +814,15 @@ def testBatchPrefillWithPagedKVCacheWrapper(args):
814814
.int()
815815
.to(device)
816816
)
817+
qo_indptr_cudnn = torch.cat(
818+
[
819+
torch.tensor([0], device=device),
820+
torch.cumsum(actual_seq_lens_q_device.view(-1), dim=0)
821+
* head_dim_qk
822+
* num_qo_heads,
823+
]
824+
).int()
825+
817826
# Because actual_seq_lens_kv is the same as actual_seq_lens_q, kv_indptr will become the same as qo_indptr
818827
kv_indptr = (
819828
torch.cat(
@@ -935,12 +944,14 @@ def run_backend_wrapper(backend):
935944
workspace_buffer,
936945
max_token_per_sequence=s_qo,
937946
max_sequence_kv=s_kv,
938-
actual_seq_lens_q=actual_seq_lens_q,
939-
actual_seq_lens_kv=actual_seq_lens_kv,
947+
actual_seq_lens_q=actual_seq_lens_q_device,
948+
actual_seq_lens_kv=actual_seq_lens_kv_device,
940949
block_tables=block_tables,
941950
causal=causal,
942951
return_lse=True,
943952
is_cuda_graph_compatible=is_cuda_graph_compatible,
953+
batch_offsets_q=qo_indptr_cudnn,
954+
batch_offsets_o=qo_indptr_cudnn,
944955
)[0]
945956
elif backend == "fa2":
946957
return fi_fa2_paged_wrapper.run(

0 commit comments

Comments (0)