
Commit 60dec90

yyihuang and zihaoye authored
fix: should pass global_override_indptr_cpu in fast_decode_plan param list (#1757)
## 📌 Description

fix #1745

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

---

Co-authored-by: Zihao Ye <[email protected]>
1 parent 905f755 · commit 60dec90
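The substance of the fix: `fast_decode_plan` previously read a module-level `global_override_indptr_cpu` defined inside `flashinfer/decode.py`, so a caller in another module could not supply its own host-side indptr without mutating FlashInfer's module state. The diff below moves the override into the parameter list. A minimal toy sketch of the difference (the root-cause framing is inferred from this diff, not stated in it; the two function names are illustrative):

```python
# Contrast of the two styles; names mirror the diff, logic is a toy.
global_override_indptr_cpu = None  # old: module-level state in decode.py


def fast_plan_old_style():
    # Reads this module's global at call time; a caller in another module
    # setting its *own* global of the same name has no effect here.
    return global_override_indptr_cpu


def fast_plan_new_style(global_override_indptr_cpu=None):
    # New: the override is an explicit keyword argument owned by the caller.
    return global_override_indptr_cpu


print(fast_plan_old_style())             # always None
print(fast_plan_new_style("my_indptr"))  # 'my_indptr', passed explicitly
```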

File tree: 3 files changed, +69 −3 lines

flashinfer/decode.py (1 addition, 3 deletions)

@@ -2368,9 +2368,6 @@ def trtllm_batch_decode_with_kv_cache_mla(
     return out
 
 
-global_override_indptr_cpu = None
-
-
 def fast_decode_plan(
     self,
     indptr: torch.Tensor,
@@ -2392,6 +2389,7 @@ def fast_decode_plan(
     non_blocking: bool = True,
     fixed_split_size: Optional[int] = None,
     disable_split_kv: bool = False,
+    global_override_indptr_cpu: Optional[torch.Tensor] = None,
 ) -> None:
     """
     A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend.
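With the global gone, code that binds `fast_decode_plan` in place of `wrapper.plan` passes the host indptr explicitly. A minimal end-to-end sketch, assuming a CUDA device and a FlashInfer build with this commit; all sizes and the HND layout are illustrative, and the call pattern mirrors the tests below:

```python
from functools import partial

import torch
import flashinfer

# Illustrative sizes (not from the diff).
batch_size, kv_len, page_size = 8, 256, 16
num_qo_heads, num_kv_heads, head_dim = 4, 4, 128
num_pages_per_seq = (kv_len + page_size - 1) // page_size
total_num_pages = num_pages_per_seq * batch_size

kv_data = torch.randn(
    total_num_pages, 2, num_kv_heads, page_size, head_dim,
    dtype=torch.float16, device="cuda",
)
kv_indptr = (
    torch.arange(0, batch_size + 1, device="cuda", dtype=torch.int32)
    * num_pages_per_seq
)
kv_indices = torch.arange(0, total_num_pages, device="cuda", dtype=torch.int32)
kv_last_page_len = torch.full(
    (batch_size,), (kv_len - 1) % page_size + 1, device="cuda", dtype=torch.int32
)

workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
    workspace, "HND", use_tensor_cores=True
)

# Bind fast_decode_plan in place of wrapper.plan, as the tests do.
wrapper.plan = partial(flashinfer.fast_decode_plan, wrapper)

# The host-side indptr is now an explicit keyword argument rather than a
# module-level global in flashinfer.decode.
wrapper.plan(
    kv_indptr, kv_indices, kv_last_page_len,
    num_qo_heads, num_kv_heads, head_dim, page_size,
    pos_encoding_mode="NONE",
    data_type=torch.float16,
    q_data_type=torch.float16,
    global_override_indptr_cpu=kv_indptr.cpu(),  # new in this commit
)

q = torch.randn(batch_size, num_qo_heads, head_dim,
                dtype=torch.float16, device="cuda")
o = wrapper.run(q, kv_data)
```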

tests/test_batch_decode_kernels.py (14 additions, 0 deletions)

@@ -186,6 +186,10 @@ def test_batch_decode_with_paged_kv_cache(
     torch.testing.assert_close(o, o_buffer, rtol=1e-3, atol=1e-3)
 
 
+global_override_indptr_cpu = None
+MAX_BATCH_SIZE = 128
+
+
 @pytest.mark.parametrize("batch_size", [12, 17, 128])
 @pytest.mark.parametrize("kv_len", [54, 97, 512, 2048, 16384])
 @pytest.mark.parametrize("page_size", [1, 8, 16])
@@ -218,6 +222,15 @@ def test_batch_decode_with_paged_kv_cache_with_fast_plan(
     num_pages_per_seq = (kv_len + page_size - 1) // page_size
     total_num_pages = num_pages_per_seq * batch_size
 
+    global global_override_indptr_cpu
+    if global_override_indptr_cpu is None:
+        global_override_indptr_cpu = torch.empty(MAX_BATCH_SIZE + 1, device="cpu")
+    if global_override_indptr_cpu is not None:
+        global_override_indptr_cpu = (
+            torch.arange(0, batch_size + 1, device="cpu", dtype=torch.int32)
+            * num_pages_per_seq
+        )
+
     if kv_layout == "HND":
         kv_shape = [total_num_pages, 2, num_kv_heads, page_size, head_dim]
     else:
@@ -280,6 +293,7 @@ def test_batch_decode_with_paged_kv_cache_with_fast_plan(
         data_type=kv_dtype,
         q_data_type=q_dtype,
         non_blocking=True,
+        global_override_indptr_cpu=global_override_indptr_cpu,
     )
     if return_lse:
         o, _ = wrapper.run(q, kv_data, return_lse=True)
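The override tensor mirrors the device-side `kv_indptr` on the host: with uniform sequence lengths, entry `i` is `i * num_pages_per_seq`. Note that as written in the test, the placeholder from the first `if` branch is immediately overwritten by the second, so the `arange` value is what reaches `plan`. A quick worked example of that arithmetic (sizes are illustrative):

```python
import torch

batch_size, num_pages_per_seq = 4, 3  # illustrative sizes
indptr_cpu = (
    torch.arange(0, batch_size + 1, device="cpu", dtype=torch.int32)
    * num_pages_per_seq
)
print(indptr_cpu)  # tensor([ 0,  3,  6,  9, 12], dtype=torch.int32)
# Sequence i owns pages indptr_cpu[i] .. indptr_cpu[i+1] - 1.
```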

tests/test_tensor_cores_decode.py (54 additions, 0 deletions)

@@ -328,6 +328,10 @@ def test_batch_decode_tensor_cores_cuda_graph(
     torch.testing.assert_close(lse, lse_tensor_cores, rtol=1e-3, atol=1e-3)
 
 
+global_override_indptr_cpu = None
+MAX_BATCH_SIZE = 128
+
+
 @pytest.mark.parametrize("batch_size", [5, 12])
 @pytest.mark.parametrize("invariant_bs", [4])
 @pytest.mark.parametrize("kv_len", [4096, 8192, 5000])
@@ -358,6 +362,16 @@ def test_batch_decode_tensor_cores_with_fast_plan(
     )
     num_pages_per_seq = (kv_len + page_size - 1) // page_size
     total_num_pages = num_pages_per_seq * batch_size
+
+    global global_override_indptr_cpu
+    if global_override_indptr_cpu is None:
+        global_override_indptr_cpu = torch.empty(MAX_BATCH_SIZE + 1, device="cpu")
+    if global_override_indptr_cpu is not None:
+        global_override_indptr_cpu = (
+            torch.arange(0, batch_size + 1, device="cpu", dtype=torch.int32)
+            * num_pages_per_seq
+        )
+
     kv_data = (
         torch.randn(
             total_num_pages,
@@ -425,13 +439,15 @@ def test_batch_decode_tensor_cores_with_fast_plan(
         q_data_type=torch.float16,
         fixed_split_size=fixed_split_size if not disable_split_kv else None,
         disable_split_kv=disable_split_kv,
+        global_override_indptr_cpu=global_override_indptr_cpu,
     )
     o_tensor_cores, lse_tensor_cores = wrapper_tensor_cores.run(
         q, kv_data, return_lse=True
    )
 
     kv_indptr_invariant = kv_indptr[: invariant_bs + 1]
     kv_last_page_len_invariant = kv_last_page_len[:invariant_bs]
+    global_override_indptr_cpu = global_override_indptr_cpu[: invariant_bs + 1]
     wrapper_tensor_cores.plan(
         kv_indptr_invariant,
         kv_indices,
@@ -445,6 +461,7 @@ def test_batch_decode_tensor_cores_with_fast_plan(
         q_data_type=torch.float16,
         fixed_split_size=fixed_split_size if not disable_split_kv else None,
         disable_split_kv=disable_split_kv,
+        global_override_indptr_cpu=global_override_indptr_cpu,
     )
     o_tensor_cores_invariant, lse_tensor_cores_invariant = wrapper_tensor_cores.run(
         q[:invariant_bs], kv_data, return_lse=True
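Worth noting before the next hunks: when the test replans for only the first `invariant_bs` requests, every per-request array is truncated consistently. Indptr-style arrays carry one more entry than the batch size, so they keep `invariant_bs + 1` entries, while per-request arrays keep `invariant_bs`. A small self-contained illustration of that slicing rule (sizes are illustrative):

```python
import torch

batch_size, invariant_bs, num_pages_per_seq = 5, 4, 2  # illustrative
indptr_cpu = (
    torch.arange(0, batch_size + 1, dtype=torch.int32) * num_pages_per_seq
)
last_page_len = torch.full((batch_size,), 7, dtype=torch.int32)

# indptr has batch_size + 1 entries, so the slice keeps invariant_bs + 1.
indptr_small = indptr_cpu[: invariant_bs + 1]   # 5 entries for 4 requests
last_page_small = last_page_len[:invariant_bs]  # 4 entries

assert indptr_small.numel() == invariant_bs + 1
assert last_page_small.numel() == invariant_bs
```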
@@ -477,6 +494,16 @@ def test_batch_fast_decode_tensor_cores_cuda_graph(
     )
     num_pages_per_seq = (kv_len + page_size - 1) // page_size
     total_num_pages = num_pages_per_seq * batch_size
+
+    global global_override_indptr_cpu
+    if global_override_indptr_cpu is None:
+        global_override_indptr_cpu = torch.empty(MAX_BATCH_SIZE + 1, device="cpu")
+    if global_override_indptr_cpu is not None:
+        global_override_indptr_cpu = (
+            torch.arange(0, batch_size + 1, device="cpu", dtype=torch.int32)
+            * num_pages_per_seq
+        )
+
     kv_data = (
         torch.randn(
             total_num_pages,
@@ -562,6 +589,8 @@ def test_batch_fast_decode_tensor_cores_cuda_graph(
         paged_kv_indices_buffer=kv_indices,
         paged_kv_last_page_len_buffer=kv_last_page_len,
     )
+
+    # cache
     wrapper_tensor_cores.plan(
         kv_indptr,
         kv_indices,
@@ -574,6 +603,24 @@ def test_batch_fast_decode_tensor_cores_cuda_graph(
         data_type=torch.float16,
         q_data_type=torch.float16,
     )
+
+    wrapper_tensor_cores.plan = partial(
+        flashinfer.fast_decode_plan, wrapper_tensor_cores
+    )
+
+    wrapper_tensor_cores.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_len,
+        num_qo_heads,
+        num_kv_heads,
+        head_dim,
+        page_size,
+        pos_encoding_mode=pos_encoding_mode,
+        data_type=torch.float16,
+        q_data_type=torch.float16,
+        global_override_indptr_cpu=global_override_indptr_cpu,
+    )
     # warmup
     s = torch.cuda.Stream()
     s.wait_stream(torch.cuda.current_stream())
@@ -596,3 +643,10 @@ def test_batch_fast_decode_tensor_cores_cuda_graph(
 
     torch.testing.assert_close(o, o_tensor_cores, rtol=1e-3, atol=1e-3)
     torch.testing.assert_close(lse, lse_tensor_cores, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    test_batch_decode_tensor_cores_with_fast_plan(
+        5, 4, 4096, 2048, True, 1, 4, 1, 128, "HND", "NONE"
+    )
+    test_batch_fast_decode_tensor_cores_cuda_graph(12, 54, 1, 4, 1, 128, "HND", "NONE")
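The CUDA-graph test swaps the wrapper's bound `plan` method for `fast_decode_plan` via `functools.partial`, which pre-binds the wrapper instance as the function's `self` argument. A minimal sketch of that binding pattern in isolation (`Wrapper` and `fast_plan` are stand-ins for this illustration, not FlashInfer APIs):

```python
from functools import partial


class Wrapper:
    def plan(self, n):
        return f"slow plan for {n}"


def fast_plan(self, n, indptr_cpu=None):
    # Free function that takes the instance explicitly, like fast_decode_plan.
    return f"fast plan for {n}, override={indptr_cpu}"


w = Wrapper()
# partial(fast_plan, w) fixes `self`, so the call signature matches w.plan.
w.plan = partial(fast_plan, w)
print(w.plan(3, indptr_cpu=[0, 1, 2]))  # fast plan for 3, override=[0, 1, 2]
```

Because the replacement happens on the instance (not the class), only this wrapper is affected, and the extra `global_override_indptr_cpu` keyword flows straight through to `fast_decode_plan`.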
