bugfix: Support passing kv_data_type to MultiLevelCascadeAttentionWrapper.plan() (#1350)

sarckk · web-flow · commit 7fdae775b453 · 2025-07-29T23:36:31.000-07:00
## 📌 Description

`MultiLevelCascadeAttentionWrapper.plan()` ends up calling `plan()` on `BatchPrefillWithPagedKVCacheWrapper`. `BatchPrefillWithPagedKVCacheWrapper.plan()` supports `kv_data_type`, but `MultiLevelCascadeAttentionWrapper.plan()` does not. This change adds an optional `kv_data_type` parameter to `MultiLevelCascadeAttentionWrapper.plan()` and forwards it to the underlying `plan()` call.

## 🔍 Related Issues

Fixes vllm-project/vllm#21822

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
diff --git a/flashinfer/cascade.py b/flashinfer/cascade.py
@@ -16,7 +16,7 @@
 
 import functools
 from functools import cache
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple, Union
 
 import torch
 
@@ -418,6 +418,7 @@ def plan(
         rope_scale: Optional[float] = None,
         rope_theta: Optional[float] = None,
         q_data_type: str = "float16",
+        kv_data_type: Optional[Union[str, torch.dtype]] = None,
     ):
         r"""Create auxiliary data structures for multi-level cascade attention for multiple
         forward calls within the same decode step. Please check
@@ -476,6 +477,8 @@ def plan(
             The theta used in RoPE, if not provided, will be set to ``1e4``.
         q_data_type : Optional[Union[str, torch.dtype]]
             The data type of the query tensor. If None, will be set to torch.float16.
+        kv_data_type : Optional[Union[str, torch.dtype]]
+            The data type of the key/value tensor. If None, will be set to :attr:`q_data_type`.
         """
         for i, (
             wrapper,
@@ -510,6 +513,7 @@ def plan(
                 rope_scale=rope_scale,
                 rope_theta=rope_theta,
                 q_data_type=q_data_type,
+                kv_data_type=kv_data_type,
             )
 
     begin_forward = plan