
Commit f9adac3

[feat] Enable chunked context for flashinfer (#4132)
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
1 parent 0fd59d6 commit f9adac3

File tree: 4 files changed, +33 -11 lines changed

tensorrt_llm/_torch/attention_backend/flashinfer.py

16 additions & 8 deletions

@@ -186,11 +186,6 @@ def prepare(self) -> None:
         assert self.request_ids is not None
         block_ids_per_seq = self.kv_cache_manager.get_batch_cache_indices(
             self.request_ids)
-        paged_kv_indices = torch.tensor(
-            [x for block_ids in block_ids_per_seq for x in block_ids],
-            dtype=torch.int32)
-        self._paged_kv_indices[:paged_kv_indices.size(0)].copy_(
-            paged_kv_indices, non_blocking=True)

         # number of tokens in the kv cache for each sequence in the batch
         cached_token_lens = torch.tensor(
@@ -212,13 +207,26 @@ def prepare(self) -> None:
             1])

         # number of cache blocks used by each sequence in the cache
-        self.num_blocks = [len(block_ids) for block_ids in block_ids_per_seq]
+        # NOTE: do not use len(block_ids) - that will give you a number
+        # that can be too big if using chunked prefill/kv cache reuse
+        # since we allocate all blocks ahead of time.
+        num_blocks = ((kv_lens + self.page_size - 1) // self.page_size)
+        self.num_blocks = num_blocks.tolist()
         self.num_context_blocks = sum(self.num_blocks[:self.num_contexts])
         self.num_generation_blocks = sum(self.num_blocks[self.num_contexts:])

+        paged_kv_indices_list = []
+        for i, block_ids in enumerate(block_ids_per_seq):
+            paged_kv_indices_list.extend(block_ids[:self.num_blocks[i]])
+
+        paged_kv_indices = torch.tensor(paged_kv_indices_list,
+                                        dtype=torch.int32)
+
+        self._paged_kv_indices[:paged_kv_indices.size(0)].copy_(
+            paged_kv_indices, non_blocking=True)
+
         # number of tokens in the last cache block used by each sequence
-        paged_kv_last_page_len = kv_lens - (torch.Tensor(
-            self.num_blocks).int().cuda(non_blocking=True) - 1) * self.page_size
+        paged_kv_last_page_len = kv_lens - (num_blocks - 1) * self.page_size
         self._paged_kv_last_page_len[:paged_kv_last_page_len.size(0)].copy_(
             paged_kv_last_page_len, non_blocking=True)
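The key change above replaces len(block_ids) with a ceil division over the number of cached tokens, so that only blocks actually holding tokens are counted when blocks have been preallocated for chunked prefill or KV-cache reuse. A minimal standalone sketch of that arithmetic (illustrative page size, sequence lengths, and block IDs, not real KV-cache-manager state):

import torch

# Illustrative values only (hypothetical page size, sequence lengths, block IDs).
page_size = 4
kv_lens = torch.tensor([6, 9], dtype=torch.int32)   # cached tokens per sequence
block_ids_per_seq = [[0, 1, 2, 3], [4, 5, 6, 7]]     # 4 blocks preallocated per sequence

# Ceil division: count only the blocks that actually hold cached tokens.
num_blocks = (kv_lens + page_size - 1) // page_size  # tensor([2, 3])

# len(block_ids) would report 4 blocks per sequence and pull in unused pages;
# slicing to num_blocks keeps only the occupied ones.
paged_kv_indices = [
    bid for ids, n in zip(block_ids_per_seq, num_blocks.tolist()) for bid in ids[:n]
]

# Tokens sitting in the last, possibly partial, page of each sequence.
paged_kv_last_page_len = kv_lens - (num_blocks - 1) * page_size

print(num_blocks.tolist())              # [2, 3]
print(paged_kv_indices)                 # [0, 1, 4, 5, 6]
print(paged_kv_last_page_len.tolist())  # [2, 1]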

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

1 addition & 3 deletions

@@ -46,9 +46,7 @@ def create_py_executor(executor_config: ExecutorConfig,
         )
         executor_config.kv_cache_config.enable_block_reuse = False

-    if pytorch_backend_config.attn_backend in [
-            "FLASHINFER", "FLASHINFER_STAR_ATTENTION"
-    ] and executor_config.enable_chunked_context:
+    if pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION" and executor_config.enable_chunked_context:
         logger.warning(
             f"Disabling chunked context for {pytorch_backend_config.attn_backend} backend"
         )

tests/integration/defs/accuracy/test_llm_api_pytorch.py

14 additions & 0 deletions

@@ -54,6 +54,20 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"

+    @pytest.mark.skip_less_device_memory(32000)
+    @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
+    def test_chunked_prefill(self, attn_backend):
+        pytorch_config = PyTorchConfig(attn_backend=attn_backend, )
+        llm = LLM(self.MODEL_PATH,
+                  enable_chunked_prefill=True,
+                  max_num_tokens=64,
+                  pytorch_backend_config=pytorch_config)
+        with llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device_memory(32000)
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])

tests/integration/test_lists/test-db/l0_dgx_h100.yml

2 additions & 0 deletions

@@ -31,6 +31,8 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
   - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
