flashinfer/decode.py (3 additions & 4 deletions)
@@ -1982,7 +1982,6 @@ def trtllm_batch_decode_with_kv_cache(
     workspace_buffer: torch.Tensor,
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
-    max_seq_len: int,
     bmm1_scale: float,
     bmm2_scale: float,  # todo(Yingyi): add dynamic scale tensor later
     window_left: int = -1,
@@ -2011,9 +2010,6 @@ def trtllm_batch_decode_with_kv_cache(
     seq_lens : torch.Tensor
         A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``

-    max_seq_len : int
-        max sequence length for kv_cache
-
     bmm1_scale : float
         fused scale for bmm1 input.
@@ -2110,6 +2106,9 @@ def trtllm_batch_decode_with_kv_cache(
     else:
         raise ValueError(f"Invalid out_dtype: {out_dtype}")

+    page_size = k_cache.shape[3]
Contributor (critical):

There appears to be an indexing error in the calculation of page_size. Based on the function's docstring and the tensor unpacking logic, k_cache has a shape of [num_pages, num_kv_heads, page_size, head_dim] when kv_layout is HND.

The current implementation uses k_cache.shape[3], which corresponds to head_dim, not page_size. The correct index for page_size is 2.

This will cause an incorrect max_seq_len to be calculated, which is a critical bug that could lead to incorrect kernel behavior or memory access errors.

Suggested change:
-    page_size = k_cache.shape[3]
+    page_size = k_cache.shape[2]
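
For illustration, a minimal standalone sketch (tensor sizes are hypothetical, assuming the HND layout ``[num_pages, num_kv_heads, page_size, head_dim]`` described in the docstring) showing which dimension each index actually reads:

import torch

# Hypothetical sizes, assuming the HND layout from the docstring:
# [num_pages, num_kv_heads, page_size, head_dim]
num_pages, num_kv_heads, page_size, head_dim = 128, 8, 16, 64
k_cache = torch.empty(num_pages, num_kv_heads, page_size, head_dim)

assert k_cache.shape[2] == page_size  # 16 -- the intended value
assert k_cache.shape[3] == head_dim   # 64 -- what shape[3] reads instead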

+    num_pages = block_tables.shape[1]
+    max_seq_len = num_pages * page_size
     run_func(
         out,
         out_scale_factor,
flashinfer/prefill.py (3 additions & 3 deletions)
@@ -3052,7 +3052,6 @@ def trtllm_batch_context_with_kv_cache(
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
     max_q_len: int,
-    max_kv_len: int,
     bmm1_scale: float,
     bmm2_scale: float,
     batch_size: int,
@@ -3081,8 +3080,6 @@ def trtllm_batch_context_with_kv_cache(
         A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``
     max_q_len : int
         max sequence length for query
-    max_kv_len : int
-        max sequence length for kv_cache
     bmm1_scale : float
         fused scale for bmm1 input.
     bmm2_scale : float
@@ -3178,6 +3175,9 @@ def trtllm_batch_context_with_kv_cache(
     else:
         raise ValueError(f"Invalid out_dtype: {out_dtype}")

+    page_size = k_cache.shape[3]
Contributor (critical):

There is an indexing error when calculating page_size. According to the function's docstring and tensor unpacking logic, k_cache has a shape of [num_pages, num_kv_heads, page_size, head_dim] when kv_layout is HND. Therefore, page_size should be accessed via k_cache.shape[2].

The current code uses k_cache.shape[3], which incorrectly retrieves the head_dim. This will lead to an incorrect max_kv_len and is a critical bug.

Suggested change:
-    page_size = k_cache.shape[3]
+    page_size = k_cache.shape[2]
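
To illustrate the downstream effect, a small sketch (sizes hypothetical; it assumes ``block_tables`` has shape ``[batch_size, num_pages]``, as implied by the use of ``block_tables.shape[1]`` in the new code) comparing the ``max_kv_len`` each index produces:

import torch

# Hypothetical sizes: page_size=16, head_dim=64, 32 pages per sequence.
k_cache = torch.empty(128, 8, 16, 64)  # [num_pages, num_kv_heads, page_size, head_dim]
block_tables = torch.zeros(4, 32, dtype=torch.int32)  # [batch_size, num_pages]

num_pages = block_tables.shape[1]
max_kv_len_wrong = num_pages * k_cache.shape[3]  # 32 * 64 = 2048 (uses head_dim)
max_kv_len_right = num_pages * k_cache.shape[2]  # 32 * 16 = 512  (uses page_size)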

+    num_pages = block_tables.shape[1]
+    max_kv_len = num_pages * page_size
     run_func(
         out,
         out_scale_factor,
tests/test_trtllm_gen_context.py (0 additions & 2 deletions)
@@ -191,7 +191,6 @@ def test_trtllm_batch_context_wrapper(
         block_tables=block_tables,
         seq_lens=seq_lens,
         max_q_len=qo_len,
-        max_kv_len=kv_len,
         bmm1_scale=q_scale / math.sqrt(head_dim),
         bmm2_scale=1,
         batch_size=batch_size,
@@ -383,7 +382,6 @@ def test_trtllm_batch_prefill(
         block_tables,
         seq_lens_gpu,
         max_q_len,
-        max_seq_len,
         q_scale * k_scale * sm_scale,  # bmm1_scale
         v_scale / o_scale,  # bmm2_scale
         batch_size,
tests/test_trtllm_gen_decode.py (0 additions & 1 deletion)
@@ -276,7 +276,6 @@ def test_trtllm_batch_decode_fmha(
         workspace_buffer,
         block_tables,
         seq_lens_gpu,
-        max_seq_len,
         q_scale * k_scale * sm_scale,  # bmm1_scale
         v_scale / o_scale,  # bmm2_scale
         window_left,  # window_left