Commit bc1713c

fix - fix multi bs core
1 parent: a9f456e

File tree

1 file changed: +5 -1 lines changed


rtp_llm/models_py/modules/factory/attention/cuda_mla_impl/flashinfer_mla_wrapper.py

Lines changed: 5 additions & 1 deletion
@@ -215,7 +215,11 @@ def prepare(self, attn_inputs: PyAttentionInputs, use_cuda_graph: bool = False):
 
     def prepare_cuda_graph(self, attn_inputs: PyAttentionInputs):
         self.fmha_impl.cuda_graph_kv_indices = torch.empty(
-            (self.bs * self.max_context_len // self.seq_size_per_block),
+            (
+                (self.max_context_len + self.seq_size_per_block - 1)
+                // self.seq_size_per_block
+            )
+            * self.bs,
             dtype=torch.int32,
             device="cuda",
         )
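
A note on the fix (an inference from the diff, not text from the commit): the old expression self.bs * self.max_context_len // self.seq_size_per_block floor-divides the pooled token count of the whole batch, so whenever max_context_len is not a multiple of seq_size_per_block the tensor holds fewer KV-cache block indices than the batch actually needs, and the shortfall grows with batch size, which matches the multi-batch-size crash named in the commit message. The new expression ceiling-divides per sequence first, then multiplies by self.bs. A minimal Python sketch with illustrative values (the names mirror the wrapper's attributes; the numbers are made up):

# Illustrative values only; names mirror the wrapper's attributes.
bs = 2                  # batch size
max_context_len = 10    # max tokens per sequence
seq_size_per_block = 8  # tokens held by one KV-cache block

# Old sizing: floor-divide the pooled token count of the whole batch.
old_size = bs * max_context_len // seq_size_per_block   # 20 // 8 == 2

# New sizing: ceiling-divide per sequence, then scale by batch size.
blocks_per_seq = (max_context_len + seq_size_per_block - 1) // seq_size_per_block
new_size = blocks_per_seq * bs                          # ceil(10/8) * 2 == 4

# Each sequence needs 2 blocks, so a batch of 2 needs 4 index slots;
# the old buffer had room for only 2 and would be overrun.
assert old_size == 2 and new_size == 4

Rounding up per sequence guarantees every sequence's block indices fit in whole blocks, regardless of how the other sequences' lengths happen to round.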
