Skip to content

Commit 359063f

Browse files
reduce the memory of flashinfer (#850)
Co-authored-by: baishihao <baishihao@sensetime.com>
Co-authored-by: hiworldwzj <30762946+hiworldwzj@users.noreply.github.com>
1 parent 6f2e76f commit 359063f

File tree

3 files changed: +17 −4 lines changed

lightllm/models/deepseek2/flashinfer_struct.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -23,9 +23,14 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
         if not self.is_prefill:
             if get_env_start_args().enable_flashinfer_decode:
                 self.q_indptr = torch.arange(self.batch_size + 1, dtype=torch.int32).to(input_ids.device)
-                self.kv_indices = torch.empty(
-                    self.batch_size * self.flashinfer_extra_state.max_seq_length, dtype=torch.int32
-                ).to(input_ids.device)
+                if self.batch_size <= model.graph_max_batch_size:
+                    self.kv_indices = self.flashinfer_extra_state.kv_indices_buffer[self.microbatch_index][
+                        : self.batch_size * self.flashinfer_extra_state.max_seq_length
+                    ]
+                else:
+                    self.kv_indices = torch.empty(
+                        self.batch_size * self.flashinfer_extra_state.max_seq_length, dtype=torch.int32
+                    ).to(input_ids.device)
             repack_kv_index(
                 self.req_manager.req_to_token_indexs,
                 self.b_req_idx,
```

lightllm/models/deepseek2/model.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -31,6 +31,14 @@ def __init__(self, model):
         self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(get_current_device_id())
         self.max_seq_length = model.max_seq_length
         self.softmax_scale = (self.qk_nope_head_dim + self.qk_rope_head_dim) ** (-0.5)
+        self.kv_indices_buffer = [
+            torch.empty(model.graph_max_batch_size * self.max_seq_length, dtype=torch.int32).to(
+                get_current_device_id()
+            ),
+            torch.empty(model.graph_max_batch_size * self.max_seq_length, dtype=torch.int32).to(
+                get_current_device_id()
+            ),
+        ]
         if model.config["rope_scaling"] is not None:
             rope_scaling = model.config["rope_scaling"]
             mscale_all_dim = rope_scaling.get("mscale_all_dim", 0)
```

lightllm/server/api_start.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -142,7 +142,7 @@ def normal_or_p_d_start(args):
     else:
         # chunked 模式下
         if args.batch_max_tokens is None:
-            args.batch_max_tokens = min(args.max_req_total_len, 2 * args.chunked_prefill_size)
+            args.batch_max_tokens = min(args.max_req_total_len, 2 * args.chunked_prefill_size + 256)

     assert (
         args.batch_max_tokens >= args.chunked_prefill_size
```

0 commit comments

Comments (0)