@@ -30,6 +30,14 @@ def __init__(self, model):
3030 self.head_dim = model.config["hidden_size"] // model.config["num_attention_heads"]
3131 self.workspace_buffer = torch.empty(256 * 1024 * 1024, dtype=torch.int8).to(get_current_device_id())
3232 self.max_seq_length = model.max_seq_length
33+ self.kv_indices_buffer = [
34+ torch.empty(model.graph_max_batch_size * self.max_seq_length, dtype=torch.int32).to(
35+ get_current_device_id()
36+ ),
37+ torch.empty(model.graph_max_batch_size * self.max_seq_length, dtype=torch.int32).to(
38+ get_current_device_id()
39+ ),
40+ ]
3341 self.q_data_type = model.data_type
3442 self.kv_data_type = model.data_type
3543
@@ -51,8 +59,6 @@ def __init__(self, kvargs):
5159 self.enable_flashinfer = (
5260 get_env_start_args().enable_flashinfer_prefill or get_env_start_args().enable_flashinfer_decode
5361 )
54- if self.enable_flashinfer:
55- self.infer_state_class = LlamaFlashInferStateInfo
5662 super().__init__(kvargs)
5763 return
5864
@@ -61,8 +67,6 @@ def _init_config(self):
6167 # rename key
6268 # repair_config()
6369 self._reset_num_key_value_heads()
64- if self.enable_flashinfer:
65- self.flashinfer_extra_state = LlamaFlashInferStateExtraInfo(self)
6670 return
6771
6872 def _reset_num_key_value_heads(self):
@@ -90,6 +94,9 @@ def _init_mem_manager(self):
9094 def _init_inferstate_cls(self):
9195 if get_env_start_args().enable_fa3:
9296 self.infer_state_class = FlashAttentionStateInfo
97+ elif self.enable_flashinfer:
98+ self.infer_state_class = LlamaFlashInferStateInfo
99+ self.flashinfer_extra_state = LlamaFlashInferStateExtraInfo(self)
93100
94101 def _init_custom(self):
95102 """
0 commit comments