Skip to content

Commit 92bd068

Browse files
committed
fix
1 parent 5f995ae commit 92bd068

File tree

2 files changed

+6
-3
lines changed

2 files changed

+6
-3
lines changed

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,10 +283,10 @@ def init_all(self):
283283

284284
if self.paused or not self.initialized:
285285
# 如果是具有 prompt_cache 的使用特性则需要进行提前的填充和恢复操作。
286-
if g_infer_context.radix_cache is not None and self.get_cur_total_len() > 2:
286+
if g_infer_context.radix_cache is not None and self.get_cur_total_len() > 1:
287287
input_token_ids = self.shm_req.shm_prompt_ids.arr[0 : self.get_cur_total_len()]
288288
key = torch.tensor(input_token_ids, dtype=torch.int64, device="cpu")
289-
key = key[0 : len(key) - 2] # 最后两个不需要,因为需要一个额外的token,让其在prefill的时候输出下一个token的值
289+
key = key[0 : len(key) - 1] # 最后一个不需要,因为需要一个额外的token,让其在prefill的时候输出下一个token的值
290290
share_node, kv_len, value_tensor = g_infer_context.radix_cache.match_prefix(key, update_refs=True)
291291
if share_node is not None:
292292
self.shared_kv_node = share_node

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,10 @@ def _get_classed_reqs(self, req_ids: List[int], no_decode: bool = False):
335335
ok_finished_reqs.append(req_obj)
336336
continue
337337

338-
is_decode = req_obj.cur_kv_len + 1 == req_obj.get_cur_total_len()
338+
is_decode = (
339+
req_obj.cur_kv_len + 1 == req_obj.get_cur_total_len()
340+
and req_obj.cur_kv_len + 1 != req_obj.shm_req.input_len
341+
)
339342

340343
if not is_decode:
341344
prefill_reqs.append(req_obj)

0 commit comments

Comments
 (0)