add multi_level_kv_cache start

liujiacheng · liujiacheng · commit 088345bd8e67 · 2025-08-11T07:14:22.000Z
diff --git a/lightllm/server/multi_level_kv_cache/manager.py b/lightllm/server/multi_level_kv_cache/manager.py
@@ -91,6 +91,8 @@ def _handle_group_req_cpu_cache_match(self, group_req_indexes: GroupReqIndexes,
             # diverse_mode 只有主请求一个初始化 cpu cache 信息。
             if self.args.diverse_mode and req.request_id != req.group_req_id:
                 continue
+            if req.is_aborted:
+                continue
 
             self.cpu_cache_client.lock.acquire_sleep1ms()
             req: Req = req
diff --git a/lightllm/server/router/req_queue/chunked_prefill/beam_impl.py b/lightllm/server/router/req_queue/chunked_prefill/beam_impl.py
@@ -2,6 +2,7 @@
 from typing import List
 from ...batch import Batch, Req
 from lightllm.server.router.req_queue.base_queue import BaseQueue
+from lightllm.utils.envs_utils import get_env_start_args
 
 
 class ChunkedBeamContinuesBatchQueue(BaseQueue):
@@ -90,8 +91,13 @@ def generate_new_batch(self, current_batch: Batch):
         new_batch_first_router_need_tokens = 0  # 主要是对 prefill 大块计算时候的token数量限制
         aborted_count = 0
         cur_group_reqs = []
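+        # When the cpu cache is enabled, the multi_level_kv_cache module may already have
+        # allocated cpu kv cache pages for a waiting request, and those pages are only released
+        # inside the inference process. Dropping an aborted request here would leave its pages
+        # unreclaimed, so in that mode aborted requests are not filtered out of the queue here.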
+        disable_queue_aborted = get_env_start_args().enable_cpu_cache
         for req in self.waiting_req_list:
-            if req.is_aborted:
+            if req.is_aborted and not disable_queue_aborted:
                 aborted_count += 1
                 abort_req_list.append(req)
                 continue
diff --git a/lightllm/server/router/req_queue/chunked_prefill/impl.py b/lightllm/server/router/req_queue/chunked_prefill/impl.py
@@ -3,6 +3,7 @@
 from ...batch import Batch, Req
 from lightllm.server.router.req_queue.base_queue import BaseQueue
 from lightllm.common.basemodel.infer_lock import g_router_lock
+from lightllm.utils.envs_utils import get_env_start_args
 
 
 class ChunkedPrefillQueue(BaseQueue):
@@ -76,9 +77,13 @@ def generate_new_batch(self, current_batch: Batch):
         aborted_count = 0
 
         waiting_queue = self.waiting_req_list
-
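+        # When the cpu cache is enabled, the multi_level_kv_cache module may already have
+        # allocated cpu kv cache pages for a waiting request, and those pages are only released
+        # inside the inference process. Dropping an aborted request here would leave its pages
+        # unreclaimed, so in that mode aborted requests are not filtered out of the queue here.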
+        disable_queue_aborted = get_env_start_args().enable_cpu_cache
         for req in waiting_queue:
-            if req.is_aborted:
+            if req.is_aborted and not disable_queue_aborted:
                 # 由于管理的复杂性，只有没有被调度运行过的请求可以因为abort直接在队列中忽略掉.
                 # 暂停的请求需要恢复后，由 router manager 部分来过滤。暂时保持这种处理方法, 否则会导致管理token的泄漏
                 aborted_count += 1