Commit 6e6092f

Author: wangzaijun
Commit message: fix cpu cache aborted
Parent: 1bc6a65

5 files changed (+14, -7 lines)


lightllm/server/router/req_queue/base_queue.py

Lines changed: 9 additions & 1 deletion
@@ -4,10 +4,11 @@
 from lightllm.server.core.objs import FinishStatus
 from lightllm.common.basemodel.infer_lock import g_router_lock
 from lightllm.utils.config_utils import get_fixed_kv_len
+from lightllm.server.core.objs import StartArgs


 class BaseQueue:
-    def __init__(self, args, router, dp_index, dp_size_in_node) -> None:
+    def __init__(self, args: StartArgs, router, dp_index, dp_size_in_node) -> None:
         self.args = args
         self.dp_index = dp_index
         self.dp_size_in_node = dp_size_in_node

@@ -26,6 +27,13 @@ def __init__(self, args, router, dp_index, dp_size_in_node) -> None:
         self.router_token_ratio = args.router_token_ratio  # ratio to determine whether the router is busy
         self.router_max_new_token_len = args.router_max_new_token_len

+    def free_aborted_req_cpu_cache_pages(self, req: Req):
+        if self.args.enable_cpu_cache:
+            self.router.cpu_cache_client.lock.acquire_sleep1ms()
+            self.router.cpu_cache_client.deref_pages(req.cpu_cache_match_page_indexes.get_all())
+            req.cpu_cache_match_page_indexes.clear()
+            self.router.cpu_cache_client.lock.release()
+
     def extend(self, req_group: List[Req]):
         for req in req_group:
             req.sample_params.suggested_dp_index = self.dp_index
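The helper added above dereferences, under the cpu_cache_client lock, the cpu cache pages that were matched for a request that got aborted before it ever ran. A minimal sketch of the same pattern with an exception-safe lock release follows; the try/finally guard is a suggestion of this note, not part of commit 6e6092f, while the client, lock, and page-index APIs are the names used in the diff above:

    # Sketch only: exception-safe variant of the BaseQueue helper introduced above.
    # The try/finally is an assumption/suggestion, not part of this commit;
    # cpu_cache_client, acquire_sleep1ms, deref_pages and
    # cpu_cache_match_page_indexes are the names shown in the diff.
    def free_aborted_req_cpu_cache_pages(self, req):
        if not self.args.enable_cpu_cache:
            return
        client = self.router.cpu_cache_client
        client.lock.acquire_sleep1ms()
        try:
            # Drop this request's references so the shared cpu kv cache pages
            # can be reclaimed even though the request never reached inference.
            client.deref_pages(req.cpu_cache_match_page_indexes.get_all())
            req.cpu_cache_match_page_indexes.clear()
        finally:
            client.lock.release()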

lightllm/server/router/req_queue/chunked_prefill/beam_impl.py

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ def generate_new_batch(self, current_batch: Batch):
         new_batch = Batch(uuid.uuid4().int, can_run_list, dp_size_in_node=self.dp_size_in_node)

         for req in abort_req_list:
+            self.free_aborted_req_cpu_cache_pages(req)
             self.router.shm_req_manager.put_back_req_obj(req)
         self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
         return new_batch

lightllm/server/router/req_queue/chunked_prefill/impl.py

Lines changed: 2 additions & 6 deletions
@@ -78,13 +78,8 @@ def generate_new_batch(self, current_batch: Batch):
         aborted_count = 0

         waiting_queue = self.waiting_req_list
-        # When the cpu cache feature is enabled, the multi_level_kv_cache module allocates some cpu kv
-        # cache pages for each request, and those pages are released in the inference process; dropping
-        # an aborted request directly during scheduling would make those pages unrecoverable, so with
-        # cpu cache enabled we defer the page recycling to the inference process instead of doing it here.
-        disable_queue_aborted = get_env_start_args().enable_cpu_cache
         for req in waiting_queue:
-            if req.is_aborted and not disable_queue_aborted:
+            if req.is_aborted:
                 # Because of management complexity, only requests that have never been scheduled can be dropped from the queue directly on abort.
                 # Paused requests must be resumed first and filtered by the router manager. Keep this approach for now, otherwise managed tokens would leak.
                 aborted_count += 1

@@ -101,6 +96,7 @@ def generate_new_batch(self, current_batch: Batch):
         if len(can_run_list) != 0:
             new_batch = Batch(uuid.uuid4().int, can_run_list, dp_size_in_node=self.dp_size_in_node)
             for req in abort_req_list:
+                self.free_aborted_req_cpu_cache_pages(req)
                 self.router.shm_req_manager.put_back_req_obj(req)
             self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
             return new_batch
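The deleted guard above existed only because, before this commit, the cpu cache pages held by an aborted request were recycled in the inference process, so dropping the request during scheduling would have leaked them. Now that free_aborted_req_cpu_cache_pages releases those pages from the base queue, the scheduling-time decision simplifies; roughly (reconstructed from the deleted and added lines above, with skip_in_queue as an illustrative name):

# Before: aborted requests stayed in the queue whenever cpu cache was enabled,
# deferring page recycling to the inference process.
disable_queue_aborted = get_env_start_args().enable_cpu_cache
skip_in_queue = req.is_aborted and not disable_queue_aborted

# After: aborted, never-scheduled requests are always skipped, and their cpu
# cache page references are dropped via free_aborted_req_cpu_cache_pages when
# the request object is put back (second hunk above).
skip_in_queue = req.is_aborted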

lightllm/server/router/req_queue/chunked_prefill/impl_for_nixl_pd.py

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ def generate_new_batch(self, current_batch: Batch):
         if len(can_run_list) != 0:
             new_batch = Batch(uuid.uuid4().int, can_run_list, dp_size_in_node=self.dp_size_in_node)
             for req in abort_req_list:
+                self.free_aborted_req_cpu_cache_pages(req)
                 self.router.shm_req_manager.put_back_req_obj(req)
             self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
             return new_batch

lightllm/server/router/req_queue/chunked_prefill/impl_for_pd_decode.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ def generate_new_batch(self, current_batch: Batch):
         if len(can_run_list) != 0:
             new_batch = Batch(uuid.uuid4().int, can_run_list, dp_size_in_node=self.dp_size_in_node)
             for req in abort_req_list:
+                self.free_aborted_req_cpu_cache_pages(req)
                 self.router.shm_req_manager.put_back_req_obj(req)
             self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
             return new_batch
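All four queue implementations above apply the same one-line change at the point where aborted requests are handed back to the shared request manager; since they presumably inherit the helper from BaseQueue, the shared pattern looks roughly like this (control-flow context assumed from the hunks, variable and method names taken from the diffs):

if len(can_run_list) != 0:
    new_batch = Batch(uuid.uuid4().int, can_run_list, dp_size_in_node=self.dp_size_in_node)
    for req in abort_req_list:
        # New in this commit: release the aborted request's cpu cache page
        # references before the request object is returned to the shared pool.
        self.free_aborted_req_cpu_cache_pages(req)
        self.router.shm_req_manager.put_back_req_obj(req)
    self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
    return new_batch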
