Skip to content

Commit 088345b

Browse files
author
liujiacheng
committed
add multi_level_kv_cache start
1 parent b011825 commit 088345b

File tree

3 files changed

+16
-3
lines changed

3 files changed

+16
-3
lines changed

lightllm/server/multi_level_kv_cache/manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ def _handle_group_req_cpu_cache_match(self, group_req_indexes: GroupReqIndexes,
9191
# In diverse_mode, only the main request (request_id == group_req_id) initializes the cpu cache info.
9292
if self.args.diverse_mode and req.request_id != req.group_req_id:
9393
continue
94+
if req.is_aborted:
95+
continue
9496

9597
self.cpu_cache_client.lock.acquire_sleep1ms()
9698
req: Req = req

lightllm/server/router/req_queue/chunked_prefill/beam_impl.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import List
33
from ...batch import Batch, Req
44
from lightllm.server.router.req_queue.base_queue import BaseQueue
5+
from lightllm.utils.envs_utils import get_env_start_args
56

67

78
class ChunkedBeamContinuesBatchQueue(BaseQueue):
@@ -90,8 +91,13 @@ def generate_new_batch(self, current_batch: Batch):
9091
new_batch_first_router_need_tokens = 0 # 主要是对 prefill 大块计算时候的token数量限制
9192
aborted_count = 0
9293
cur_group_reqs = []
94+
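# When the cpu cache feature is enabled, the multi_level_kv_cache module allocates some cpu kv cache
95+
# pages for a request, and those pages are released in the inference process. If an aborted request
96+
# were dropped from the queue here at scheduling time, those pages could never be reclaimed. So with
97+
# cpu cache enabled, aborted requests are not filtered out here; reclamation is deferred to inference.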
98+
disable_queue_aborted = get_env_start_args().enable_cpu_cache
9399
for req in self.waiting_req_list:
94-
if req.is_aborted:
100+
if req.is_aborted and not disable_queue_aborted:
95101
aborted_count += 1
96102
abort_req_list.append(req)
97103
continue

lightllm/server/router/req_queue/chunked_prefill/impl.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from ...batch import Batch, Req
44
from lightllm.server.router.req_queue.base_queue import BaseQueue
55
from lightllm.common.basemodel.infer_lock import g_router_lock
6+
from lightllm.utils.envs_utils import get_env_start_args
67

78

89
class ChunkedPrefillQueue(BaseQueue):
@@ -76,9 +77,13 @@ def generate_new_batch(self, current_batch: Batch):
7677
aborted_count = 0
7778

7879
waiting_queue = self.waiting_req_list
79-
80+
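# When the cpu cache feature is enabled, the multi_level_kv_cache module allocates some cpu kv cache
81+
# pages for a request, and those pages are released in the inference process. If an aborted request
82+
# were dropped from the queue here at scheduling time, those pages could never be reclaimed. So with
83+
# cpu cache enabled, aborted requests are not filtered out here; reclamation is deferred to inference.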
84+
disable_queue_aborted = get_env_start_args().enable_cpu_cache
8085
for req in waiting_queue:
81-
if req.is_aborted:
86+
if req.is_aborted and not disable_queue_aborted:
8287
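# Because of management complexity, only requests that have never been scheduled to run may be dropped directly from the queue on abort.
8388
# Paused requests must be resumed first and are then filtered by the router manager. Keep this approach for now; otherwise managed tokens would leak.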
8489
aborted_count += 1

0 commit comments

Comments
 (0)