We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 6e6092f, commit f6119c0 (Copy full SHA for f6119c0)
lightllm/server/router/req_queue/chunked_prefill/beam_impl.py
@@ -91,13 +91,8 @@ def generate_new_batch(self, current_batch: Batch):
91
new_batch_first_router_need_tokens = 0 # 主要是对 prefill 大块计算时候的token数量限制
92
aborted_count = 0
93
cur_group_reqs = []
94
- # 在开启 cpu cache 功能的情况下,由于multi_level_kv_cache 模块会对请求申请一些cpu kv cache
95
- # 页面,这些页面的释放是在推理进程中完成的,所以如果直接在调度的时候就退出,会导致这些页面无法回收
96
- # ,所以在使能 cpu cache 的情况下,不在调度的过程中进行 cpu cache页面的释放,而是延迟到推理的
97
- # 过程中进行回收
98
- disable_queue_aborted = get_env_start_args().enable_cpu_cache
99
for req in self.waiting_req_list:
100
- if req.is_aborted and not disable_queue_aborted:
+ if req.is_aborted:
101
aborted_count += 1
102
abort_req_list.append(req)
103
continue
0 commit comments