Commit baaf435

fix for dynamic prompt cache

1 parent 09659ab

File tree

1 file changed (+5 −5 lines)

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 5 additions & 5 deletions
@@ -253,7 +253,7 @@ def init_all(self):
         self.cur_output_len = 0
         self.finish_status = FinishStatus()
 
-        if self.paused or not self.initialized:
+        if self.paused or not self.initialized or self.is_chunked:
             # Requests that use the prompt_cache feature need the fill and restore steps done ahead of time.
             input_token_ids = self.get_input_token_ids()
             if g_infer_context.radix_cache is not None and len(input_token_ids) > 1:
@@ -263,10 +263,10 @@ def init_all(self):
                 if share_node is not None:
                     self.shared_kv_node = share_node
                     ready_cache_len = share_node.node_prefix_total_len
-                    g_infer_context.req_manager.req_to_token_indexs[
-                        self.req_idx, self.cur_kv_len : self.cur_kv_len + ready_cache_len
-                    ] = value_tensor
-                    self.cur_kv_len += int(ready_cache_len)  # serialization issue: may be a numpy.int64, convert with int(*)
+                    g_infer_context.req_manager.req_to_token_indexs[self.req_idx, 0:ready_cache_len] = value_tensor
+                    self.cur_kv_len = max(
+                        self.cur_kv_len, int(ready_cache_len)
+                    )  # dynamic prompt cache for chunked prefill
             self.shm_req.prompt_cache_len = self.cur_kv_len  # record the prompt cache hit length
 
         self.shm_req.shm_cur_kv_len = self.cur_kv_len
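
Why the change works, in brief: with chunked prefill a request can pass through init_all more than once, with cur_kv_len already covering the chunks prefilled so far (the added is_chunked condition is what lets such requests re-enter the prompt-cache path at all). The old code wrote the matched cache entries starting at cur_kv_len and accumulated with "+=", which inflates the KV length on a re-run; the new code writes the radix-cache prefix at its true position [0:ready_cache_len] and keeps the max of the two lengths, so the restore is idempotent. Below is a minimal sketch contrasting the two update rules; the helper names and the concrete numbers are hypothetical illustrations, not part of the repository:

# Minimal sketch (hypothetical helper names, not lightllm's API) contrasting
# the old and new cur_kv_len update rules when init_all() runs a second time
# for the same chunked-prefill request.

def restore_old(cur_kv_len: int, ready_cache_len: int) -> int:
    # Old rule: write the cached entries after cur_kv_len and accumulate.
    # On a re-run, cur_kv_len may already cover the cached prefix, so the
    # addition double-counts those tokens.
    return cur_kv_len + ready_cache_len

def restore_new(cur_kv_len: int, ready_cache_len: int) -> int:
    # New rule: the radix-cache match is always a prefix starting at
    # position 0, so it is written to [0:ready_cache_len] and only the
    # larger of the two lengths is kept. Re-running changes nothing.
    return max(cur_kv_len, ready_cache_len)

# A request that already prefilled 128 tokens and then re-enters init_all()
# with a 64-token cached prefix match:
cur_kv_len, ready_cache_len = 128, 64
assert restore_old(cur_kv_len, ready_cache_len) == 192  # wrong: length inflated
assert restore_new(cur_kv_len, ready_cache_len) == 128  # correct: idempotent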
