Skip to content

Commit 2b3f369

Browse files
blueswhen and niushengxiao authored
fix: use sync for offloading in the cpu cache (#1163)
Co-authored-by: niushengxiao <[email protected]>
1 parent 4b70b7a commit 2b3f369

File tree

2 files changed

+4
-12
lines changed

2 files changed

+4
-12
lines changed

lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import Optional, List, Deque
66
from collections import deque
77
from lightllm.server.multi_level_kv_cache.cpu_cache_client import CpuKvCacheClient
8-
from lightllm.utils.envs_utils import get_env_start_args, disable_cpu_kvcache_sync
8+
from lightllm.utils.envs_utils import get_env_start_args
99
from ..infer_batch import InferReq
1010
from lightllm.utils.dist_utils import create_new_group_for_current_dp
1111
from lightllm.common.basemodel.triton_kernel.kv_cache_offload import offload_gpu_kv_to_cpu, load_cpu_kv_to_gpu
@@ -30,7 +30,7 @@ def __init__(self, backend):
3030
self.cpu_cache_client = CpuKvCacheClient(only_create_meta_data=False, init_shm_data=False)
3131

3232
# 一些算子模式需要同步计算和 cpu cache 的 load 和 offload 操作
33-
self.need_sync_compute_stream: bool = self.args.enable_fa3 and not disable_cpu_kvcache_sync()
33+
self.need_sync_compute_stream: bool = True
3434

3535
def wait(self):
3636
"""
@@ -72,7 +72,7 @@ def load_cpu_cache_to_reqs(self, reqs: List[InferReq]):
7272
g_infer_context.get_overlap_stream().synchronize()
7373

7474
# TODO 更有效的分配策略。
75-
grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1
75+
grid_num = 16
7676

7777
mem_manager = self.backend.model.mem_manager
7878
if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None:
@@ -226,7 +226,7 @@ def _start_kv_cache_offload_task(
226226
token_indexes = self.backend.model.req_manager.req_to_token_indexs[req.req_idx, 0:move_token_num]
227227

228228
# TODO 更有效的分配策略。
229-
grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1
229+
grid_num = 16
230230

231231
mem_manager = self.backend.model.mem_manager
232232
if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None:

lightllm/utils/envs_utils.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,14 +189,6 @@ def use_whisper_sdpa_attention() -> bool:
189189
return enable_env_vars("LIGHTLLM_USE_WHISPER_SDPA_ATTENTION")
190190

191191

192-
@lru_cache(maxsize=None)
193-
def disable_cpu_kvcache_sync() -> bool:
194-
"""
195-
实验用环境遍历,未来可能会移除
196-
"""
197-
return enable_env_vars("LIGHTLLM_DISABLE_CPU_CACHE_SYNC")
198-
199-
200192
@lru_cache(maxsize=None)
201193
def enable_radix_tree_timer_merge() -> bool:
202194
"""

0 commit comments

Comments
 (0)