diff --git a/lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py b/lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py index 5cde46308..d4ba90299 100644 --- a/lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py +++ b/lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py @@ -5,7 +5,7 @@ from typing import Optional, List, Deque from collections import deque from lightllm.server.multi_level_kv_cache.cpu_cache_client import CpuKvCacheClient -from lightllm.utils.envs_utils import get_env_start_args, disable_cpu_kvcache_sync +from lightllm.utils.envs_utils import get_env_start_args from ..infer_batch import InferReq from lightllm.utils.dist_utils import create_new_group_for_current_dp from lightllm.common.basemodel.triton_kernel.kv_cache_offload import offload_gpu_kv_to_cpu, load_cpu_kv_to_gpu @@ -30,7 +30,7 @@ def __init__(self, backend): self.cpu_cache_client = CpuKvCacheClient(only_create_meta_data=False, init_shm_data=False) # 一些算子模式需要同步计算和 cpu cache 的 load 和 offload 操作 - self.need_sync_compute_stream: bool = self.args.enable_fa3 and not disable_cpu_kvcache_sync() + self.need_sync_compute_stream: bool = True def wait(self): """ @@ -72,7 +72,7 @@ def load_cpu_cache_to_reqs(self, reqs: List[InferReq]): g_infer_context.get_overlap_stream().synchronize() # TODO 更有效的分配策略。 - grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1 + grid_num = 16 mem_manager = self.backend.model.mem_manager if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None: @@ -226,7 +226,7 @@ def _start_kv_cache_offload_task( token_indexes = self.backend.model.req_manager.req_to_token_indexs[req.req_idx, 0:move_token_num] # TODO 更有效的分配策略。 - grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1 + grid_num = 16 mem_manager = self.backend.model.mem_manager if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None: diff --git a/lightllm/utils/envs_utils.py b/lightllm/utils/envs_utils.py index 06f53b307..b5822a342 100644 --- a/lightllm/utils/envs_utils.py +++ b/lightllm/utils/envs_utils.py @@ -189,14 +189,6 @@ def use_whisper_sdpa_attention() -> bool: return enable_env_vars("LIGHTLLM_USE_WHISPER_SDPA_ATTENTION") -@lru_cache(maxsize=None) -def disable_cpu_kvcache_sync() -> bool: - """ - 实验用环境遍历,未来可能会移除 - """ - return enable_env_vars("LIGHTLLM_DISABLE_CPU_CACHE_SYNC") - - @lru_cache(maxsize=None) def enable_radix_tree_timer_merge() -> bool: """