@@ -5,7 +5,7 @@
from typing import Optional, List, Deque
from collections import deque
from lightllm.server.multi_level_kv_cache.cpu_cache_client import CpuKvCacheClient
-from lightllm.utils.envs_utils import get_env_start_args, disable_cpu_kvcache_sync
+from lightllm.utils.envs_utils import get_env_start_args
from ..infer_batch import InferReq
from lightllm.utils.dist_utils import create_new_group_for_current_dp
from lightllm.common.basemodel.triton_kernel.kv_cache_offload import offload_gpu_kv_to_cpu, load_cpu_kv_to_gpu
@@ -30,7 +30,7 @@ def __init__(self, backend):
self.cpu_cache_client = CpuKvCacheClient(only_create_meta_data=False, init_shm_data=False)

# Some operator modes need the compute stream synchronized with CPU cache load and offload operations
-self.need_sync_compute_stream: bool = self.args.enable_fa3 and not disable_cpu_kvcache_sync()
+self.need_sync_compute_stream: bool = True
Contributor review comment (medium):
Given that self.need_sync_compute_stream is now hardcoded to True, consider a follow-up refactoring to improve clarity. You could remove this attribute entirely and directly call synchronize() where it's used (e.g., lines 72, 162, 175), removing the now-redundant if checks. This would make it explicit that synchronization is always performed for CPU cache operations.

Additionally, the comments mentioning that synchronization is for fa3 (e.g., line 71, 161, 174) could be updated to reflect that this is now a general requirement for CPU cache, not specific to fa3.
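
A rough sketch of what that follow-up could look like is below, assuming the call sites keep their current shape; the method skeleton is a placeholder, since this diff only shows fragments of the surrounding module:

# Hypothetical sketch of the suggested refactor, not the actual file contents.
# Only get_overlap_stream().synchronize() and grid_num = 16 come from this diff;
# g_infer_context is assumed to already be imported by the real module.
def load_cpu_cache_to_reqs(self, reqs):
    # Synchronization is now a general requirement for CPU cache load/offload,
    # not an fa3-specific one, so the `if self.need_sync_compute_stream:` guard
    # and the attribute itself can both be removed.
    g_infer_context.get_overlap_stream().synchronize()

    # TODO: a more efficient allocation strategy.
    grid_num = 16
    ...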


def wait(self):
"""
@@ -72,7 +72,7 @@ def load_cpu_cache_to_reqs(self, reqs: List[InferReq]):
g_infer_context.get_overlap_stream().synchronize()

# TODO: a more efficient allocation strategy.
-grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1
+grid_num = 16

mem_manager = self.backend.model.mem_manager
if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None:
@@ -226,7 +226,7 @@ def _start_kv_cache_offload_task(
token_indexes = self.backend.model.req_manager.req_to_token_indexs[req.req_idx, 0:move_token_num]

# TODO: a more efficient allocation strategy.
-grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1
+grid_num = 16

mem_manager = self.backend.model.mem_manager
if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None:
lightllm/utils/envs_utils.py: 8 changes (0 additions, 8 deletions)
@@ -189,14 +189,6 @@ def use_whisper_sdpa_attention() -> bool:
return enable_env_vars("LIGHTLLM_USE_WHISPER_SDPA_ATTENTION")


-@lru_cache(maxsize=None)
-def disable_cpu_kvcache_sync() -> bool:
-    """
-    Experimental environment variable; may be removed in the future.
-    """
-    return enable_env_vars("LIGHTLLM_DISABLE_CPU_CACHE_SYNC")


@lru_cache(maxsize=None)
def enable_radix_tree_timer_merge() -> bool:
"""