@@ -5,7 +5,7 @@
 from typing import Optional, List, Deque
 from collections import deque
 from lightllm.server.multi_level_kv_cache.cpu_cache_client import CpuKvCacheClient
-from lightllm.utils.envs_utils import get_env_start_args, disable_cpu_kvcache_sync
+from lightllm.utils.envs_utils import get_env_start_args
 from ..infer_batch import InferReq
 from lightllm.utils.dist_utils import create_new_group_for_current_dp
 from lightllm.common.basemodel.triton_kernel.kv_cache_offload import offload_gpu_kv_to_cpu, load_cpu_kv_to_gpu
@@ -30,7 +30,7 @@ def __init__(self, backend):
         self.cpu_cache_client = CpuKvCacheClient(only_create_meta_data=False, init_shm_data=False)

         # Some operator modes need the compute stream synchronized with the cpu cache load and offload operations.
-        self.need_sync_compute_stream: bool = self.args.enable_fa3 and not disable_cpu_kvcache_sync()
+        self.need_sync_compute_stream: bool = True

     def wait(self):
         """
@@ -72,7 +72,7 @@ def load_cpu_cache_to_reqs(self, reqs: List[InferReq]):
         g_infer_context.get_overlap_stream().synchronize()

         # TODO: a more efficient allocation strategy.
-        grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1
+        grid_num = 16

         mem_manager = self.backend.model.mem_manager
         if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None:
@@ -226,7 +226,7 @@ def _start_kv_cache_offload_task(
         token_indexes = self.backend.model.req_manager.req_to_token_indexs[req.req_idx, 0:move_token_num]

         # TODO: a more efficient allocation strategy.
-        grid_num = 16 if self.need_sync_compute_stream or (not self.args.enable_fa3) else 1
+        grid_num = 16

         mem_manager = self.backend.model.mem_manager
         if hasattr(mem_manager, "scale_buffer") and mem_manager.scale_buffer is not None:
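Both call sites now launch the copy kernels with a fixed `grid_num = 16` instead of dropping to a single program instance in the non-synchronized path. For intuition, a hedged sketch of how a fixed grid count can drive a Triton copy kernel via a grid-stride loop (this is illustrative, not lightllm's `offload_gpu_kv_to_cpu`):

```python
# Illustrative sketch: a fixed grid of `grid_num` program instances covers any
# number of elements with a grid-stride loop. Not lightllm's actual kernel.
import torch
import triton
import triton.language as tl

@triton.jit
def _strided_copy_kernel(src_ptr, dst_ptr, n_elems, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    num_progs = tl.num_programs(0)
    # Each program instance copies every num_progs-th block of elements.
    for start in range(pid * BLOCK, n_elems, num_progs * BLOCK):
        offs = start + tl.arange(0, BLOCK)
        mask = offs < n_elems
        tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)

def strided_copy(src: torch.Tensor, dst: torch.Tensor, grid_num: int = 16):
    # Launch exactly grid_num program instances, mirroring the hardcoded value.
    _strided_copy_kernel[(grid_num,)](src, dst, src.numel(), BLOCK=1024)
```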