Commit c7d7a1b
Author: wangzaijun

improve cpu kv kernel setting

1 parent 0528bea · commit c7d7a1b

File tree: 3 files changed (+6, -1 lines)


lightllm/common/basemodel/triton_kernel/kv_cache_offload.py

Lines changed: 2 additions & 1 deletion

@@ -116,6 +116,7 @@ def offload_gpu_kv_to_cpu(
     page_readies: torch.Tensor,
     tp_index: int,
     tp_world_size: int,
+    grid_num: int,
     _cache_data={},
 ):
     """
@@ -231,7 +232,7 @@ def offload_gpu_kv_to_cpu(
     assert token_block_size == triton.next_power_of_2(token_block_size)
     page_num = page_indexes.shape[0]
 
-    grid = (1,)
+    grid = (grid_num,)
     num_warps = 4
 
     _offload_gpu_kv_to_cpu[grid](
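
Previously the kernel always launched with grid = (1,), i.e. a single Triton program walked every page serially; exposing grid_num lets the caller spread the offload copy across programs (and thus SMs). A minimal sketch of the pattern, using a hypothetical page-copy kernel rather than the actual _offload_gpu_kv_to_cpu:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def _copy_pages_kernel(src_ptr, dst_ptr, page_num, PAGE_SIZE: tl.constexpr):
    # Split the pages contiguously across however many programs were launched,
    # so the same kernel is correct for grid=(1,) and for grid=(grid_num,).
    pid = tl.program_id(0)
    num_progs = tl.num_programs(0)
    pages_per_prog = tl.cdiv(page_num, num_progs)
    start = pid * pages_per_prog
    end = tl.minimum(start + pages_per_prog, page_num)
    for page in range(start, end):
        offs = page * PAGE_SIZE + tl.arange(0, PAGE_SIZE)
        tl.store(dst_ptr + offs, tl.load(src_ptr + offs))


def copy_pages(src: torch.Tensor, dst: torch.Tensor, grid_num: int = 1):
    # As in the commit: the grid size is a caller-supplied knob instead of a
    # hardcoded (1,). PAGE_SIZE must be a power of two for tl.arange.
    page_num, page_size = src.shape
    _copy_pages_kernel[(grid_num,)](src, dst, page_num, PAGE_SIZE=page_size)
```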

lightllm/server/core/objs/start_args_type.py

Lines changed: 3 additions & 0 deletions

@@ -122,3 +122,6 @@ class StartArgs:
     # multi_modal
     enable_multimodal: bool = field(default=False)
     enable_multimodal_audio: bool = field(default=False)
+
+    # kernel setting
+    enable_fa3: bool = field(default=False)
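
The new enable_fa3 setting is a plain dataclass field defaulting to False. The diff does not show how the launcher surfaces it; a hedged sketch of typical argparse wiring, with the flag name assumed from the field:

```python
import argparse
from dataclasses import dataclass, field


@dataclass
class StartArgsSketch:
    # Mirrors the new field from the commit; the rest of StartArgs is omitted.
    enable_fa3: bool = field(default=False)


# Hypothetical CLI wiring -- lightllm's actual parser may differ.
parser = argparse.ArgumentParser()
parser.add_argument("--enable_fa3", action="store_true", help="use the FA3 attention kernels")
cli = parser.parse_args(["--enable_fa3"])
print(StartArgsSketch(enable_fa3=cli.enable_fa3))  # StartArgsSketch(enable_fa3=True)
```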

lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py

Lines changed: 1 addition & 0 deletions

@@ -202,6 +202,7 @@ def _start_kv_cache_offload_task(
             page_readies=page_readies,
             tp_index=self.backend.rank_in_dp,
             tp_world_size=self.backend.dp_world_size,
+            grid_num=1 if self.args.enable_fa3 else 16,  # TODO: a more effective allocation strategy.
         )
 
         sync_event = torch.cuda.Event()
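
The call site derives the grid size from the new flag: 1 program when FA3 is enabled (presumably to leave SMs free for the attention kernel), 16 otherwise, with the TODO marking the split as provisional. A sketch of the surrounding pattern, assuming the offload kernel runs on a dedicated CUDA stream and the recorded event is used for cross-stream ordering (the helper name and arguments here are hypothetical):

```python
import torch


def start_offload_task(enable_fa3: bool, offload_stream: torch.cuda.Stream) -> torch.cuda.Event:
    # 1 program when FA3 is active, 16 otherwise; the commit's TODO notes that
    # a more effective allocation strategy is still wanted.
    grid_num = 1 if enable_fa3 else 16
    with torch.cuda.stream(offload_stream):
        ...  # offload_gpu_kv_to_cpu(..., grid_num=grid_num) would be launched here
    sync_event = torch.cuda.Event()
    sync_event.record(offload_stream)  # later: consumer_stream.wait_event(sync_event)
    return sync_event
```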
