Skip to content

Commit 25f51b0

Browse files
rainyfly, Jiang-Jia-Jun, and XieYunshen
authored
Fix block num in scheduler v1 for release 2.1 (#3315)
* fix bug for scheduler v0
* fix block num setting in scheduler v1 for release 2.1
* fix block num setting in scheduler v1 for release 2.1

---------

Co-authored-by: Jiang-Jia-Jun <[email protected]>
Co-authored-by: YUNSHEN XIE <[email protected]>
1 parent 9b07f85 commit 25f51b0

File tree

5 files changed

+33
-12
lines changed

5 files changed

+33
-12
lines changed

fastdeploy/cache_manager/prefix_cache_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ def __init__(
6464
self.speculative_config = config.speculative_config
6565
self.local_data_parallel_id = local_data_parallel_id
6666

67-
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
67+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
68+
self.num_gpu_blocks = self.cache_config.total_block_num
69+
else:
70+
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
6871
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
6972
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
7073
if self.num_cpu_blocks > 0:

fastdeploy/config.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,10 @@ def __init__(self, args):
726726
self.block_size = 64
727727
self.gpu_memory_utilization = 0.9
728728
self.num_gpu_blocks_override = None
729-
self.kv_cache_ratio = 0.75
729+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
730+
self.kv_cache_ratio = 1.0
731+
else:
732+
self.kv_cache_ratio = 0.75
730733
self.enc_dec_block_num = 2
731734
self.prealloc_dec_block_slot_num_threshold = 5
732735
self.cache_dtype = "bfloat16"
@@ -811,7 +814,10 @@ def postprocess(self, num_total_tokens, number_of_tasks):
811814
self.dec_token_num = self.enc_dec_block_num * self.block_size
812815
if self.num_gpu_blocks_override is not None:
813816
self.total_block_num = self.num_gpu_blocks_override
814-
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
817+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
818+
self.prefill_kvcache_block_num = self.total_block_num
819+
else:
820+
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
815821
else:
816822
length = num_total_tokens // number_of_tasks
817823
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -824,7 +830,10 @@ def reset(self, num_gpu_blocks):
824830
reset gpu block number
825831
"""
826832
self.total_block_num = num_gpu_blocks
827-
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
833+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
834+
self.prefill_kvcache_block_num = self.total_block_num
835+
else:
836+
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
828837
logger.info(
829838
f"Reset block num, the total_block_num:{self.total_block_num},"
830839
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"

fastdeploy/engine/args_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from dataclasses import asdict, dataclass
1919
from dataclasses import fields as dataclass_fields
2020
from typing import Any, Dict, List, Optional
21+
import os
2122

2223
from fastdeploy.config import (
2324
CacheConfig,
@@ -865,7 +866,10 @@ def create_engine_config(self) -> Config:
865866
if self.enable_chunked_prefill:
866867
self.max_num_batched_tokens = 2048
867868
else:
868-
self.max_num_batched_tokens = self.max_model_len
869+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
870+
self.max_num_batched_tokens = self.max_model_len
871+
else:
872+
self.max_num_batched_tokens = 8192
869873

870874
all_dict = asdict(self)
871875
all_dict["model_cfg"] = model_cfg

fastdeploy/engine/config.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,10 @@ def postprocess(self):
236236
if self.cache_config.enable_chunked_prefill:
237237
self.max_num_batched_tokens = 2048
238238
else:
239-
self.max_num_batched_tokens = self.max_model_len
239+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
240+
self.max_num_batched_tokens = self.max_model_len
241+
else:
242+
self.max_num_batched_tokens = 8192
240243

241244
if self.long_prefill_token_threshold == 0:
242245
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -284,10 +287,11 @@ def check(self):
284287
)
285288

286289
if not self.cache_config.enable_chunked_prefill:
287-
assert self.max_num_batched_tokens >= self.max_model_len, (
288-
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
289-
f"should be larger than or equal to max_model_len: {self.max_model_len}"
290-
)
290+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
291+
assert self.max_num_batched_tokens >= self.max_model_len, (
292+
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
293+
f"should be larger than or equal to max_model_len: {self.max_model_len}"
294+
)
291295
else:
292296
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
293297
f"max_num_batched_tokens: {self.max_num_batched_tokens} "

fastdeploy/output/token_processor.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -463,8 +463,9 @@ def _process_batch_output(self):
463463
if recovery_stop:
464464
llm_logger.info(f"recovery stop signal found at task {task_id}")
465465
if not recovery_stop and token_id < 0:
466-
if task_id in self.resource_manager.to_be_rescheduled_request_id_set:
467-
self.resource_manager.reschedule_preempt_task(task_id)
466+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
467+
if task_id in self.resource_manager.to_be_rescheduled_request_id_set:
468+
self.resource_manager.reschedule_preempt_task(task_id)
468469
continue
469470

470471
if task.get("prefill_chunk_info", None) is not None:

0 commit comments

Comments
 (0)