Skip to content

Commit b21272d

Browse files
authored
[Bug fix] fix block num setting in scheduler v1 for develop (#3303)
* fix block num setting in scheduler v1
* fix block num setting in scheduler v1
* fix max_block_num and max_num_batched_tokens setting
* fix max_block_num and max_num_batched_tokens setting
* fix max_block_num and max_num_batched_tokens setting
* fix max_block_num and max_num_batched_tokens setting
1 parent 183e386 commit b21272d

File tree

4 files changed

+30
-10
lines changed

4 files changed

+30
-10
lines changed

fastdeploy/cache_manager/prefix_cache_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ def __init__(
6464
self.speculative_config = config.speculative_config
6565
self.local_data_parallel_id = local_data_parallel_id
6666

67-
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
67+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
68+
self.num_gpu_blocks = self.cache_config.total_block_num
69+
else:
70+
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
6871
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
6972
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
7073
if self.num_cpu_blocks > 0:

fastdeploy/config.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,10 @@ def __init__(self, args):
731731
self.block_size = 64
732732
self.gpu_memory_utilization = 0.9
733733
self.num_gpu_blocks_override = None
734-
self.kv_cache_ratio = 0.75
734+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
735+
self.kv_cache_ratio = 1.0
736+
else:
737+
self.kv_cache_ratio = 0.75
735738
self.enc_dec_block_num = 0 if current_platform.is_iluvatar() else 2
736739
self.prealloc_dec_block_slot_num_threshold = 5
737740
self.cache_dtype = "bfloat16"
@@ -816,7 +819,10 @@ def postprocess(self, num_total_tokens, number_of_tasks):
816819
self.dec_token_num = self.enc_dec_block_num * self.block_size
817820
if self.num_gpu_blocks_override is not None:
818821
self.total_block_num = self.num_gpu_blocks_override
819-
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
822+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
823+
self.prefill_kvcache_block_num = self.total_block_num
824+
else:
825+
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
820826
else:
821827
length = num_total_tokens // number_of_tasks
822828
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -829,7 +835,10 @@ def reset(self, num_gpu_blocks):
829835
reset gpu block number
830836
"""
831837
self.total_block_num = num_gpu_blocks
832-
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
838+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
839+
self.prefill_kvcache_block_num = self.total_block_num
840+
else:
841+
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
833842
logger.info(
834843
f"Reset block num, the total_block_num:{self.total_block_num},"
835844
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"

fastdeploy/engine/args_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from dataclasses import asdict, dataclass
1919
from dataclasses import fields as dataclass_fields
2020
from typing import Any, Dict, List, Optional
21+
import os
2122

2223
from fastdeploy.config import (
2324
CacheConfig,
@@ -884,7 +885,10 @@ def create_engine_config(self) -> Config:
884885
if self.enable_chunked_prefill:
885886
self.max_num_batched_tokens = 2048
886887
else:
887-
self.max_num_batched_tokens = self.max_model_len
888+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
889+
self.max_num_batched_tokens = self.max_model_len
890+
else:
891+
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
888892

889893
all_dict = asdict(self)
890894
all_dict["model_cfg"] = model_cfg

fastdeploy/engine/config.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,10 @@ def postprocess(self):
245245
if self.cache_config.enable_chunked_prefill:
246246
self.max_num_batched_tokens = 2048
247247
else:
248-
self.max_num_batched_tokens = self.max_model_len
248+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
249+
self.max_num_batched_tokens = self.max_model_len
250+
else:
251+
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
249252

250253
if self.long_prefill_token_threshold == 0:
251254
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -293,10 +296,11 @@ def check(self):
293296
)
294297

295298
if not self.cache_config.enable_chunked_prefill:
296-
assert self.max_num_batched_tokens >= self.max_model_len, (
297-
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
298-
f"should be larger than or equal to max_model_len: {self.max_model_len}"
299-
)
299+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
300+
assert self.max_num_batched_tokens >= self.max_model_len, (
301+
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
302+
f"should be larger than or equal to max_model_len: {self.max_model_len}"
303+
)
300304
else:
301305
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
302306
f"max_num_batched_tokens: {self.max_num_batched_tokens} "

0 commit comments

Comments (0)