19 changes: 7 additions & 12 deletions tests/v1/core/test_scheduler.py
@@ -76,11 +76,11 @@ def test_get_num_unfinished_requests():
 @pytest.mark.parametrize(
     "enable_prefix_caching, prompt_logprobs",
     [
-        (None, None),
+        (False, None),
         (True, 5),
     ],
 )
-def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None):
+def test_schedule(enable_prefix_caching: bool, prompt_logprobs: int | None):
     """Test scheduling.
     Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
     """
@@ -582,12 +582,12 @@ def test_check_stop_min_tokens():
 @pytest.mark.parametrize(
     "enable_prefix_caching, prompt_logprobs",
     [
-        (None, None),
+        (False, None),
         (True, 5),
     ],
 )
 def test_schedule_concurrent_batches(
-    enable_prefix_caching: bool | None, prompt_logprobs: int | None
+    enable_prefix_caching: bool, prompt_logprobs: int | None
 ):
     scheduler = create_scheduler(
         max_num_batched_tokens=1024,
@@ -1416,7 +1416,7 @@ def create_scheduler_with_priority(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: bool | None = None,
+    enable_prefix_caching: bool = False,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: bool = False,
@@ -1435,7 +1435,7 @@ def create_scheduler_with_priority(
         max_num_batch_tokens: max num tokens to batch
         enable_prefix_caching: optionally force APC config
             (True/False) or use default
-            (None)
+            (False)

     Returns:
         {class}`Scheduler` instance with priority scheduling
@@ -1458,17 +1458,12 @@ def create_scheduler_with_priority(
         seed=42,
     )
     # Cache config, optionally force APC
-    kwargs_cache = (
-        {}
-        if enable_prefix_caching is None
-        else {"enable_prefix_caching": enable_prefix_caching}
-    )
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
         swap_space=0,
         cache_dtype="auto",
-        **kwargs_cache,
+        enable_prefix_caching=enable_prefix_caching,
     )
     kv_transfer_config = (
         KVTransferConfig(
11 changes: 3 additions & 8 deletions tests/v1/core/utils.py
@@ -42,7 +42,7 @@ def create_scheduler(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: bool | None = None,
+    enable_prefix_caching: bool = False,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: None | bool | MockKVConfig = None,
@@ -63,7 +63,7 @@ def create_scheduler(
         max_num_batch_tokens: max num tokens to batch
         enable_prefix_caching: optionally force APC config
             (True/False) or use default
-            (None)
+            (False)

     Returns:
         {class}`Scheduler` instance
@@ -87,17 +87,12 @@ def create_scheduler(
         skip_tokenizer_init=skip_tokenizer_init,
     )
     # Cache config, optionally force APC
-    kwargs_cache = (
-        {}
-        if enable_prefix_caching is None
-        else {"enable_prefix_caching": enable_prefix_caching}
-    )
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
         swap_space=0,
         cache_dtype="auto",
-        **kwargs_cache,
+        enable_prefix_caching=enable_prefix_caching,
     )
     kv_transfer_config = None
     if isinstance(use_kv_connector, MockKVConfig):
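Note on the two test-helper changes above (test_scheduler.py and utils.py): with CacheConfig.enable_prefix_caching no longer an optional tri-state, the helpers can pass the flag straight through instead of building a conditional kwargs dict. Below is a minimal sketch of the before/after pattern; DummyCacheConfig and the two helper names are hypothetical stand-ins, not vLLM's real classes or functions.

    from __future__ import annotations

    from dataclasses import dataclass


    @dataclass
    class DummyCacheConfig:
        """Stand-in for vLLM's CacheConfig; only the field relevant here."""

        enable_prefix_caching: bool = True


    # Old pattern: a tri-state flag (None = "use the default") forced the helpers
    # to build a kwargs dict and splat it into the constructor.
    def make_cache_config_old(enable_prefix_caching: bool | None = None) -> DummyCacheConfig:
        kwargs_cache = (
            {}
            if enable_prefix_caching is None
            else {"enable_prefix_caching": enable_prefix_caching}
        )
        return DummyCacheConfig(**kwargs_cache)


    # New pattern: the flag is a plain bool and is passed straight through.
    def make_cache_config_new(enable_prefix_caching: bool = False) -> DummyCacheConfig:
        return DummyCacheConfig(enable_prefix_caching=enable_prefix_caching)


    assert make_cache_config_old().enable_prefix_caching is True   # fell back to the class default
    assert make_cache_config_new().enable_prefix_caching is False  # helper default is now explicit

One behavioural nuance carried by the diff itself: the helpers' default changes from "defer to the engine default" (None) to "prefix caching off" (False), so the scheduler tests run with APC disabled unless a test opts in.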
4 changes: 2 additions & 2 deletions vllm/config/cache.py
@@ -73,8 +73,8 @@ class CacheConfig:
     sliding_window: int | None = None
     """Sliding window size for the KV cache. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
-    enable_prefix_caching: bool | None = None
-    """Whether to enable prefix caching. Enabled by default for V1."""
+    enable_prefix_caching: bool = True
+    """Whether to enable prefix caching."""
     prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
     """Set the hash algorithm for prefix caching:\n
     - "sha256" uses Pickle for object serialization before hashing.\n
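Narrowing the field from `bool | None = None` to `bool = True` moves the "enabled by default" note out of the docstring and into the default value. A rough sketch of the difference with stand-in dataclasses (hypothetical names, not the real CacheConfig):

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class OldCacheConfig:
        enable_prefix_caching: Optional[bool] = None  # "unset"; V1 code paths treated unset as enabled


    @dataclass
    class NewCacheConfig:
        enable_prefix_caching: bool = True  # enabled unless explicitly disabled


    assert OldCacheConfig().enable_prefix_caching is None
    assert NewCacheConfig().enable_prefix_caching is True
    assert NewCacheConfig(enable_prefix_caching=False).enable_prefix_caching is False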
24 changes: 19 additions & 5 deletions vllm/engine/arg_utils.py
@@ -425,7 +425,7 @@ class EngineArgs:
         ParallelConfig.max_parallel_loading_workers
     )
     block_size: BlockSize | None = CacheConfig.block_size
-    enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching
+    enable_prefix_caching: bool | None = None
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
     )
@@ -1966,10 +1966,11 @@ def _set_default_args(
         if self.prefill_context_parallel_size > 1:
             default_chunked_prefill = False
             default_prefix_caching = False
-            logger.warning(
+            logger.warning_once(
                 "--prefill-context-parallel-size > 1 is not compatible with "
                 "chunked prefill and prefix caching now. Chunked prefill "
-                "and prefix caching have been disabled by default."
+                "and prefix caching have been disabled by default.",
+                scope="local",
             )

         if self.enable_chunked_prefill is None:
@@ -1979,15 +1980,27 @@ def _set_default_args(
                 "%s chunked prefill by default",
                 "Enabling" if default_chunked_prefill else "Disabling",
             )
+        elif (
+            model_config.runner_type == "generate"
+            and not self.enable_chunked_prefill
+            and default_chunked_prefill
+        ):
+            logger.warning_once(
+                "This model does not officially support disabling chunked prefill. "
+                "Disabling this manually may cause the engine to crash "
+                "or produce incorrect outputs.",
+                scope="local",
+            )
         elif (
             model_config.runner_type == "pooling"
             and self.enable_chunked_prefill
             and not default_chunked_prefill
         ):
-            logger.warning(
+            logger.warning_once(
                 "This model does not officially support chunked prefill. "
                 "Enabling this manually may cause the engine to crash "
                 "or produce incorrect outputs.",
+                scope="local",
             )

         if self.enable_prefix_caching is None:
@@ -2002,10 +2015,11 @@ def _set_default_args(
             and self.enable_prefix_caching
             and not default_prefix_caching
         ):
-            logger.warning(
+            logger.warning_once(
                 "This model does not officially support prefix caching. "
                 "Enabling this manually may cause the engine to crash "
                 "or produce incorrect outputs.",
+                scope="local",
             )

         world_size = self.pipeline_parallel_size * self.tensor_parallel_size
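The None sentinel does not disappear; it moves up to the CLI layer. EngineArgs.enable_prefix_caching now defaults to a literal None, meaning "the user passed no flag", and _set_default_args resolves it to a model-dependent default, warning once per process (logger.warning_once(..., scope="local")) when the user forces a combination the model does not officially support. Below is a simplified, self-contained sketch of that resolution pattern; the class, method, and default policy are illustrative assumptions, not vLLM's actual API.

    from __future__ import annotations

    import warnings
    from dataclasses import dataclass


    @dataclass
    class ToyEngineArgs:
        # None = "not specified on the command line"; True/False = explicit user choice.
        enable_prefix_caching: bool | None = None

        def resolve_defaults(self, runner_type: str) -> bool:
            # Model-dependent default, e.g. generation models default to APC on,
            # pooling models to APC off (illustrative policy, not vLLM's exact rules).
            default_prefix_caching = runner_type == "generate"

            if self.enable_prefix_caching is None:
                # User said nothing: take the per-model default.
                self.enable_prefix_caching = default_prefix_caching
            elif self.enable_prefix_caching and not default_prefix_caching:
                # User forced it on where the model does not officially support it.
                warnings.warn(
                    "This model does not officially support prefix caching; "
                    "enabling it manually may cause crashes or incorrect output."
                )
            return self.enable_prefix_caching


    args = ToyEngineArgs()
    assert args.resolve_defaults("generate") is True   # unset -> model default
    args = ToyEngineArgs(enable_prefix_caching=False)
    assert args.resolve_defaults("generate") is False  # explicit choice wins

Because the resolved value is written back before CacheConfig is built, the config field itself no longer needs an "unset" state, which is what permits the `bool = True` narrowing in cache.py above.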
2 changes: 1 addition & 1 deletion vllm/v1/core/sched/scheduler.py
@@ -179,7 +179,7 @@ def __init__(
         self.kv_cache_manager = KVCacheManager(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
-            enable_caching=bool(self.cache_config.enable_prefix_caching),
+            enable_caching=self.cache_config.enable_prefix_caching,
             use_eagle=self.use_eagle,
             log_stats=self.log_stats,
             enable_kv_cache_events=self.enable_kv_cache_events,
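The dropped bool(...) wrapper was there because the old field type allowed None, and an unresolved None would have been coerced to False (silently disabling caching) rather than failing loudly. With the field now guaranteed to be a bool, the coercion is a no-op. A tiny plain-Python illustration of the behaviour it papered over (nothing vLLM-specific; variable names are made up):

    from typing import Optional

    # Old field type: Optional[bool]. An unresolved None coerces to False,
    # i.e. prefix caching would be switched off without any error.
    legacy_value: Optional[bool] = None
    assert bool(legacy_value) is False

    # New field type: bool. The value is usable as-is, so
    # enable_caching=self.cache_config.enable_prefix_caching needs no wrapping.
    current_value: bool = True
    assert current_value is True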