Skip to content

Commit e23cacd

Browse files
[Bugfix]: Clean up chunked prefill logging when using whisper (vllm-project#25075)
Signed-off-by: simondanielsson <[email protected]>
1 parent 2e1b8bc commit e23cacd

File tree

4 files changed

+75
-8
lines changed

4 files changed

+75
-8
lines changed

tests/v1/core/test_scheduler.py

Lines changed: 51 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
import dataclasses
34
from typing import Optional
45
from unittest.mock import Mock
56

@@ -1899,4 +1900,53 @@ def test_priority_scheduling_preemption_when_out_of_kv():
18991900
assert output.scheduled_cached_reqs.num_reqs == 1
19001901
assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id
19011902
assert len(scheduler.waiting) == 1
1902-
assert len(scheduler.running) == 1
1903+
assert len(scheduler.running) == 1
1904+
1905+
1906+
@pytest.mark.parametrize(
    ("enable_chunked_prefill", "is_encoder_decoder", "expect_enabled"),
    [
        (True, False, True),
        (False, False, False),
        # For encoder-decoder models chunked prefill must end up disabled,
        # whatever value was requested.
        (False, True, False),
        (True, True, False),
    ],
)
def test_chunked_prefill_disabled_for_encoder_decoder(
    enable_chunked_prefill: bool,
    is_encoder_decoder: bool,
    expect_enabled: bool,
) -> None:
    """Check that chunked prefill ends up in the expected state.

    A ``SchedulerConfig`` is built from the requested
    ``enable_chunked_prefill`` / ``is_encoder_decoder`` combination and its
    chunked prefill settings are compared against ``expect_enabled``; the
    same comparison is then repeated on the scheduler config held by a
    ``VllmConfig`` constructed from it.
    """
    config = SchedulerConfig(
        enable_chunked_prefill=enable_chunked_prefill,
        is_encoder_decoder=is_encoder_decoder,
    )

    # `is_encoder_decoder` is a construction-time input only: it must show up
    # neither as an instance attribute nor as a dataclass field.
    declared_field_names = [f.name for f in dataclasses.fields(config)]
    assert "is_encoder_decoder" not in vars(config)
    assert "is_encoder_decoder" not in declared_field_names

    _validate_chunked_prefill_settings_for_encoder_decoder(
        config, is_encoder_decoder, expect_enabled)

    # The settings must still hold after VllmConfig has run its own
    # post-init on top of the scheduler config.
    vllm_config = VllmConfig(scheduler_config=config)
    _validate_chunked_prefill_settings_for_encoder_decoder(
        vllm_config.scheduler_config, is_encoder_decoder, expect_enabled)
1938+
1939+
1940+
def _validate_chunked_prefill_settings_for_encoder_decoder(
    scheduler_config: SchedulerConfig,
    is_encoder_decoder: bool,
    expect_enabled: bool,
) -> None:
    """Assert the chunked prefill state of ``scheduler_config``.

    Args:
        scheduler_config: The scheduler config under inspection.
        is_encoder_decoder: Whether the config was built for an
            encoder-decoder model; enables the extra checks below.
        expect_enabled: The value both chunked prefill flags must hold.
    """
    # Both flags must be exactly the expected boolean.
    assert scheduler_config.chunked_prefill_enabled is expect_enabled
    assert scheduler_config.enable_chunked_prefill is expect_enabled

    if not is_encoder_decoder:
        return

    # Encoder-decoder models should automatically disable chunked multimodal
    # inputs as well, so this flag mirrors the inverse of `expect_enabled`.
    assert scheduler_config.disable_chunked_mm_input is not expect_enabled

    if not expect_enabled:
        assert scheduler_config.long_prefill_token_threshold == 0

vllm/config/scheduler.py

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

44
import hashlib
5-
from dataclasses import field
5+
from dataclasses import InitVar, field
66
from typing import Any, Literal, Union
77

88
from pydantic import SkipValidation, model_validator
@@ -84,6 +84,13 @@ class SchedulerConfig:
8484
is_multimodal_model: bool = False
8585
"""True if the model is multimodal."""
8686

87+
is_encoder_decoder: InitVar[bool] = False
88+
"""True if the model is an encoder-decoder model.
89+
90+
Note: This is stored in the ModelConfig, and is used only here to
91+
disable chunked prefill and prefix caching for encoder-decoder models.
92+
"""
93+
8794
# TODO (ywang96): Make this configurable.
8895
max_num_encoder_input_tokens: int = field(init=False)
8996
"""Multimodal encoder compute budget, only used in V1.
@@ -161,13 +168,23 @@ def compute_hash(self) -> str:
161168
usedforsecurity=False).hexdigest()
162169
return hash_str
163170

164-
def __post_init__(self) -> None:
171+
def __post_init__(self, is_encoder_decoder: bool) -> None:
165172
if self.max_model_len is None:
166173
self.max_model_len = 8192
167174

168175
if self.max_num_seqs is None:
169176
self.max_num_seqs = 128
170177

178+
if is_encoder_decoder:
179+
# Chunked prefill should be disabled for encoder-decoder models.
180+
self.disable_chunked_mm_input = True
181+
self.chunked_prefill_enabled = False
182+
self.enable_chunked_prefill = False
183+
self.long_prefill_token_threshold = 0
184+
logger.info(
185+
"Encoder-decoder models do not support chunked prefill nor"
186+
" prefix caching; disabling both.")
187+
171188
if self.max_num_batched_tokens is None:
172189
if self.enable_chunked_prefill:
173190
self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS

vllm/config/vllm.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -386,10 +386,6 @@ def __post_init__(self):
386386
"Encoder-decoder model detected: setting "
387387
"`max_num_encoder_input_tokens` to encoder length (%s)",
388388
self.scheduler_config.max_num_encoder_input_tokens)
389-
self.scheduler_config.disable_chunked_mm_input = True
390-
disable_chunked_prefill_reasons.append(
391-
"Encoder-decoder models do not support chunked prefill nor"
392-
" prefix caching; disabling both.")
393389
if (self.model_config.architecture
394390
== "WhisperForConditionalGeneration"
395391
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
@@ -400,7 +396,10 @@ def __post_init__(self):
400396
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
401397
"to 'spawn'.")
402398

403-
if disable_chunked_prefill_reasons:
399+
# Disable prefix caching only if chunked prefill is explicitly disabled
400+
# (and not merely unset)
401+
if (self.scheduler_config.chunked_prefill_enabled is False
402+
or disable_chunked_prefill_reasons):
404403
for reason in disable_chunked_prefill_reasons:
405404
logger.info(reason)
406405
self.scheduler_config.chunked_prefill_enabled = False

vllm/engine/arg_utils.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1367,6 +1367,7 @@ def create_engine_config(
13671367
enable_chunked_prefill=self.enable_chunked_prefill,
13681368
disable_chunked_mm_input=self.disable_chunked_mm_input,
13691369
is_multimodal_model=model_config.is_multimodal_model,
1370+
is_encoder_decoder=model_config.is_encoder_decoder,
13701371
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
13711372
and parallel_config.use_ray),
13721373
policy=self.scheduling_policy,

0 commit comments

Comments
 (0)