Commit 78077d5

Move SchedulerConfig from config/__init__.py to config/scheduler.py (#22626)
Authored by Harry Mellor
Signed-off-by: Harry Mellor <[email protected]>
1 parent 6d729c4 commit 78077d5
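
Before the diff: the class body itself is unchanged by this commit; it simply moves to vllm/config/scheduler.py, and vllm/config/__init__.py re-exports it. A minimal sketch of what that means for import paths, assuming the re-export shown in the diff below behaves like a normal module attribute (the alias name is purely illustrative):

# Sketch only: import paths after this commit.

# New canonical location of the moved definitions:
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy

# The old path should keep resolving via the re-export in config/__init__.py
# (the alias name below is illustrative, not a vLLM name):
from vllm.config import SchedulerConfig as SchedulerConfigViaInit

assert SchedulerConfig is SchedulerConfigViaInit

policy: SchedulerPolicy = "fcfs"  # Literal["fcfs", "priority"]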

2 files changed: +331 -314 lines changed

vllm/config/__init__.py

Lines changed: 2 additions & 314 deletions
@@ -34,6 +34,7 @@
 from vllm.config.compilation import (CompilationConfig, CompilationLevel,
                                      PassConfig)
 from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig
+from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
 from vllm.config.utils import ConfigType, config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -47,15 +48,9 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
-                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
+from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
                         LazyLoader, common_broadcastable_dtype, random_uuid)

-# yapf: enable
-
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from transformers.configuration_utils import PretrainedConfig
@@ -1820,313 +1815,6 @@ def __post_init__(self):
         self.ignore_patterns = ["original/**/*"]


-PreemptionMode = Literal["swap", "recompute"]
-SchedulerPolicy = Literal["fcfs", "priority"]
-
-
-@config
-@dataclass
-class SchedulerConfig:
-    """Scheduler configuration."""
-
-    runner_type: RunnerType = "generate"
-    """The runner type to launch for the model."""
-
-    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
-    """Maximum number of tokens to be processed in a single iteration.
-
-    This config has no static default. If left unspecified by the user, it will
-    be set in `EngineArgs.create_engine_config` based on the usage context."""
-
-    max_num_seqs: SkipValidation[int] = None  # type: ignore
-    """Maximum number of sequences to be processed in a single iteration.
-
-    This config has no static default. If left unspecified by the user, it will
-    be set in `EngineArgs.create_engine_config` based on the usage context."""
-
-    max_model_len: SkipValidation[int] = None  # type: ignore
-    """Maximum length of a sequence (including prompt and generated text). This
-    is primarily set in `ModelConfig` and that value should be manually
-    duplicated here."""
-
-    max_num_partial_prefills: int = 1
-    """For chunked prefill, the maximum number of sequences that can be
-    partially prefilled concurrently."""
-
-    max_long_partial_prefills: int = 1
-    """For chunked prefill, the maximum number of prompts longer than
-    long_prefill_token_threshold that will be prefilled concurrently. Setting
-    this less than max_num_partial_prefills will allow shorter prompts to jump
-    the queue in front of longer prompts in some cases, improving latency."""
-
-    long_prefill_token_threshold: int = 0
-    """For chunked prefill, a request is considered long if the prompt is
-    longer than this number of tokens."""
-
-    num_lookahead_slots: int = 0
-    """The number of slots to allocate per sequence per
-    step, beyond the known token ids. This is used in speculative
-    decoding to store KV activations of tokens which may or may not be
-    accepted.
-
-    NOTE: This will be replaced by speculative config in the future; it is
-    present to enable correctness tests until then."""
-
-    cuda_graph_sizes: list[int] = field(default_factory=list)
-    """Cuda graph capture sizes
-    1. if none provided, then default set to [min(max_num_seqs * 2, 512)]
-    2. if one value is provided, then the capture list would follow the
-    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    3. more than one value (e.g. 1 2 128) is provided, then the capture list
-    will follow the provided list."""
-
-    delay_factor: float = 0.0
-    """Apply a delay (of delay factor multiplied by previous
-    prompt latency) before scheduling next prompt."""
-
-    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
-    """If True, prefill requests can be chunked based
-    on the remaining max_num_batched_tokens."""
-
-    is_multimodal_model: bool = False
-    """True if the model is multimodal."""
-
-    # TODO (ywang96): Make this configurable.
-    max_num_encoder_input_tokens: int = field(init=False)
-    """Multimodal encoder compute budget, only used in V1.
-
-    NOTE: This is not currently configurable. It will be overridden by
-    max_num_batched_tokens in case max multimodal embedding size is larger."""
-
-    # TODO (ywang96): Make this configurable.
-    encoder_cache_size: int = field(init=False)
-    """Multimodal encoder cache size, only used in V1.
-
-    NOTE: This is not currently configurable. It will be overridden by
-    max_num_batched_tokens in case max multimodal embedding size is larger."""
-
-    preemption_mode: Optional[PreemptionMode] = None
-    """Whether to perform preemption by swapping or
-    recomputation. If not specified, we determine the mode as follows:
-    We use recomputation by default since it incurs lower overhead than
-    swapping. However, when the sequence group has multiple sequences
-    (e.g., beam search), recomputation is not currently supported. In
-    such a case, we use swapping instead."""
-
-    num_scheduler_steps: int = 1
-    """Maximum number of forward steps per scheduler call."""
-
-    multi_step_stream_outputs: bool = True
-    """If False, then multi-step will stream outputs at the end of all steps"""
-
-    send_delta_data: bool = False
-    """Private API. If used, scheduler sends delta data to
-    workers instead of an entire data. It should be enabled only
-    when SPMD worker architecture is enabled. I.e.,
-    VLLM_USE_RAY_SPMD_WORKER=1"""
-
-    policy: SchedulerPolicy = "fcfs"
-    """The scheduling policy to use:\n
-    - "fcfs" means first come first served, i.e. requests are handled in order
-    of arrival.\n
-    - "priority" means requests are handled based on given priority (lower
-    value means earlier handling) and time of arrival deciding any ties)."""
-
-    chunked_prefill_enabled: bool = field(init=False)
-    """True if chunked prefill is enabled."""
-
-    disable_chunked_mm_input: bool = False
-    """If set to true and chunked prefill is enabled, we do not want to
-    partially schedule a multimodal item. Only used in V1
-    This ensures that if a request has a mixed prompt
-    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
-    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
-    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
-
-    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
-    # or "mod.custom_class".
-    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
-    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
-    default scheduler. Can be a class directly or the path to a class of form
-    "mod.custom_class"."""
-
-    disable_hybrid_kv_cache_manager: bool = False
-    """If set to True, KV cache manager will allocate the same size of KV cache
-    for all attention layers even if there are multiple type of attention layers
-    like full attention and sliding window attention.
-    """
-
-    async_scheduling: bool = False
-    """EXPERIMENTAL: If set to True, perform async scheduling. This may help
-    reduce the CPU overheads, leading to better latency and throughput. However,
-    async scheduling is currently not supported with some features such as
-    structured outputs, speculative decoding, and pipeline parallelism.
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self) -> None:
-        if self.max_model_len is None:
-            self.max_model_len = 8192
-
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 128
-
-        if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                if self.num_scheduler_steps > 1:
-                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
-                    # for now. Have max_num_batched_tokens set to max_model_len
-                    # so we don't reject sequences on account of a short
-                    # max_num_batched_tokens.
-                    self.max_num_batched_tokens = max(
-                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-                else:
-                    self.max_num_batched_tokens = (
-                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
-            else:
-                # If max_model_len is too short, use
-                # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
-                # for higher throughput.
-                self.max_num_batched_tokens = max(
-                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-
-            if self.runner_type == "pooling":
-                # Choose specific value for higher throughput
-                self.max_num_batched_tokens = max(
-                    self.max_num_batched_tokens,
-                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
-            if self.is_multimodal_model:
-                # The value needs to be at least the number of multimodal tokens
-                self.max_num_batched_tokens = max(
-                    self.max_num_batched_tokens,
-                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
-
-            # When using default settings,
-            # Ensure max_num_batched_tokens does not exceed model limit.
-            # Some models (e.g., Whisper) have embeddings tied to max length.
-            self.max_num_batched_tokens = min(
-                self.max_num_seqs * self.max_model_len,
-                self.max_num_batched_tokens)
-
-        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
-        self.encoder_cache_size = self.max_num_batched_tokens
-
-        if self.enable_chunked_prefill:
-            logger.info(
-                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
-                self.max_num_batched_tokens)
-
-        self.chunked_prefill_enabled = self.enable_chunked_prefill
-        if self.max_num_partial_prefills > 1:
-            if self.long_prefill_token_threshold == 0:
-                self.long_prefill_token_threshold = int(self.max_model_len *
-                                                        0.04)
-
-            logger.info(
-                "Concurrent partial prefills enabled with "
-                "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
-                "long_prefill_token_threshold=%d",
-                self.max_num_partial_prefills, self.max_long_partial_prefills,
-                self.long_prefill_token_threshold)
-
-        # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
-        # This avoids OOM in tight memory scenarios with small max_num_seqs,
-        # and prevents capture of many large graphs (>512) that would greatly
-        # increase startup time with limited performance benefit.
-        if not self.cuda_graph_sizes:
-            self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
-
-        if self.async_scheduling:
-            self.scheduler_cls = (
-                "vllm.v1.core.sched.async_scheduler.AsyncScheduler")
-
-    @model_validator(mode='after')
-    def _verify_args(self) -> Self:
-        if (self.max_num_batched_tokens < self.max_model_len
-                and not self.chunked_prefill_enabled):
-            raise ValueError(
-                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
-                f"smaller than max_model_len ({self.max_model_len}). "
-                "This effectively limits the maximum sequence length to "
-                "max_num_batched_tokens and makes vLLM reject longer "
-                "sequences. Please increase max_num_batched_tokens or "
-                "decrease max_model_len.")
-
-        if self.max_num_batched_tokens < self.max_num_seqs:
-            raise ValueError(
-                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
-                "be greater than or equal to max_num_seqs "
-                f"({self.max_num_seqs}).")
-
-        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
-            logger.warning(
-                "max_num_batched_tokens (%d) exceeds max_num_seqs "
-                "* max_model_len (%d). This may lead to unexpected behavior.",
-                self.max_num_batched_tokens,
-                self.max_num_seqs * self.max_model_len)
-
-        if self.num_lookahead_slots < 0:
-            raise ValueError(
-                "num_lookahead_slots "
-                f"({self.num_lookahead_slots}) must be greater than or "
-                "equal to 0.")
-
-        if self.num_scheduler_steps < 1:
-            raise ValueError(
-                "num_scheduler_steps "
-                f"({self.num_scheduler_steps}) must be greater than or "
-                "equal to 1.")
-
-        if self.max_num_partial_prefills < 1:
-            raise ValueError(
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
-                "must be greater than or equal to 1.")
-        elif self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
-                raise ValueError("Chunked prefill must be enabled to set "
-                                 "max_num_partial_prefills > 1.")
-
-            if self.long_prefill_token_threshold > self.max_model_len:
-                raise ValueError(
-                    "long_prefill_token_threshold "
-                    f"({self.long_prefill_token_threshold}) cannot be greater "
-                    f"than the max_model_len ({self.max_model_len}).")
-
-        if (self.max_long_partial_prefills
-                < 1) or (self.max_long_partial_prefills
-                         > self.max_num_partial_prefills):
-            raise ValueError(
-                f"max_long_partial_prefills ({self.max_long_partial_prefills}) "
-                "must be greater than or equal to 1 and less than or equal to "
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
-
-        return self
-
-    @property
-    def is_multi_step(self) -> bool:
-        return self.num_scheduler_steps > 1
-
 Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]

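For readers skimming the block that moved, the heart of it is the default-resolution logic in __post_init__. Below is a standalone sketch that mirrors those rules for the single-step case, leaving out the pooling and multimodal bumps; resolve_scheduler_defaults is an illustrative helper, not a vLLM API, and the constant value is assumed for the example rather than imported from vllm.utils.

# Standalone sketch mirroring the __post_init__ defaults shown in the diff.
# Assumption: DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 is used here purely for
# illustration; the real value lives in vllm.utils.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048


def resolve_scheduler_defaults(max_model_len=None, max_num_seqs=None,
                               max_num_batched_tokens=None,
                               enable_chunked_prefill=False,
                               cuda_graph_sizes=None):
    """Illustrative helper, not part of vLLM."""
    max_model_len = 8192 if max_model_len is None else max_model_len
    max_num_seqs = 128 if max_num_seqs is None else max_num_seqs

    if max_num_batched_tokens is None:
        if enable_chunked_prefill:
            # Chunked prefill: a fixed per-step token budget is enough,
            # since long prompts are split across steps.
            max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
        else:
            # No chunking: the budget must cover a whole prompt, so take
            # the larger of the model length and the default.
            max_num_batched_tokens = max(max_model_len,
                                         DEFAULT_MAX_NUM_BATCHED_TOKENS)
        # Never exceed what max_num_seqs full-length sequences could use
        # (some models, e.g. Whisper, tie embeddings to max length).
        max_num_batched_tokens = min(max_num_seqs * max_model_len,
                                     max_num_batched_tokens)

    # Default CUDA graph capture sizes: one bucket of min(max_num_seqs * 2, 512),
    # which avoids capturing many large graphs at startup.
    if not cuda_graph_sizes:
        cuda_graph_sizes = [min(max_num_seqs * 2, 512)]

    return max_num_batched_tokens, cuda_graph_sizes


# With no arguments this mirrors the documented defaults in the sketch:
# max_num_batched_tokens = 8192 (max of 8192 and 2048), cuda_graph_sizes = [256].
print(resolve_scheduler_defaults())
# With chunked prefill enabled, the budget drops to the fixed default (2048).
print(resolve_scheduler_defaults(enable_chunked_prefill=True))

In the real class these defaults, together with the _verify_args checks shown above (for example, max_num_batched_tokens must be at least max_num_seqs), now live in vllm/config/scheduler.py.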