
Commit 5a13314

hmellor authored and paulpak58 committed
Move SchedulerConfig from config/__init__.py to config/scheduler.py (vllm-project#22626)
Signed-off-by: Harry Mellor <[email protected]>
Signed-off-by: Paul Pak <[email protected]>
1 parent 51c51a4 commit 5a13314
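
Note: `vllm/config/__init__.py` now re-exports the moved symbols (see the added `from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy` line in the diff below), so the old package-level import path should keep working alongside the new module path. A minimal sketch of the expected behavior after this commit, assuming the re-export shown in the diff is the compatibility mechanism:

# Both import paths should resolve to the same class after this commit,
# since the package __init__ re-exports from the new scheduler module.
from vllm.config import SchedulerConfig as from_package
from vllm.config.scheduler import SchedulerConfig as from_module

assert from_package is from_module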

File tree: 2 files changed (+331, -314 lines)

2 files changed

+331
-314
lines changed

vllm/config/__init__.py

Lines changed: 2 additions & 314 deletions
@@ -34,6 +34,7 @@
 from vllm.config.compilation import (CompilationConfig, CompilationLevel,
                                      PassConfig)
 from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig
+from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
 from vllm.config.utils import ConfigType, config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -47,15 +48,9 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
-                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
+from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
                         LazyLoader, common_broadcastable_dtype, random_uuid)

-# yapf: enable
-
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from transformers.configuration_utils import PretrainedConfig
@@ -1833,313 +1828,6 @@ def __post_init__(self):
             self.ignore_patterns = ["original/**/*"]


-PreemptionMode = Literal["swap", "recompute"]
-SchedulerPolicy = Literal["fcfs", "priority"]
-
-
-@config
-@dataclass
-class SchedulerConfig:
-    """Scheduler configuration."""
-
-    runner_type: RunnerType = "generate"
-    """The runner type to launch for the model."""
-
-    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
-    """Maximum number of tokens to be processed in a single iteration.
-
-    This config has no static default. If left unspecified by the user, it will
-    be set in `EngineArgs.create_engine_config` based on the usage context."""
-
-    max_num_seqs: SkipValidation[int] = None  # type: ignore
-    """Maximum number of sequences to be processed in a single iteration.
-
-    This config has no static default. If left unspecified by the user, it will
-    be set in `EngineArgs.create_engine_config` based on the usage context."""
-
-    max_model_len: SkipValidation[int] = None  # type: ignore
-    """Maximum length of a sequence (including prompt and generated text). This
-    is primarily set in `ModelConfig` and that value should be manually
-    duplicated here."""
-
-    max_num_partial_prefills: int = 1
-    """For chunked prefill, the maximum number of sequences that can be
-    partially prefilled concurrently."""
-
-    max_long_partial_prefills: int = 1
-    """For chunked prefill, the maximum number of prompts longer than
-    long_prefill_token_threshold that will be prefilled concurrently. Setting
-    this less than max_num_partial_prefills will allow shorter prompts to jump
-    the queue in front of longer prompts in some cases, improving latency."""
-
-    long_prefill_token_threshold: int = 0
-    """For chunked prefill, a request is considered long if the prompt is
-    longer than this number of tokens."""
-
-    num_lookahead_slots: int = 0
-    """The number of slots to allocate per sequence per
-    step, beyond the known token ids. This is used in speculative
-    decoding to store KV activations of tokens which may or may not be
-    accepted.
-
-    NOTE: This will be replaced by speculative config in the future; it is
-    present to enable correctness tests until then."""
-
-    cuda_graph_sizes: list[int] = field(default_factory=list)
-    """Cuda graph capture sizes
-    1. if none provided, then default set to [min(max_num_seqs * 2, 512)]
-    2. if one value is provided, then the capture list would follow the
-    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    3. more than one value (e.g. 1 2 128) is provided, then the capture list
-    will follow the provided list."""
-
-    delay_factor: float = 0.0
-    """Apply a delay (of delay factor multiplied by previous
-    prompt latency) before scheduling next prompt."""
-
-    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
-    """If True, prefill requests can be chunked based
-    on the remaining max_num_batched_tokens."""
-
-    is_multimodal_model: bool = False
-    """True if the model is multimodal."""
-
-    # TODO (ywang96): Make this configurable.
-    max_num_encoder_input_tokens: int = field(init=False)
-    """Multimodal encoder compute budget, only used in V1.
-
-    NOTE: This is not currently configurable. It will be overridden by
-    max_num_batched_tokens in case max multimodal embedding size is larger."""
-
-    # TODO (ywang96): Make this configurable.
-    encoder_cache_size: int = field(init=False)
-    """Multimodal encoder cache size, only used in V1.
-
-    NOTE: This is not currently configurable. It will be overridden by
-    max_num_batched_tokens in case max multimodal embedding size is larger."""
-
-    preemption_mode: Optional[PreemptionMode] = None
-    """Whether to perform preemption by swapping or
-    recomputation. If not specified, we determine the mode as follows:
-    We use recomputation by default since it incurs lower overhead than
-    swapping. However, when the sequence group has multiple sequences
-    (e.g., beam search), recomputation is not currently supported. In
-    such a case, we use swapping instead."""
-
-    num_scheduler_steps: int = 1
-    """Maximum number of forward steps per scheduler call."""
-
-    multi_step_stream_outputs: bool = True
-    """If False, then multi-step will stream outputs at the end of all steps"""
-
-    send_delta_data: bool = False
-    """Private API. If used, scheduler sends delta data to
-    workers instead of an entire data. It should be enabled only
-    when SPMD worker architecture is enabled. I.e.,
-    VLLM_USE_RAY_SPMD_WORKER=1"""
-
-    policy: SchedulerPolicy = "fcfs"
-    """The scheduling policy to use:\n
-    - "fcfs" means first come first served, i.e. requests are handled in order
-    of arrival.\n
-    - "priority" means requests are handled based on given priority (lower
-    value means earlier handling) and time of arrival deciding any ties)."""
-
-    chunked_prefill_enabled: bool = field(init=False)
-    """True if chunked prefill is enabled."""
-
-    disable_chunked_mm_input: bool = False
-    """If set to true and chunked prefill is enabled, we do not want to
-    partially schedule a multimodal item. Only used in V1
-    This ensures that if a request has a mixed prompt
-    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
-    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
-    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
-
-    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
-    # or "mod.custom_class".
-    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
-    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
-    default scheduler. Can be a class directly or the path to a class of form
-    "mod.custom_class"."""
-
-    disable_hybrid_kv_cache_manager: bool = False
-    """If set to True, KV cache manager will allocate the same size of KV cache
-    for all attention layers even if there are multiple type of attention layers
-    like full attention and sliding window attention.
-    """
-
-    async_scheduling: bool = False
-    """EXPERIMENTAL: If set to True, perform async scheduling. This may help
-    reduce the CPU overheads, leading to better latency and throughput. However,
-    async scheduling is currently not supported with some features such as
-    structured outputs, speculative decoding, and pipeline parallelism.
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self) -> None:
-        if self.max_model_len is None:
-            self.max_model_len = 8192
-
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 128
-
-        if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                if self.num_scheduler_steps > 1:
-                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
-                    # for now. Have max_num_batched_tokens set to max_model_len
-                    # so we don't reject sequences on account of a short
-                    # max_num_batched_tokens.
-                    self.max_num_batched_tokens = max(
-                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-                else:
-                    self.max_num_batched_tokens = (
-                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
-            else:
-                # If max_model_len is too short, use
-                # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
-                # for higher throughput.
-                self.max_num_batched_tokens = max(
-                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-
-            if self.runner_type == "pooling":
-                # Choose specific value for higher throughput
-                self.max_num_batched_tokens = max(
-                    self.max_num_batched_tokens,
-                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
-            if self.is_multimodal_model:
-                # The value needs to be at least the number of multimodal tokens
-                self.max_num_batched_tokens = max(
-                    self.max_num_batched_tokens,
-                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
-
-            # When using default settings,
-            # Ensure max_num_batched_tokens does not exceed model limit.
-            # Some models (e.g., Whisper) have embeddings tied to max length.
-            self.max_num_batched_tokens = min(
-                self.max_num_seqs * self.max_model_len,
-                self.max_num_batched_tokens)
-
-        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
-        self.encoder_cache_size = self.max_num_batched_tokens
-
-        if self.enable_chunked_prefill:
-            logger.info(
-                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
-                self.max_num_batched_tokens)
-
-        self.chunked_prefill_enabled = self.enable_chunked_prefill
-        if self.max_num_partial_prefills > 1:
-            if self.long_prefill_token_threshold == 0:
-                self.long_prefill_token_threshold = int(self.max_model_len *
-                                                        0.04)
-
-            logger.info(
-                "Concurrent partial prefills enabled with "
-                "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
-                "long_prefill_token_threshold=%d",
-                self.max_num_partial_prefills, self.max_long_partial_prefills,
-                self.long_prefill_token_threshold)
-
-        # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
-        # This avoids OOM in tight memory scenarios with small max_num_seqs,
-        # and prevents capture of many large graphs (>512) that would greatly
-        # increase startup time with limited performance benefit.
-        if not self.cuda_graph_sizes:
-            self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
-
-        if self.async_scheduling:
-            self.scheduler_cls = (
-                "vllm.v1.core.sched.async_scheduler.AsyncScheduler")
-
-    @model_validator(mode='after')
-    def _verify_args(self) -> Self:
-        if (self.max_num_batched_tokens < self.max_model_len
-                and not self.chunked_prefill_enabled):
-            raise ValueError(
-                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
-                f"smaller than max_model_len ({self.max_model_len}). "
-                "This effectively limits the maximum sequence length to "
-                "max_num_batched_tokens and makes vLLM reject longer "
-                "sequences. Please increase max_num_batched_tokens or "
-                "decrease max_model_len.")
-
-        if self.max_num_batched_tokens < self.max_num_seqs:
-            raise ValueError(
-                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
-                "be greater than or equal to max_num_seqs "
-                f"({self.max_num_seqs}).")
-
-        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
-            logger.warning(
-                "max_num_batched_tokens (%d) exceeds max_num_seqs "
-                "* max_model_len (%d). This may lead to unexpected behavior.",
-                self.max_num_batched_tokens,
-                self.max_num_seqs * self.max_model_len)
-
-        if self.num_lookahead_slots < 0:
-            raise ValueError(
-                "num_lookahead_slots "
-                f"({self.num_lookahead_slots}) must be greater than or "
-                "equal to 0.")
-
-        if self.num_scheduler_steps < 1:
-            raise ValueError(
-                "num_scheduler_steps "
-                f"({self.num_scheduler_steps}) must be greater than or "
-                "equal to 1.")
-
-        if self.max_num_partial_prefills < 1:
-            raise ValueError(
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
-                "must be greater than or equal to 1.")
-        elif self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
-                raise ValueError("Chunked prefill must be enabled to set "
-                                 "max_num_partial_prefills > 1.")
-
-            if self.long_prefill_token_threshold > self.max_model_len:
-                raise ValueError(
-                    "long_prefill_token_threshold "
-                    f"({self.long_prefill_token_threshold}) cannot be greater "
-                    f"than the max_model_len ({self.max_model_len}).")
-
-        if (self.max_long_partial_prefills
-                < 1) or (self.max_long_partial_prefills
-                         > self.max_num_partial_prefills):
-            raise ValueError(
-                f"max_long_partial_prefills ({self.max_long_partial_prefills}) "
-                "must be greater than or equal to 1 and less than or equal to "
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
-
-        return self
-
-    @property
-    def is_multi_step(self) -> bool:
-        return self.num_scheduler_steps > 1
-
-
 Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]


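
The deleted `__post_init__` above (relocated, not rewritten, by this commit) is where unset scheduler fields get their effective defaults. The standalone sketch below mirrors only the single-step, non-pooling, non-multimodal defaulting rules visible in the diff; the helper name and the value used for `DEFAULT_MAX_NUM_BATCHED_TOKENS` are illustrative assumptions, not part of vLLM.

# Illustrative stand-in for the constant imported from vllm.utils in the diff;
# the actual value is not shown in this commit.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048


def resolve_scheduler_defaults(max_model_len=None,
                               max_num_seqs=None,
                               max_num_batched_tokens=None,
                               enable_chunked_prefill=False,
                               cuda_graph_sizes=None):
    """Hypothetical helper mirroring the defaulting logic of the relocated
    SchedulerConfig.__post_init__ (single-step, text-only case only)."""
    if max_model_len is None:
        max_model_len = 8192
    if max_num_seqs is None:
        max_num_seqs = 128
    if max_num_batched_tokens is None:
        if enable_chunked_prefill:
            # Chunked prefill: cap each step at the default token budget.
            max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
        else:
            # No chunking: a single step must be able to hold a full prompt.
            max_num_batched_tokens = max(max_model_len,
                                         DEFAULT_MAX_NUM_BATCHED_TOKENS)
        # Never exceed what max_num_seqs full-length sequences could use.
        max_num_batched_tokens = min(max_num_seqs * max_model_len,
                                     max_num_batched_tokens)
    if not cuda_graph_sizes:
        # Default capture list: one bucket sized min(max_num_seqs * 2, 512).
        cuda_graph_sizes = [min(max_num_seqs * 2, 512)]
    return (max_model_len, max_num_seqs, max_num_batched_tokens,
            cuda_graph_sizes)


# Example: with everything unset, the sketch yields the documented fallbacks.
print(resolve_scheduler_defaults())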