34 | 34 | from vllm.config.compilation import (CompilationConfig, CompilationLevel,
35 | 35 |                                      PassConfig)
36 | 36 | from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig
   | 37 | +from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
37 | 38 | from vllm.config.utils import ConfigType, config
38 | 39 | from vllm.logger import init_logger
39 | 40 | from vllm.model_executor.layers.quantization import QuantizationMethods

47 | 48 |     try_get_tokenizer_config, uses_mrope)
48 | 49 | from vllm.transformers_utils.s3_utils import S3Model
49 | 50 | from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
50 |    | -# yapf conflicts with isort for this block
51 |    | -# yapf: disable
52 |    | -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
53 |    | -                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
54 |    | -                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
   | 51 | +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
55 | 52 |                          LazyLoader, common_broadcastable_dtype, random_uuid)
56 | 53 |
57 |    | -# yapf: enable
58 |    | -
59 | 54 | if TYPE_CHECKING:
60 | 55 |     from _typeshed import DataclassInstance
61 | 56 |     from transformers.configuration_utils import PretrainedConfig
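
The import hunk above is the user-visible part of this move: `SchedulerConfig` and `SchedulerPolicy` now come from the dedicated `vllm.config.scheduler` module, and the `vllm.utils` import sheds the pooling/multimodal token-budget constants that only the removed scheduler code below referenced. A minimal sketch of the import surface after this change (assuming the relocated definitions are otherwise unchanged, as the diff suggests):

    # Hypothetical downstream usage after this refactor.
    from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy

    # SchedulerPolicy stays a Literal["fcfs", "priority"], so plain strings type-check.
    policy: SchedulerPolicy = "priority"
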
@@ -1820,313 +1815,6 @@ def __post_init__(self):
1820 | 1815 |             self.ignore_patterns = ["original/**/*"]
1821 | 1816 |
1822 | 1817 |
1823 |      | -PreemptionMode = Literal["swap", "recompute"]
1824 |      | -SchedulerPolicy = Literal["fcfs", "priority"]
1825 |      | -
1826 |      | -
1827 |      | -@config
1828 |      | -@dataclass
1829 |      | -class SchedulerConfig:
1830 |      | -    """Scheduler configuration."""
1831 |      | -
1832 |      | -    runner_type: RunnerType = "generate"
1833 |      | -    """The runner type to launch for the model."""
1834 |      | -
1835 |      | -    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
1836 |      | -    """Maximum number of tokens to be processed in a single iteration.
1837 |      | -
1838 |      | -    This config has no static default. If left unspecified by the user, it will
1839 |      | -    be set in `EngineArgs.create_engine_config` based on the usage context."""
1840 |      | -
1841 |      | -    max_num_seqs: SkipValidation[int] = None  # type: ignore
1842 |      | -    """Maximum number of sequences to be processed in a single iteration.
1843 |      | -
1844 |      | -    This config has no static default. If left unspecified by the user, it will
1845 |      | -    be set in `EngineArgs.create_engine_config` based on the usage context."""
1846 |      | -
1847 |      | -    max_model_len: SkipValidation[int] = None  # type: ignore
1848 |      | -    """Maximum length of a sequence (including prompt and generated text). This
1849 |      | -    is primarily set in `ModelConfig` and that value should be manually
1850 |      | -    duplicated here."""
1851 |      | -
1852 |      | -    max_num_partial_prefills: int = 1
1853 |      | -    """For chunked prefill, the maximum number of sequences that can be
1854 |      | -    partially prefilled concurrently."""
1855 |      | -
1856 |      | -    max_long_partial_prefills: int = 1
1857 |      | -    """For chunked prefill, the maximum number of prompts longer than
1858 |      | -    long_prefill_token_threshold that will be prefilled concurrently. Setting
1859 |      | -    this less than max_num_partial_prefills will allow shorter prompts to jump
1860 |      | -    the queue in front of longer prompts in some cases, improving latency."""
1861 |      | -
1862 |      | -    long_prefill_token_threshold: int = 0
1863 |      | -    """For chunked prefill, a request is considered long if the prompt is
1864 |      | -    longer than this number of tokens."""
1865 |      | -
1866 |      | -    num_lookahead_slots: int = 0
1867 |      | -    """The number of slots to allocate per sequence per
1868 |      | -    step, beyond the known token ids. This is used in speculative
1869 |      | -    decoding to store KV activations of tokens which may or may not be
1870 |      | -    accepted.
1871 |      | -
1872 |      | -    NOTE: This will be replaced by speculative config in the future; it is
1873 |      | -    present to enable correctness tests until then."""
1874 |      | -
1875 |      | -    cuda_graph_sizes: list[int] = field(default_factory=list)
1876 |      | -    """Cuda graph capture sizes
1877 |      | -    1. if none provided, then default set to [min(max_num_seqs * 2, 512)]
1878 |      | -    2. if one value is provided, then the capture list would follow the
1879 |      | -    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
1880 |      | -    3. more than one value (e.g. 1 2 128) is provided, then the capture list
1881 |      | -    will follow the provided list."""
1882 |      | -
1883 |      | -    delay_factor: float = 0.0
1884 |      | -    """Apply a delay (of delay factor multiplied by previous
1885 |      | -    prompt latency) before scheduling next prompt."""
1886 |      | -
1887 |      | -    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
1888 |      | -    """If True, prefill requests can be chunked based
1889 |      | -    on the remaining max_num_batched_tokens."""
1890 |      | -
1891 |      | -    is_multimodal_model: bool = False
1892 |      | -    """True if the model is multimodal."""
1893 |      | -
1894 |      | -    # TODO (ywang96): Make this configurable.
1895 |      | -    max_num_encoder_input_tokens: int = field(init=False)
1896 |      | -    """Multimodal encoder compute budget, only used in V1.
1897 |      | -
1898 |      | -    NOTE: This is not currently configurable. It will be overridden by
1899 |      | -    max_num_batched_tokens in case max multimodal embedding size is larger."""
1900 |      | -
1901 |      | -    # TODO (ywang96): Make this configurable.
1902 |      | -    encoder_cache_size: int = field(init=False)
1903 |      | -    """Multimodal encoder cache size, only used in V1.
1904 |      | -
1905 |      | -    NOTE: This is not currently configurable. It will be overridden by
1906 |      | -    max_num_batched_tokens in case max multimodal embedding size is larger."""
1907 |      | -
1908 |      | -    preemption_mode: Optional[PreemptionMode] = None
1909 |      | -    """Whether to perform preemption by swapping or
1910 |      | -    recomputation. If not specified, we determine the mode as follows:
1911 |      | -    We use recomputation by default since it incurs lower overhead than
1912 |      | -    swapping. However, when the sequence group has multiple sequences
1913 |      | -    (e.g., beam search), recomputation is not currently supported. In
1914 |      | -    such a case, we use swapping instead."""
1915 |      | -
1916 |      | -    num_scheduler_steps: int = 1
1917 |      | -    """Maximum number of forward steps per scheduler call."""
1918 |      | -
1919 |      | -    multi_step_stream_outputs: bool = True
1920 |      | -    """If False, then multi-step will stream outputs at the end of all steps"""
1921 |      | -
1922 |      | -    send_delta_data: bool = False
1923 |      | -    """Private API. If used, scheduler sends delta data to
1924 |      | -    workers instead of an entire data. It should be enabled only
1925 |      | -    when SPMD worker architecture is enabled. I.e.,
1926 |      | -    VLLM_USE_RAY_SPMD_WORKER=1"""
1927 |      | -
1928 |      | -    policy: SchedulerPolicy = "fcfs"
1929 |      | -    """The scheduling policy to use:\n
1930 |      | -    - "fcfs" means first come first served, i.e. requests are handled in order
1931 |      | -    of arrival.\n
1932 |      | -    - "priority" means requests are handled based on given priority (lower
1933 |      | -    value means earlier handling) and time of arrival deciding any ties)."""
1934 |      | -
1935 |      | -    chunked_prefill_enabled: bool = field(init=False)
1936 |      | -    """True if chunked prefill is enabled."""
1937 |      | -
1938 |      | -    disable_chunked_mm_input: bool = False
1939 |      | -    """If set to true and chunked prefill is enabled, we do not want to
1940 |      | -    partially schedule a multimodal item. Only used in V1
1941 |      | -    This ensures that if a request has a mixed prompt
1942 |      | -    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
1943 |      | -    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
1944 |      | -    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
1945 |      | -
1946 |      | -    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
1947 |      | -    # or "mod.custom_class".
1948 |      | -    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
1949 |      | -    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
1950 |      | -    default scheduler. Can be a class directly or the path to a class of form
1951 |      | -    "mod.custom_class"."""
1952 |      | -
1953 |      | -    disable_hybrid_kv_cache_manager: bool = False
1954 |      | -    """If set to True, KV cache manager will allocate the same size of KV cache
1955 |      | -    for all attention layers even if there are multiple type of attention layers
1956 |      | -    like full attention and sliding window attention.
1957 |      | -    """
1958 |      | -
1959 |      | -    async_scheduling: bool = False
1960 |      | -    """EXPERIMENTAL: If set to True, perform async scheduling. This may help
1961 |      | -    reduce the CPU overheads, leading to better latency and throughput. However,
1962 |      | -    async scheduling is currently not supported with some features such as
1963 |      | -    structured outputs, speculative decoding, and pipeline parallelism.
1964 |      | -    """
1965 |      | -
1966 |      | -    def compute_hash(self) -> str:
1967 |      | -        """
1968 |      | -        WARNING: Whenever a new field is added to this config,
1969 |      | -        ensure that it is included in the factors list if
1970 |      | -        it affects the computation graph.
1971 |      | -
1972 |      | -        Provide a hash that uniquely identifies all the configs
1973 |      | -        that affect the structure of the computation
1974 |      | -        graph from input ids/embeddings to the final hidden states,
1975 |      | -        excluding anything before input ids/embeddings and after
1976 |      | -        the final hidden states.
1977 |      | -        """
1978 |      | -        # no factors to consider.
1979 |      | -        # this config will not affect the computation graph.
1980 |      | -        factors: list[Any] = []
1981 |      | -        hash_str = hashlib.md5(str(factors).encode(),
1982 |      | -                               usedforsecurity=False).hexdigest()
1983 |      | -        return hash_str
1984 |      | -
1985 |      | -    def __post_init__(self) -> None:
1986 |      | -        if self.max_model_len is None:
1987 |      | -            self.max_model_len = 8192
1988 |      | -
1989 |      | -        if self.max_num_seqs is None:
1990 |      | -            self.max_num_seqs = 128
1991 |      | -
1992 |      | -        if self.max_num_batched_tokens is None:
1993 |      | -            if self.enable_chunked_prefill:
1994 |      | -                if self.num_scheduler_steps > 1:
1995 |      | -                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
1996 |      | -                    # for now. Have max_num_batched_tokens set to max_model_len
1997 |      | -                    # so we don't reject sequences on account of a short
1998 |      | -                    # max_num_batched_tokens.
1999 |      | -                    self.max_num_batched_tokens = max(
2000 |      | -                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
2001 |      | -                else:
2002 |      | -                    self.max_num_batched_tokens = (
2003 |      | -                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
2004 |      | -            else:
2005 |      | -                # If max_model_len is too short, use
2006 |      | -                # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
2007 |      | -                # for higher throughput.
2008 |      | -                self.max_num_batched_tokens = max(
2009 |      | -                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
2010 |      | -
2011 |      | -            if self.runner_type == "pooling":
2012 |      | -                # Choose specific value for higher throughput
2013 |      | -                self.max_num_batched_tokens = max(
2014 |      | -                    self.max_num_batched_tokens,
2015 |      | -                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
2016 |      | -                )
2017 |      | -            if self.is_multimodal_model:
2018 |      | -                # The value needs to be at least the number of multimodal tokens
2019 |      | -                self.max_num_batched_tokens = max(
2020 |      | -                    self.max_num_batched_tokens,
2021 |      | -                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
2022 |      | -                )
2023 |      | -
2024 |      | -            # When using default settings,
2025 |      | -            # Ensure max_num_batched_tokens does not exceed model limit.
2026 |      | -            # Some models (e.g., Whisper) have embeddings tied to max length.
2027 |      | -            self.max_num_batched_tokens = min(
2028 |      | -                self.max_num_seqs * self.max_model_len,
2029 |      | -                self.max_num_batched_tokens)
2030 |      | -
2031 |      | -        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
2032 |      | -        self.encoder_cache_size = self.max_num_batched_tokens
2033 |      | -
2034 |      | -        if self.enable_chunked_prefill:
2035 |      | -            logger.info(
2036 |      | -                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
2037 |      | -                self.max_num_batched_tokens)
2038 |      | -
2039 |      | -        self.chunked_prefill_enabled = self.enable_chunked_prefill
2040 |      | -        if self.max_num_partial_prefills > 1:
2041 |      | -            if self.long_prefill_token_threshold == 0:
2042 |      | -                self.long_prefill_token_threshold = int(self.max_model_len *
2043 |      | -                                                        0.04)
2044 |      | -
2045 |      | -            logger.info(
2046 |      | -                "Concurrent partial prefills enabled with "
2047 |      | -                "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
2048 |      | -                "long_prefill_token_threshold=%d",
2049 |      | -                self.max_num_partial_prefills, self.max_long_partial_prefills,
2050 |      | -                self.long_prefill_token_threshold)
2051 |      | -
2052 |      | -        # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
2053 |      | -        # This avoids OOM in tight memory scenarios with small max_num_seqs,
2054 |      | -        # and prevents capture of many large graphs (>512) that would greatly
2055 |      | -        # increase startup time with limited performance benefit.
2056 |      | -        if not self.cuda_graph_sizes:
2057 |      | -            self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
2058 |      | -
2059 |      | -        if self.async_scheduling:
2060 |      | -            self.scheduler_cls = (
2061 |      | -                "vllm.v1.core.sched.async_scheduler.AsyncScheduler")
2062 |      | -
2063 |      | -    @model_validator(mode='after')
2064 |      | -    def _verify_args(self) -> Self:
2065 |      | -        if (self.max_num_batched_tokens < self.max_model_len
2066 |      | -                and not self.chunked_prefill_enabled):
2067 |      | -            raise ValueError(
2068 |      | -                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
2069 |      | -                f"smaller than max_model_len ({self.max_model_len}). "
2070 |      | -                "This effectively limits the maximum sequence length to "
2071 |      | -                "max_num_batched_tokens and makes vLLM reject longer "
2072 |      | -                "sequences. Please increase max_num_batched_tokens or "
2073 |      | -                "decrease max_model_len.")
2074 |      | -
2075 |      | -        if self.max_num_batched_tokens < self.max_num_seqs:
2076 |      | -            raise ValueError(
2077 |      | -                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
2078 |      | -                "be greater than or equal to max_num_seqs "
2079 |      | -                f"({self.max_num_seqs}).")
2080 |      | -
2081 |      | -        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
2082 |      | -            logger.warning(
2083 |      | -                "max_num_batched_tokens (%d) exceeds max_num_seqs "
2084 |      | -                "* max_model_len (%d). This may lead to unexpected behavior.",
2085 |      | -                self.max_num_batched_tokens,
2086 |      | -                self.max_num_seqs * self.max_model_len)
2087 |      | -
2088 |      | -        if self.num_lookahead_slots < 0:
2089 |      | -            raise ValueError(
2090 |      | -                "num_lookahead_slots "
2091 |      | -                f"({self.num_lookahead_slots}) must be greater than or "
2092 |      | -                "equal to 0.")
2093 |      | -
2094 |      | -        if self.num_scheduler_steps < 1:
2095 |      | -            raise ValueError(
2096 |      | -                "num_scheduler_steps "
2097 |      | -                f"({self.num_scheduler_steps}) must be greater than or "
2098 |      | -                "equal to 1.")
2099 |      | -
2100 |      | -        if self.max_num_partial_prefills < 1:
2101 |      | -            raise ValueError(
2102 |      | -                f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
2103 |      | -                "must be greater than or equal to 1.")
2104 |      | -        elif self.max_num_partial_prefills > 1:
2105 |      | -            if not self.chunked_prefill_enabled:
2106 |      | -                raise ValueError("Chunked prefill must be enabled to set "
2107 |      | -                                 "max_num_partial_prefills > 1.")
2108 |      | -
2109 |      | -            if self.long_prefill_token_threshold > self.max_model_len:
2110 |      | -                raise ValueError(
2111 |      | -                    "long_prefill_token_threshold "
2112 |      | -                    f"({self.long_prefill_token_threshold}) cannot be greater "
2113 |      | -                    f"than the max_model_len ({self.max_model_len}).")
2114 |      | -
2115 |      | -        if (self.max_long_partial_prefills
2116 |      | -                < 1) or (self.max_long_partial_prefills
2117 |      | -                         > self.max_num_partial_prefills):
2118 |      | -            raise ValueError(
2119 |      | -                f"max_long_partial_prefills ({self.max_long_partial_prefills}) "
2120 |      | -                "must be greater than or equal to 1 and less than or equal to "
2121 |      | -                f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
2122 |      | -
2123 |      | -        return self
2124 |      | -
2125 |      | -    @property
2126 |      | -    def is_multi_step(self) -> bool:
2127 |      | -        return self.num_scheduler_steps > 1
2128 |      | -
2129 |      | -
2130 | 1818 | Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
2131 | 1819 |
2132 | 1820 |
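
Since the class body above is only relocating, a short construction sketch may help reviewers sanity-check the move. This is a hypothetical snippet that assumes `vllm.config.scheduler.SchedulerConfig` keeps the field names and `__post_init__` defaulting shown in the removed lines (the 8192/128 fallbacks and the capture-size default come straight from that code):

    from vllm.config.scheduler import SchedulerConfig

    # Leave the SkipValidation fields unset so __post_init__ fills them in:
    # max_model_len -> 8192, max_num_seqs -> 128, and max_num_batched_tokens is
    # raised to at least DEFAULT_MAX_NUM_BATCHED_TOKENS when chunked prefill is off.
    cfg = SchedulerConfig()
    print(cfg.max_model_len, cfg.max_num_seqs, cfg.max_num_batched_tokens)

    # Default CUDA graph capture list documented above: [min(max_num_seqs * 2, 512)].
    assert cfg.cuda_graph_sizes == [min(cfg.max_num_seqs * 2, 512)]

    # With chunked prefill on, the token budget also seeds the multimodal encoder
    # budget and encoder cache size (both init=False, set in __post_init__).
    chunked = SchedulerConfig(enable_chunked_prefill=True, max_num_seqs=64)
    assert chunked.max_num_encoder_input_tokens == chunked.max_num_batched_tokens
    assert chunked.encoder_cache_size == chunked.max_num_batched_tokens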
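
The removed `cuda_graph_sizes` docstring also spells out how a single configured value expands into a capture list. The helper below is purely illustrative, not vLLM's actual implementation (which lives outside this diff); it only reproduces the three documented rules:

    def expand_cuda_graph_sizes(cuda_graph_sizes: list[int],
                                max_num_seqs: int) -> list[int]:
        """Reproduce the capture-list rules from the cuda_graph_sizes docstring."""
        if not cuda_graph_sizes:
            # Rule 1: default to a single size bounded by 512.
            return [min(max_num_seqs * 2, 512)]
        if len(cuda_graph_sizes) == 1:
            # Rule 2: one value expands to [1, 2, 4] plus multiples of 8 up to it.
            upper = cuda_graph_sizes[0]
            return [1, 2, 4] + [i for i in range(8, upper + 1, 8)]
        # Rule 3: an explicit list is used as given.
        return cuda_graph_sizes

    print(expand_cuda_graph_sizes([], max_num_seqs=128))    # [256]
    print(expand_cuda_graph_sizes([32], max_num_seqs=128))  # [1, 2, 4, 8, 16, 24, 32]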