
Commit 0ff8ebb

[V0 Deprecation] Remove async_output_proc, preemption mode, delay factor (#25334)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent 26e673f commit 0ff8ebb

File tree

15 files changed: +12 additions, −210 deletions

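This commit removes three V0-only knobs end to end: the async output processor flag (ModelConfig.use_async_output_proc / --disable-async-output-proc), the scheduler preemption mode (SchedulerConfig.preemption_mode / --preemption-mode), and the scheduler delay factor (SchedulerConfig.delay_factor / --scheduler-delay-factor). As a quick sanity check of the new surface — a sketch only, assuming both config classes remain regular dataclasses and vLLM is importable in the environment — the removed fields should no longer appear:

# Sketch: confirm the removed V0 fields are gone from the public configs.
# Assumes ModelConfig and SchedulerConfig are still plain dataclasses.
from dataclasses import fields

from vllm.config import ModelConfig, SchedulerConfig

removed = {"use_async_output_proc", "preemption_mode", "delay_factor"}
remaining = ({f.name for f in fields(ModelConfig)}
             | {f.name for f in fields(SchedulerConfig)})
assert removed.isdisjoint(remaining), "a V0 knob is still present"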

tests/detokenizer/test_stop_strings.py

Lines changed: 5 additions & 41 deletions
@@ -32,10 +32,6 @@ def _test_stopping(llm: LLM,
     assert output.stop_reason == expected_reason
 
 
-def _set_async_mode(llm, is_async):
-    llm.llm_engine.scheduler[0].use_async_output_proc = is_async
-
-
 def _stop_basic(llm):
     _test_stopping(llm,
                    stop=["."],
@@ -103,40 +99,8 @@ def test_stop_strings():
     # async output processing below.
     llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
 
-    if envs.VLLM_USE_V1:
-        _stop_basic(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_basic(llm)
-
-        _set_async_mode(llm, False)
-        _stop_basic(llm)
-
-    if envs.VLLM_USE_V1:
-        _stop_multi_tokens(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_multi_tokens(llm)
-
-        _set_async_mode(llm, False)
-        _stop_multi_tokens(llm)
-
-    if envs.VLLM_USE_V1:
-        _stop_partial_token(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_partial_token(llm)
-
-        _set_async_mode(llm, False)
-        _stop_partial_token(llm)
-
-    if envs.VLLM_USE_V1:
-        # FIXME: this does not respect include_in_output=False
-        # _stop_token_id(llm)
-        pass
-    else:
-        _set_async_mode(llm, True)
-        _stop_token_id(llm)
-
-        _set_async_mode(llm, False)
-        _stop_token_id(llm)
+    _stop_basic(llm)
+    _stop_multi_tokens(llm)
+    _stop_partial_token(llm)
+    # FIXME: this does not respect include_in_output=False
+    # _stop_token_id(llm)

tests/v1/engine/test_processor_multi_modal_uuids.py

Lines changed: 0 additions & 10 deletions
@@ -6,7 +6,6 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
-from vllm.platforms.interface import UnspecifiedPlatform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import processor as processor_mod
 from vllm.v1.engine.processor import Processor
@@ -33,15 +32,6 @@ def _mk_processor(monkeypatch,
                         "__post_init__",
                         lambda self, *args: None,
                         raising=True)
-    monkeypatch.setattr(UnspecifiedPlatform,
-                        "is_async_output_supported",
-                        classmethod(lambda cls, enforce_eager: True),
-                        raising=True)
-    monkeypatch.setattr(
-        ModelConfig,
-        "verify_async_output_proc",
-        lambda self, parallel_config, speculative_config, device_config: None,
-        raising=True)
     monkeypatch.setattr(ModelConfig,
                         "verify_with_parallel_config",
                         lambda self, parallel_config: None,

tests/v1/test_oracle.py

Lines changed: 0 additions & 18 deletions
@@ -29,24 +29,6 @@ def test_unsupported_configs(monkeypatch):
         },
     ).create_engine_config()
 
-    with pytest.raises(NotImplementedError):
-        AsyncEngineArgs(
-            model=MODEL,
-            preemption_mode="swap",
-        ).create_engine_config()
-
-    with pytest.raises(NotImplementedError):
-        AsyncEngineArgs(
-            model=MODEL,
-            disable_async_output_proc=True,
-        ).create_engine_config()
-
-    with pytest.raises(NotImplementedError):
-        AsyncEngineArgs(
-            model=MODEL,
-            scheduler_delay_factor=1.2,
-        ).create_engine_config()
-
 
 def test_enable_by_default_fallback(monkeypatch):
     with monkeypatch.context() as m:
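With the engine-args fields deleted outright, the V1 oracle no longer needs these rejection paths: the old keywords cannot even reach create_engine_config(). A hedged sketch of the new failure mode (assuming AsyncEngineArgs stays a dataclass, so an unknown keyword fails at construction time with TypeError instead of the old NotImplementedError; the model name below is a placeholder, not the MODEL constant from the test):

import pytest

from vllm.engine.arg_utils import AsyncEngineArgs

PLACEHOLDER_MODEL = "facebook/opt-125m"  # illustrative model name only


def test_removed_v0_knobs_rejected_at_construction():
    # preemption_mode is no longer a field on EngineArgs/AsyncEngineArgs,
    # so the dataclass __init__ rejects it before any engine config is built.
    with pytest.raises(TypeError):
        AsyncEngineArgs(model=PLACEHOLDER_MODEL, preemption_mode="swap")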

vllm/config/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -454,9 +454,6 @@ def __post_init__(self):
         self.try_verify_and_update_config()
 
         if self.model_config is not None:
-            self.model_config.verify_async_output_proc(self.parallel_config,
-                                                       self.speculative_config,
-                                                       self.device_config)
             self.model_config.verify_with_parallel_config(self.parallel_config)
             self.model_config.verify_dual_chunk_attention_config(
                 self.load_config)
@@ -877,7 +874,6 @@ def __str__(self):
            f"served_model_name={self.model_config.served_model_name}, "
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
-           f"use_async_output_proc={self.model_config.use_async_output_proc}, "
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")

vllm/config/model.py

Lines changed: 6 additions & 42 deletions
@@ -223,8 +223,6 @@ class ModelConfig:
     that this name(s) will also be used in `model_name` tag content of
     prometheus metrics, if multiple names provided, metrics tag will take the
     first one."""
-    use_async_output_proc: bool = True
-    """Whether to use async output processor."""
     config_format: Union[str, ConfigFormat] = "auto"
     """The format of the model config to load:\n
     - "auto" will try to load the config in hf format if available else it
@@ -1119,37 +1117,6 @@ def verify_dual_chunk_attention_config(
                 raise ValueError("please set VLLM_ATTENTION_BACKEND to "
                                  f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
 
-    def verify_async_output_proc(self, parallel_config, speculative_config,
-                                 device_config) -> None:
-        if not self.use_async_output_proc:
-            # Nothing to check
-            return
-
-        if parallel_config.pipeline_parallel_size > 1:
-            self.use_async_output_proc = False
-            return
-
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        from vllm.platforms import current_platform
-        if not current_platform.is_async_output_supported(self.enforce_eager):
-            self.use_async_output_proc = False
-            return
-
-        if envs.VLLM_USE_RAY_SPMD_WORKER:
-            self.use_async_output_proc = False
-            return
-
-        # Async postprocessor is not necessary for pooling models
-        # since there is no token generation
-        if self.runner_type == "pooling":
-            self.use_async_output_proc = False
-
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        if speculative_config:
-            self.use_async_output_proc = False
-
     def verify_with_parallel_config(
         self,
         parallel_config: ParallelConfig,
@@ -1173,15 +1140,12 @@ def verify_with_parallel_config(
             self._verify_with_expert_parallelism()
 
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
-        if pipeline_parallel_size > 1:
-            if not self.registry.is_pp_supported_model(self.architectures,
-                                                       self):
-                raise NotImplementedError(
-                    "Pipeline parallelism is not supported for this model. "
-                    "Supported models implement the `SupportsPP` interface.")
-
-            if self.use_async_output_proc:
-                self.use_async_output_proc = False
+        if (pipeline_parallel_size > 1
+                and not self.registry.is_pp_supported_model(
+                    self.architectures, self)):
+            raise NotImplementedError(
+                "Pipeline parallelism is not supported for this model. "
+                "Supported models implement the `SupportsPP` interface.")
 
     def get_sliding_window(self) -> Optional[int]:
         """Get the sliding window size from the HF text config if present."""

vllm/config/scheduler.py

Lines changed: 1 addition & 14 deletions
@@ -3,7 +3,7 @@
 
 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, Union
 
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -18,7 +18,6 @@
 logger = init_logger(__name__)
 
 RunnerType = Literal["generate", "pooling", "draft"]
-PreemptionMode = Literal["swap", "recompute"]
 SchedulerPolicy = Literal["fcfs", "priority"]
 
 
@@ -78,10 +77,6 @@ class SchedulerConfig:
     3. more than one value (e.g. 1 2 128) is provided, then the capture list
     will follow the provided list."""
 
-    delay_factor: float = 0.0
-    """Apply a delay (of delay factor multiplied by previous
-    prompt latency) before scheduling next prompt."""
-
     enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""
@@ -103,14 +98,6 @@ class SchedulerConfig:
     NOTE: This is not currently configurable. It will be overridden by
     max_num_batched_tokens in case max multimodal embedding size is larger."""
 
-    preemption_mode: Optional[PreemptionMode] = None
-    """Whether to perform preemption by swapping or
-    recomputation. If not specified, we determine the mode as follows:
-    We use recomputation by default since it incurs lower overhead than
-    swapping. However, when the sequence group has multiple sequences
-    (e.g., beam search), recomputation is not currently supported. In
-    such a case, we use swapping instead."""
-
     send_delta_data: bool = False
     """Private API. If used, scheduler sends delta data to
     workers instead of an entire data. It should be enabled only
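The deleted docstrings above were the only place these knobs' semantics were spelled out: delay_factor delayed the next prompt by a multiple of the previous prompt's latency, and preemption_mode chose between swap and recompute preemption. Purely as an illustration of the removed delay_factor semantics, per that old docstring (the helper below is hypothetical, not vLLM code):

def next_prompt_delay(delay_factor: float, last_prompt_latency_s: float) -> float:
    """Delay V0 applied before scheduling the next prompt (per the removed docstring)."""
    return delay_factor * last_prompt_latency_s


assert next_prompt_delay(0.0, 0.8) == 0.0  # old default: no delay
assert next_prompt_delay(1.2, 0.5) == 0.6  # e.g. --scheduler-delay-factor 1.2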

vllm/engine/arg_utils.py

Lines changed: 0 additions & 34 deletions
@@ -409,9 +409,7 @@ class EngineArgs:
         get_field(LoadConfig, "model_loader_extra_config")
     ignore_patterns: Optional[Union[str,
                                     List[str]]] = LoadConfig.ignore_patterns
-    preemption_mode: Optional[str] = SchedulerConfig.preemption_mode
 
-    scheduler_delay_factor: float = SchedulerConfig.delay_factor
     enable_chunked_prefill: Optional[
         bool] = SchedulerConfig.enable_chunked_prefill
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
@@ -439,7 +437,6 @@ class EngineArgs:
         ObservabilityConfig.otlp_traces_endpoint
     collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
         ObservabilityConfig.collect_detailed_traces
-    disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
 
@@ -561,14 +558,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                  **model_kwargs["enable_prompt_embeds"])
         model_group.add_argument("--served-model-name",
                                  **model_kwargs["served_model_name"])
-        # This one is a special case because it is the
-        # opposite of ModelConfig.use_async_output_proc
-        model_group.add_argument(
-            "--disable-async-output-proc",
-            action="store_true",
-            default=EngineArgs.disable_async_output_proc,
-            help="Disable async output processing. This may result in "
-            "lower performance.")
         model_group.add_argument("--config-format",
                                  **model_kwargs["config_format"])
         # This one is a special case because it can bool
@@ -897,10 +886,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             **scheduler_kwargs["long_prefill_token_threshold"])
         scheduler_group.add_argument("--num-lookahead-slots",
                                      **scheduler_kwargs["num_lookahead_slots"])
-        scheduler_group.add_argument("--scheduler-delay-factor",
-                                     **scheduler_kwargs["delay_factor"])
-        scheduler_group.add_argument("--preemption-mode",
-                                     **scheduler_kwargs["preemption_mode"])
         # multi-step scheduling has been removed; corresponding arguments
         # are no longer supported.
         scheduler_group.add_argument("--scheduling-policy",
@@ -1029,7 +1014,6 @@ def create_model_config(self) -> ModelConfig:
             interleave_mm_strings=self.interleave_mm_strings,
             media_io_kwargs=self.media_io_kwargs,
             skip_mm_profiling=self.skip_mm_profiling,
-            use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
             mm_processor_cache_gb=self.mm_processor_cache_gb,
@@ -1395,11 +1379,9 @@ def create_engine_config(
             max_model_len=model_config.max_model_len,
             cuda_graph_sizes=self.cuda_graph_sizes,
             num_lookahead_slots=num_lookahead_slots,
-            delay_factor=self.scheduler_delay_factor,
             enable_chunked_prefill=self.enable_chunked_prefill,
             disable_chunked_mm_input=self.disable_chunked_mm_input,
             is_multimodal_model=model_config.is_multimodal_model,
-            preemption_mode=self.preemption_mode,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
             policy=self.scheduling_policy,
@@ -1492,22 +1474,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=False)
             return False
 
-        if self.preemption_mode != SchedulerConfig.preemption_mode:
-            _raise_or_fallback(feature_name="--preemption-mode",
-                               recommend_to_remove=True)
-            return False
-
-        if (self.disable_async_output_proc
-                != EngineArgs.disable_async_output_proc):
-            _raise_or_fallback(feature_name="--disable-async-output-proc",
-                               recommend_to_remove=True)
-            return False
-
-        if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
-            _raise_or_fallback(feature_name="--scheduler-delay-factor",
-                               recommend_to_remove=True)
-            return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,

vllm/entrypoints/llm.py

Lines changed: 0 additions & 4 deletions
@@ -137,8 +137,6 @@ class LLM:
             back to the eager mode.
         disable_custom_all_reduce: See
             [ParallelConfig][vllm.config.ParallelConfig].
-        disable_async_output_proc: Disable async output processing.
-            This may result in lower performance.
         hf_token: The token to use as HTTP bearer authorization for remote files
             . If `True`, will use the token generated when running
             `huggingface-cli login` (stored in `~/.huggingface`).
@@ -188,7 +186,6 @@ def __init__(
         enforce_eager: bool = False,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
-        disable_async_output_proc: bool = False,
         hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
@@ -286,7 +283,6 @@ def __init__(
             enforce_eager=enforce_eager,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
-            disable_async_output_proc=disable_async_output_proc,
             hf_token=hf_token,
            hf_overrides=hf_overrides,
            mm_processor_kwargs=mm_processor_kwargs,
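From the LLM entrypoint the change shows up only as a narrower constructor: async output processing can no longer be turned off. A minimal sketch, assuming a small placeholder model name and that unknown keywords still end up at the engine-args dataclass, where they now fail:

from vllm import LLM

# Post-commit: the plain constructor is all that is needed.
llm = LLM(model="facebook/opt-125m", enforce_eager=True)

# Pre-commit V0 call sites like the following now fail, since the keyword
# was removed from LLM.__init__ (and from EngineArgs underneath it):
# llm = LLM(model="facebook/opt-125m", disable_async_output_proc=True)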

vllm/executor/uniproc_executor.py

Lines changed: 0 additions & 4 deletions
@@ -137,10 +137,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
     def _init_executor(self) -> None:
         """Initialize the worker and load the model.
         """
-        assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
-            ("ExecutorWithExternalLauncher needs deterministic "
-             "execution, so it"
-             "does not support delay_factor in scheduling")
         if envs.VLLM_USE_V1:
             assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
                 ("To get deterministic execution in V1, "

vllm/platforms/cpu.py

Lines changed: 0 additions & 4 deletions
@@ -126,10 +126,6 @@ def set_device(cls, device: torch.device) -> None:
         """
         torch.cpu.set_device(device)
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return False
-
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
