From b228746c137e062d0a0132db1aab34a325fdbcb9 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 30 Jul 2025 06:33:28 +0000
Subject: [PATCH 1/4] [Deprecation] Remove deprecated args and methods for v0.11

Signed-off-by: DarkLight1337
---
 vllm/entrypoints/chat_utils.py     | 34 +++++-------------------------
 vllm/multimodal/registry.py        | 25 ----------------------
 vllm/worker/neuron_model_runner.py |  7 +-----
 3 files changed, 6 insertions(+), 60 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a6602391d408..17f6b15e9c39 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -48,7 +48,7 @@
 # yapf: enable
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import deprecate_kwargs, random_uuid
+from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
 
@@ -383,17 +383,12 @@ def resolve_mistral_chat_template(
     return None
 
 
-@deprecate_kwargs(
-    "trust_remote_code",
-    additional_message="Please use `model_config.trust_remote_code` instead.",
-)
 def resolve_hf_chat_template(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
     *,
-    model_config: ModelConfig,
-    trust_remote_code: Optional[bool] = None,
+    model_config: ModelConfig
 ) -> Optional[str]:
     # 1st priority: The given chat template
     if chat_template is not None:
@@ -488,10 +483,6 @@ def _log_chat_template_content_format(
     )
 
 
-@deprecate_kwargs(
-    "trust_remote_code",
-    additional_message="Please use `model_config.trust_remote_code` instead.",
-)
 def resolve_chat_template_content_format(
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
@@ -499,7 +490,6 @@ def resolve_chat_template_content_format(
     tokenizer: AnyTokenizer,
     *,
     model_config: ModelConfig,
-    trust_remote_code: Optional[bool] = None,
 ) -> _ChatTemplateContentFormat:
     if given_format != "auto":
         return given_format
@@ -568,17 +558,9 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
 
         input_modality = modality.replace("_embeds", "")
 
-        if mm_registry.has_processor(model_config):
-            mm_processor = mm_registry.create_processor(model_config)
-            allowed_counts = mm_processor.info.get_allowed_mm_limits()
-            allowed_count = allowed_counts.get(input_modality, 0)
-        else:
-            mm_config = model_config.multimodal_config
-            if mm_config is None:
-                msg = "This model does not support multi-modal inputs"
-                raise ValueError(msg)
-
-            allowed_count = mm_config.get_limit_per_prompt(input_modality)
+        mm_processor = mm_registry.create_processor(model_config)
+        allowed_counts = mm_processor.info.get_allowed_mm_limits()
+        allowed_count = allowed_counts.get(input_modality, 0)
 
         current_count = len(self._items_by_modality[modality]) + 1
         if current_count > allowed_count:
@@ -1285,10 +1267,6 @@ def parse_chat_messages_futures(
     return conversation, mm_tracker.all_mm_data()
 
 
-@deprecate_kwargs(
-    "trust_remote_code",
-    additional_message="Please use `model_config.trust_remote_code` instead.",
-)
 def apply_hf_chat_template(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     conversation: list[ConversationMessage],
@@ -1297,8 +1275,6 @@ def apply_hf_chat_template(
     *,
     model_config: ModelConfig,
     tokenize: bool = False,  # Different from HF's default
-    # Deprecated, explicitly capture here so it doesn't slip into kwargs.
-    trust_remote_code: Optional[bool] = None,
     **kwargs: Any,
 ) -> str:
     hf_chat_template = resolve_hf_chat_template(
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index bfa391829d29..5f5b620e0cf7 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -5,7 +5,6 @@
 from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
 
 import torch.nn as nn
-from typing_extensions import deprecated
 
 from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
 from vllm.inputs import InputProcessingContext
@@ -105,13 +104,6 @@ def reset_processor_cache(self) -> bool:
 
         return True  # Success
 
-    @deprecated("Legacy input processor/mapper pipeline has been removed. "
-                "Please update your model runner to use "
-                "`seq_group_metadata.multi_modal_data` directly without "
-                "further processing.")
-    def create_input_mapper(self, model_config: "ModelConfig"):
-        return lambda data, mm_processor_kwargs: data
-
     def get_max_tokens_per_item_by_modality(
         self,
         model_config: "ModelConfig",
@@ -182,16 +174,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         """
         return sum(self.get_max_tokens_by_modality(model_config).values())
 
-    @deprecated("Legacy input processor/mapper pipeline has been removed. "
-                "Please update your model runner to use "
-                "`seq_group_metadata.multi_modal_data` directly without "
-                "further processing.")
-    def init_mm_limits_per_prompt(
-        self,
-        model_config: "ModelConfig",
-    ) -> None:
-        pass
-
     def get_mm_limits_per_prompt(
         self,
         model_config: "ModelConfig",
@@ -246,13 +228,6 @@ def _get_model_cls(self, model_config: "ModelConfig"):
         model_cls, _ = get_model_architecture(model_config)
         return model_cls
 
-    @deprecated("Legacy input processor/mapper pipeline has been removed. "
-                "Please update your model runner to use "
-                "`seq_group_metadata.multi_modal_data` directly without "
-                "further processing.")
-    def has_processor(self, model_config: "ModelConfig") -> bool:
-        return True
-
     def create_processor(
         self,
         model_config: "ModelConfig",
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 7ccf1a2c0a87..8317b9abff0c 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -15,8 +15,7 @@
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuron import get_neuron_model
-from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalKwargs)
+from vllm.multimodal import BatchedTensorInputs, MultiModalKwargs
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
@@ -88,10 +87,6 @@ def __init__(
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
 
-        # Multi-modal data support
-        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
-            .create_input_mapper(self.model_config)
-
         # Lazy initialization.
         self.model: nn.Module  # initialize after load_model.
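
Illustrative follow-up (not part of the patch): with has_processor() and create_input_mapper() removed from MultiModalRegistry, downstream code asks the multimodal processor for its per-prompt limits directly, mirroring the new logic in BaseMultiModalItemTracker.add above. A minimal sketch, assuming a helper name of our own choosing:

    from vllm.multimodal import MULTIMODAL_REGISTRY

    def allowed_count_for(model_config, modality: str) -> int:
        # Every model now goes through the processor-based pipeline, so the
        # per-prompt limit comes from the processor instead of the removed
        # legacy helpers (has_processor / get_limit_per_prompt).
        mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
        allowed_counts = mm_processor.info.get_allowed_mm_limits()
        return allowed_counts.get(modality, 0)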
From e5d00150bbfad02eeebed1927c95b5d978f5b4c5 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 30 Jul 2025 06:53:02 +0000
Subject: [PATCH 2/4] Reduce diff

Signed-off-by: DarkLight1337
---
 vllm/entrypoints/chat_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 17f6b15e9c39..6485ed6b148b 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -388,7 +388,7 @@ def resolve_hf_chat_template(
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
     *,
-    model_config: ModelConfig
+    model_config: ModelConfig,
 ) -> Optional[str]:
     # 1st priority: The given chat template
     if chat_template is not None:

From 23796588f88f515bdbc36ee0ceaaaae9dd905d74 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 31 Jul 2025 03:48:21 +0000
Subject: [PATCH 3/4] Debug

Signed-off-by: DarkLight1337
---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2bf0b6fd9a16..21ff5f79ab7a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -145,7 +145,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -sx entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Distributed Tests (4 GPUs) # 10min

From 78ecd66126ca5ccb02f492d6ffcdd18ab4518b6d Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 31 Jul 2025 06:46:15 +0000
Subject: [PATCH 4/4] Test passes

Signed-off-by: DarkLight1337
---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 21ff5f79ab7a..2bf0b6fd9a16 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -145,7 +145,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -sx entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Distributed Tests (4 GPUs) # 10min
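
Illustrative follow-up (not part of the patches): after this series, the chat-template helpers take only the ModelConfig; the removed trust_remote_code keyword is read from model_config.trust_remote_code instead. A hedged sketch of an updated call site; the render_prompt wrapper is hypothetical, and the chat_template/tools keywords are forwarded as in the signatures shown above:

    from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                             resolve_hf_chat_template)

    def render_prompt(tokenizer, conversation, model_config):
        # Resolve which template applies: an explicit template wins,
        # otherwise the tokenizer's own chat template is used.
        chat_template = resolve_hf_chat_template(
            tokenizer,
            chat_template=None,
            tools=None,
            model_config=model_config,
        )
        # Render the conversation as a string (tokenize=False is the
        # vLLM default noted in the diff).
        return apply_hf_chat_template(
            tokenizer,
            conversation,
            chat_template=chat_template,
            tools=None,
            model_config=model_config,
            tokenize=False,
        )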