32 changes: 4 additions & 28 deletions vllm/entrypoints/chat_utils.py
@@ -48,7 +48,7 @@
# yapf: enable
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import deprecate_kwargs, random_uuid
+from vllm.utils import random_uuid

logger = init_logger(__name__)

@@ -383,17 +383,12 @@ def resolve_mistral_chat_template(
return None


-@deprecate_kwargs(
-    "trust_remote_code",
-    additional_message="Please use `model_config.trust_remote_code` instead.",
-)
def resolve_hf_chat_template(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
chat_template: Optional[str],
tools: Optional[list[dict[str, Any]]],
*,
model_config: ModelConfig,
-    trust_remote_code: Optional[bool] = None,
) -> Optional[str]:
# 1st priority: The given chat template
if chat_template is not None:
@@ -488,18 +483,13 @@ def _log_chat_template_content_format(
)


-@deprecate_kwargs(
-    "trust_remote_code",
-    additional_message="Please use `model_config.trust_remote_code` instead.",
-)
def resolve_chat_template_content_format(
chat_template: Optional[str],
tools: Optional[list[dict[str, Any]]],
given_format: ChatTemplateContentFormatOption,
tokenizer: AnyTokenizer,
*,
model_config: ModelConfig,
-    trust_remote_code: Optional[bool] = None,
) -> _ChatTemplateContentFormat:
if given_format != "auto":
return given_format
@@ -568,17 +558,9 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]:

input_modality = modality.replace("_embeds", "")

-        if mm_registry.has_processor(model_config):
-            mm_processor = mm_registry.create_processor(model_config)
-            allowed_counts = mm_processor.info.get_allowed_mm_limits()
-            allowed_count = allowed_counts.get(input_modality, 0)
-        else:
-            mm_config = model_config.multimodal_config
-            if mm_config is None:
-                msg = "This model does not support multi-modal inputs"
-                raise ValueError(msg)
-
-            allowed_count = mm_config.get_limit_per_prompt(input_modality)
+        mm_processor = mm_registry.create_processor(model_config)
+        allowed_counts = mm_processor.info.get_allowed_mm_limits()
+        allowed_count = allowed_counts.get(input_modality, 0)

current_count = len(self._items_by_modality[modality]) + 1
if current_count > allowed_count:
@@ -1285,10 +1267,6 @@ def parse_chat_messages_futures(
return conversation, mm_tracker.all_mm_data()


-@deprecate_kwargs(
-    "trust_remote_code",
-    additional_message="Please use `model_config.trust_remote_code` instead.",
-)
def apply_hf_chat_template(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
conversation: list[ConversationMessage],
@@ -1297,8 +1275,6 @@ def apply_hf_chat_template(
*,
model_config: ModelConfig,
tokenize: bool = False, # Different from HF's default
-    # Deprecated, explicitly capture here so it doesn't slit into kwargs.
-    trust_remote_code: Optional[bool] = None,
**kwargs: Any,
) -> str:
hf_chat_template = resolve_hf_chat_template(
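> **Note (not part of the diff):** with the deprecated `trust_remote_code` keyword removed, callers of `resolve_hf_chat_template`, `resolve_chat_template_content_format`, and `apply_hf_chat_template` now pass only a `ModelConfig`, whose own `trust_remote_code` flag is consulted. A minimal migration sketch, assuming the tokenizer and config are pulled from an existing `LLM` instance (the model name below is a placeholder):

```python
from vllm import LLM
from vllm.entrypoints.chat_utils import resolve_hf_chat_template

llm = LLM(model="facebook/opt-125m")        # placeholder model
tokenizer = llm.get_tokenizer()
model_config = llm.llm_engine.model_config  # carries trust_remote_code

# Before: resolve_hf_chat_template(..., trust_remote_code=True)  # deprecated kwarg
# After: the flag is read from model_config.trust_remote_code directly.
chat_template = resolve_hf_chat_template(
    tokenizer,
    chat_template=None,  # fall back to the tokenizer's built-in template
    tools=None,
    model_config=model_config,
)
```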
25 changes: 0 additions & 25 deletions vllm/multimodal/registry.py
@@ -5,7 +5,6 @@
from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar

import torch.nn as nn
-from typing_extensions import deprecated

from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
from vllm.inputs import InputProcessingContext
@@ -105,13 +104,6 @@ def reset_processor_cache(self) -> bool:

return True # Success

@deprecated("Legacy input processor/mapper pipeline has been removed. "
"Please update your model runner to use "
"`seq_group_metadata.multi_modal_data` directly without "
"further processing.")
def create_input_mapper(self, model_config: "ModelConfig"):
return lambda data, mm_processor_kwargs: data

def get_max_tokens_per_item_by_modality(
self,
model_config: "ModelConfig",
@@ -182,16 +174,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
"""
return sum(self.get_max_tokens_by_modality(model_config).values())

@deprecated("Legacy input processor/mapper pipeline has been removed. "
"Please update your model runner to use "
"`seq_group_metadata.multi_modal_data` directly without "
"further processing.")
def init_mm_limits_per_prompt(
self,
model_config: "ModelConfig",
) -> None:
pass

def get_mm_limits_per_prompt(
self,
model_config: "ModelConfig",
@@ -246,13 +228,6 @@ def _get_model_cls(self, model_config: "ModelConfig"):
model_cls, _ = get_model_architecture(model_config)
return model_cls

@deprecated("Legacy input processor/mapper pipeline has been removed. "
"Please update your model runner to use "
"`seq_group_metadata.multi_modal_data` directly without "
"further processing.")
def has_processor(self, model_config: "ModelConfig") -> bool:
return True

def create_processor(
self,
model_config: "ModelConfig",
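> **Note (not part of the diff):** `has_processor`, `create_input_mapper`, and `init_mm_limits_per_prompt` were shims for the removed legacy input processor/mapper pipeline (`has_processor` always returned `True`, the mapper was an identity function). The only remaining path for per-prompt multimodal limits is the one the chat_utils hunk above now uses. A sketch under the assumption that `model_config` describes a registered multimodal model (the helper name is hypothetical):

```python
from vllm.multimodal import MULTIMODAL_REGISTRY

def allowed_count_for(model_config, modality: str) -> int:
    """Hypothetical helper mirroring the new chat_utils logic."""
    # No has_processor() check anymore: every registered model goes
    # through the multimodal processor created here.
    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
    limits = processor.info.get_allowed_mm_limits()  # e.g. {"image": 1}
    return limits.get(modality, 0)
```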
7 changes: 1 addition & 6 deletions vllm/worker/neuron_model_runner.py
@@ -15,8 +15,7 @@
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.neuron import get_neuron_model
-from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalKwargs)
+from vllm.multimodal import BatchedTensorInputs, MultiModalKwargs
from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
@@ -88,10 +87,6 @@ def __init__(
self.device = self.device_config.device
self.pin_memory = is_pin_memory_available()

-        # Multi-modal data support
-        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
-            .create_input_mapper(self.model_config)
-
# Lazy initialization.
self.model: nn.Module # initialize after load_model.

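> **Note (not part of the diff):** the dropped `multi_modal_input_mapper` was an identity passthrough, so the runner reads `seq_group_metadata.multi_modal_data` directly, matching the deprecation message above. A rough sketch of that consumption pattern (the surrounding runner plumbing is assumed, not shown; the helper name is hypothetical):

```python
from vllm.multimodal import MultiModalKwargs

def gather_mm_kwargs(seq_group_metadata_list):
    """Hypothetical helper: collect precomputed multimodal kwargs for a batch."""
    mm_kwargs_list = []
    for seq_group_metadata in seq_group_metadata_list:
        mm_data = seq_group_metadata.multi_modal_data
        if mm_data:
            # Previously this was routed through the identity input mapper;
            # now the data is consumed as-is.
            mm_kwargs_list.append(mm_data)
    return MultiModalKwargs.batch(mm_kwargs_list)
```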