@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    Iterable, List, Mapping, NamedTuple, Optional)
+                    Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Type, Union, cast, overload

@@ -30,7 +30,7 @@
     get_logits_processors as get_openai_logits_processors)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
-                         PromptType)
+                         PromptType, SingletonInputs)
 from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger

@@ -40,6 +40,7 @@
     get_local_guided_decoding_logits_processor)
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.outputs import (PoolingRequestOutput, RequestOutput,
                           RequestOutputFactory)
 from vllm.pooling_params import PoolingParams
@@ -2029,29 +2030,57 @@ def _validate_model_inputs(self, inputs: ProcessorInputs,
                                lora_request: Optional[LoRARequest]):
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

-        # For encoder-decoder multimodal models, the max_prompt_len
-        # restricts the decoder prompt length
-        if self.model_config.is_multimodal_model:
-            prompt_inputs = decoder_inputs
-        else:
-            prompt_inputs = encoder_inputs or decoder_inputs
+        if encoder_inputs is not None:
+            self._validate_model_input(encoder_inputs,
+                                       lora_request,
+                                       prompt_type="encoder")

-        prompt_ids = prompt_inputs["prompt_token_ids"]
+        self._validate_model_input(decoder_inputs,
+                                   lora_request,
+                                   prompt_type="decoder")

-        if prompt_ids is None or len(prompt_ids) == 0:
-            raise ValueError("Prompt cannot be empty")
+    def _validate_model_input(
+        self,
+        prompt_inputs: SingletonInputs,
+        lora_request: Optional[LoRARequest],
+        *,
+        prompt_type: Literal["encoder", "decoder"],
+    ):
+        if prompt_type == "encoder" and self.tokenizer is not None:
+            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
+            model_config = self.model_config

-        if self.model_config.is_multimodal_model:
-            max_prompt_len = self.model_config.max_model_len
+            if model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config, tokenizer=tokenizer)
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)

-            if len(prompt_ids) > max_prompt_len:
-                raise ValueError(
-                    f"The prompt (total length {len(prompt_ids)}) is too long "
-                    f"to fit into the model (context length {max_prompt_len}). "
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+
+        if not prompt_ids:
+            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if self.model_config.is_multimodal_model:
+                suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
                     "inputs, the number of image tokens depends on the number "
                     "of images, and possibly their aspect ratios as well.")
+            else:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens.")
+
+            raise ValueError(
+                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
+                f"longer than the maximum model length of {max_prompt_len}. "
+                f"{suggestion}")

         # TODO: Find out how many placeholder tokens are there so we can
         # check that chunked prefill does not truncate them
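
For reference, the per-prompt length check introduced above boils down to the behavior sketched below. This is a minimal standalone illustration assuming an already-tokenized prompt; the validate_prompt helper is hypothetical and not part of vLLM's API.

from typing import List, Literal

def validate_prompt(prompt_ids: List[int],
                    max_model_len: int,
                    prompt_type: Literal["encoder", "decoder"]) -> None:
    # Reject empty prompts outright.
    if not prompt_ids:
        raise ValueError(f"The {prompt_type} prompt cannot be empty")
    # Reject prompts that meet or exceed the model's context length,
    # naming which prompt (encoder or decoder) is too long.
    if len(prompt_ids) >= max_model_len:
        raise ValueError(
            f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
            f"longer than the maximum model length of {max_model_len}.")

# Example: a 4096-token decoder prompt against a 2048-token context is rejected.
try:
    validate_prompt(list(range(4096)), max_model_len=2048, prompt_type="decoder")
except ValueError as e:
    print(e)  # The decoder prompt (length 4096) is longer than the maximum model length of 2048.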