Commit a5d11a5

[Bugfix] Fix validation error for text-only Mllama 3.2 (#16377)
Signed-off-by: DarkLight1337 <[email protected]>
Parent: 3d4c877

3 files changed: 39 additions, 30 deletions


vllm/engine/llm_engine.py

Lines changed: 17 additions & 13 deletions
@@ -2046,27 +2046,31 @@ def _validate_model_input(
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
-        if prompt_type == "encoder" and self.tokenizer is not None:
-            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-            model_config = self.model_config
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
 
-            if model_config.is_multimodal_model:
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
 
                 if mm_processor.pad_dummy_encoder_prompt:
                     return  # Skip encoder length check for Whisper
 
-        prompt_ids = prompt_inputs["prompt_token_ids"]
-
-        if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "

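For context: a text-only request to an encoder-decoder multimodal model such as Mllama 3.2 yields an empty encoder prompt, which the old check rejected unconditionally. A minimal standalone sketch of the reworked validation flow (simplified signatures for illustration, not vLLM's actual API):

from typing import Literal

def validate_prompt(
    prompt_token_ids: list[int],
    *,
    prompt_type: Literal["encoder", "decoder"],
    is_multimodal_model: bool,
    max_model_len: int,
) -> None:
    # Empty prompts are now tolerated only on the encoder side of a
    # multimodal model, which is how text-only Mllama requests present.
    if not prompt_token_ids:
        if prompt_type == "encoder" and is_multimodal_model:
            pass  # Mllama may have empty encoder inputs for text-only data
        else:
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

    if len(prompt_token_ids) >= max_model_len:
        raise ValueError(f"The {prompt_type} prompt is too long: "
                         f"{len(prompt_token_ids)} >= {max_model_len}")

# Previously raised "The encoder prompt cannot be empty"; now accepted:
validate_prompt([], prompt_type="encoder", is_multimodal_model=True,
                max_model_len=4096)

The tokenizer=tokenizer or object() fallback appears only to satisfy create_processor's required tokenizer argument when no tokenizer is configured; as the inline comment says, it is a dummy stand-in for that case.
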
vllm/model_executor/models/mllama.py

Lines changed: 4 additions & 1 deletion
@@ -211,6 +211,9 @@ def apply(
         # }
 
         if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
             # Since only the last group of consecutive images
             # are attended by the decoded tokens, we only need to
             # get the number of tokens for those images.
@@ -227,7 +230,7 @@ def apply(
             num_tokens = decode_tiles * token_per_chunk
             mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                      ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens
 
         return mm_inputs

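The hardcoded "<|image|>" literal is replaced with the placeholder string reported by the Hugging Face processor itself, so the textual encoder prompt always stays in sync with encoder_prompt_token_ids. Roughly (illustrative only; the checkpoint name is an example and the weights are gated):

from transformers import AutoProcessor

# MllamaProcessor exposes the placeholder token it inserts for images.
hf_processor = AutoProcessor.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct")
image_token: str = hf_processor.image_token  # "<|image|>" for Mllama
print(image_token * 2)                       # "<|image|><|image|>"
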
vllm/v1/engine/processor.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -315,32 +315,34 @@ def _validate_model_input(
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        model_config = self.model_config
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
 
-        if prompt_type == "encoder":
-            model_config = self.model_config
-
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
-
         prompt_ids = prompt_inputs["prompt_token_ids"]
-
         if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        max_input_id = max(prompt_ids)
+        max_input_id = max(prompt_ids, default=0)
         if max_input_id > tokenizer.max_token_id:
             raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "

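Note the switch from max(prompt_ids) to max(prompt_ids, default=0): once empty encoder prompts are allowed through, the out-of-vocabulary check must not crash on an empty sequence. A quick illustration in plain Python:

prompt_ids: list[int] = []  # text-only Mllama request: empty encoder prompt

try:
    max(prompt_ids)          # old code path
except ValueError as exc:
    print(exc)               # plain max() raises on an empty sequence

print(max(prompt_ids, default=0))  # new code path: returns 0, which
                                   # passes the tokenizer.max_token_id check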