Commit a5d11a5

[Bugfix] Fix validation error for text-only Mllama 3.2 (#16377)
Signed-off-by: DarkLight1337 <[email protected]>
Parent: 3d4c877

3 files changed: 39 additions, 30 deletions


vllm/engine/llm_engine.py

Lines changed: 17 additions & 13 deletions
@@ -2046,27 +2046,31 @@ def _validate_model_input(
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
-        if prompt_type == "encoder" and self.tokenizer is not None:
-            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-            model_config = self.model_config
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
 
-            if model_config.is_multimodal_model:
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
 
                 if mm_processor.pad_dummy_encoder_prompt:
                     return  # Skip encoder length check for Whisper
 
-        prompt_ids = prompt_inputs["prompt_token_ids"]
-
-        if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "

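For context: a text-only request to an encoder-decoder multimodal model such as Mllama 3.2 yields an empty encoder prompt, which the old check rejected unconditionally. A minimal standalone sketch of the reworked validation flow (simplified signatures for illustration, not vLLM's actual API):

from typing import Literal

def validate_prompt(
    prompt_token_ids: list[int],
    *,
    prompt_type: Literal["encoder", "decoder"],
    is_multimodal_model: bool,
    max_model_len: int,
) -> None:
    # Empty prompts are now tolerated only on the encoder side of a
    # multimodal model, which is how text-only Mllama requests present.
    if not prompt_token_ids:
        if prompt_type == "encoder" and is_multimodal_model:
            pass  # Mllama may have empty encoder inputs for text-only data
        else:
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

    if len(prompt_token_ids) >= max_model_len:
        raise ValueError(f"The {prompt_type} prompt is too long: "
                         f"{len(prompt_token_ids)} >= {max_model_len}")

# Previously raised "The encoder prompt cannot be empty"; now accepted:
validate_prompt([], prompt_type="encoder", is_multimodal_model=True,
                max_model_len=4096)

The tokenizer=tokenizer or object() fallback appears only to satisfy create_processor's required tokenizer argument when no tokenizer is configured; as the inline comment says, it is a dummy stand-in for that case.
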
vllm/model_executor/models/mllama.py

Lines changed: 4 additions & 1 deletion
@@ -211,6 +211,9 @@ def apply(
         # }
 
         if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
             # Since only the last group of consecutive images
             # are attended by the decoded tokens, we only need to
             # get the number of tokens for those images.
@@ -227,7 +230,7 @@ def apply(
             num_tokens = decode_tiles * token_per_chunk
             mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                      ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens
 
         return mm_inputs

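The hardcoded "<|image|>" literal is replaced with the placeholder string reported by the Hugging Face processor itself, so the textual encoder prompt always stays in sync with encoder_prompt_token_ids. Roughly (illustrative only; the checkpoint name is an example and the weights are gated):

from transformers import AutoProcessor

# MllamaProcessor exposes the placeholder token it inserts for images.
hf_processor = AutoProcessor.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct")
image_token: str = hf_processor.image_token  # "<|image|>" for Mllama
print(image_token * 2)                       # "<|image|><|image|>"
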
vllm/v1/engine/processor.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -315,32 +315,34 @@ def _validate_model_input(
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        model_config = self.model_config
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
 
-        if prompt_type == "encoder":
-            model_config = self.model_config
-
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
-
         prompt_ids = prompt_inputs["prompt_token_ids"]
-
         if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        max_input_id = max(prompt_ids)
+        max_input_id = max(prompt_ids, default=0)
         if max_input_id > tokenizer.max_token_id:
             raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "

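Note the switch from max(prompt_ids) to max(prompt_ids, default=0): once empty encoder prompts are allowed through, the out-of-vocabulary check must not crash on an empty sequence. A quick illustration in plain Python:

prompt_ids: list[int] = []  # text-only Mllama request: empty encoder prompt

try:
    max(prompt_ids)          # old code path
except ValueError as exc:
    print(exc)               # plain max() raises on an empty sequence

print(max(prompt_ids, default=0))  # new code path: returns 0, which
                                   # passes the tokenizer.max_token_id check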