
Commit bbb7003

maxdebayser authored and simon-mo committed
Enable conversion of multimodal models to pooling tasks (#24451)
Signed-off-by: Max de Bayser <[email protected]>
1 parent 89da8d9 commit bbb7003

File tree

5 files changed: +266 -59 lines changed

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.platforms import current_platform


def test_idefics_multimodal(
    vllm_runner,
    monkeypatch,
) -> None:
    if current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend
        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
                     runner="pooling",
                     task="classify",
                     convert="classify",
                     load_format="dummy",
                     max_model_len=512,
                     enforce_eager=True,
                     tensor_parallel_size=1,
                     disable_log_stats=True,
                     dtype="bfloat16") as vllm_model:
        llm = vllm_model.get_llm()
        outputs = llm.classify(prompts)
        for output in outputs:
            assert len(output.outputs.probs) == 2


def update_config(config):
    config.text_config.update({
        "architectures": ["Gemma3ForSequenceClassification"],
        "classifier_from_token": ["A", "B", "C", "D", "E"],
        "method": "no_post_processing",
        "id2label": {
            "A": "Chair",
            "B": "Couch",
            "C": "Table",
            "D": "Bed",
            "E": "Cupboard"
        },
    })
    return config


def test_gemma_multimodal(
    vllm_runner,
    monkeypatch,
) -> None:
    if current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend
        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

    messages = [{
        "role": "system",
        "content": """
        You are a helpful assistant. You will be given a product description
        which may also include an image. Classify the following product into
        one of the categories:

        A = chair
        B = couch
        C = table
        D = bed
        E = cupboard

        You'll answer with exactly one letter (A, B, C, D, or E)."""
    }, {
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url":
                "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
            }
        }, {
            "type": "text",
            "text": "A fine 19th century piece of furniture."
        }]
    }]

    with vllm_runner(model_name="google/gemma-3-4b-it",
                     runner="pooling",
                     task="classify",
                     convert="classify",
                     load_format="auto",
                     hf_overrides=update_config,
                     override_pooler_config={"pooling_type": "LAST"},
                     max_model_len=512,
                     enforce_eager=True,
                     tensor_parallel_size=1,
                     disable_log_stats=True,
                     dtype="bfloat16") as vllm_model:

        llm = vllm_model.get_llm()
        prompts = llm.preprocess_chat(messages)

        result = llm.classify(prompts)
        assert result[0].outputs.probs[0] > 0.95
        assert all(c < 0.05 for c in result[0].outputs.probs[1:])

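For readers who want to exercise the new conversion path outside the pytest harness, the sketch below mirrors the arguments that test_gemma_multimodal passes through vllm_runner (the LLM constructor accepts the same keyword arguments). Treat it as an illustration of the tests above rather than a documented recipe: the system message with the category definitions is omitted for brevity, and defaults are assumed for everything not shown.

# Minimal sketch, mirroring test_gemma_multimodal above (not a supported recipe).
from vllm import LLM


def update_config(config):
    # Same HF-config override as in the test: expose Gemma3 as a sequence
    # classifier whose five classes are read off the tokens A-E.
    config.text_config.update({
        "architectures": ["Gemma3ForSequenceClassification"],
        "classifier_from_token": ["A", "B", "C", "D", "E"],
        "method": "no_post_processing",
        "id2label": {"A": "Chair", "B": "Couch", "C": "Table",
                     "D": "Bed", "E": "Cupboard"},
    })
    return config


llm = LLM(model="google/gemma-3-4b-it",
          runner="pooling",
          convert="classify",
          hf_overrides=update_config,
          override_pooler_config={"pooling_type": "LAST"},
          max_model_len=512,
          enforce_eager=True,
          dtype="bfloat16")

# Shortened prompt: the test additionally sends a system message that defines
# the categories A-E before asking for a single-letter answer.
messages = [{
    "role": "user",
    "content": [{
        "type": "image_url",
        "image_url": {
            "url":
            "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
        }
    }, {
        "type": "text",
        "text": "A fine 19th century piece of furniture."
    }]
}]

prompts = llm.preprocess_chat(messages)  # new helper added in this commit
print(llm.classify(prompts)[0].outputs.probs)
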
vllm/entrypoints/llm.py

Lines changed: 90 additions & 52 deletions
@@ -703,13 +703,10 @@ def create_tokens_prompt_from_beam(

         return outputs

-    def chat(
+    def preprocess_chat(
         self,
         messages: Union[list[ChatCompletionMessageParam],
                         list[list[ChatCompletionMessageParam]]],
-        sampling_params: Optional[Union[SamplingParams,
-                                        list[SamplingParams]]] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
@@ -718,56 +715,16 @@ def chat(
         tools: Optional[list[dict[str, Any]]] = None,
         chat_template_kwargs: Optional[dict[str, Any]] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
-    ) -> list[RequestOutput]:
+    ) -> list[TokensPrompt]:
         """
-        Generate responses for a chat conversation.
-
-        The chat conversation is converted into a text prompt using the
-        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
-        the responses.
-
-        Multi-modal inputs can be passed in the same way you would pass them
-        to the OpenAI API.
-
-        Args:
-            messages: A list of conversations or a single conversation.
-
-                - Each conversation is represented as a list of messages.
-                - Each message is a dictionary with 'role' and 'content' keys.
-
-            sampling_params: The sampling parameters for text generation.
-                If None, we use the default sampling parameters. When it
-                is a single value, it is applied to every prompt. When it
-                is a list, the list must have the same length as the
-                prompts and it is paired one by one with the prompt.
-            use_tqdm: If `True`, shows a tqdm progress bar.
-                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
-                it is used to create the progress bar.
-                If `False`, no progress bar is created.
-            lora_request: LoRA request to use for generation, if any.
-            chat_template: The template to use for structuring the chat.
-                If not provided, the model's default chat template will be used.
-            chat_template_content_format: The format to render message content.
-
-                - "string" will render the content as a string.
-                  Example: `"Who are you?"`
-                - "openai" will render the content as a list of dictionaries,
-                  similar to OpenAI schema.
-                  Example: `[{"type": "text", "text": "Who are you?"}]`
-
-            add_generation_prompt: If True, adds a generation template
-                to each message.
-            continue_final_message: If True, continues the final message in
-                the conversation instead of starting a new one. Cannot be
-                `True` if `add_generation_prompt` is also `True`.
-            chat_template_kwargs: Additional kwargs to pass to the chat
-                template.
-            mm_processor_kwargs: Multimodal processor kwarg overrides for this
-                chat request. Only used for offline requests.
+        Generate prompt for a chat conversation. The pre-processed
+        prompt can then be used as input for the other LLM methods.

+        Refer to `chat` for a complete description of the arguments.
         Returns:
-            A list of `RequestOutput` objects containing the generated
-            responses in the same order as the input messages.
+            A list of `TokensPrompts` objects containing the tokenized
+            prompt after chat template interpolation, and the
+            pre-processed multi-modal inputs.
         """
         list_of_messages: list[list[ChatCompletionMessageParam]]

@@ -800,7 +757,7 @@ def chat(
         )
         _chat_template_kwargs.update(chat_template_kwargs or {})

-        prompts: list[Union[TokensPrompt, TextPrompt]] = []
+        prompts: list[TokensPrompt] = []

         for msgs in list_of_messages:
             # NOTE: _parse_chat_message_content_parts() currently doesn't
@@ -844,6 +801,87 @@ def chat(

             prompts.append(prompt)

+        return prompts
+
+    def chat(
+        self,
+        messages: Union[list[ChatCompletionMessageParam],
+                        list[list[ChatCompletionMessageParam]]],
+        sampling_params: Optional[Union[SamplingParams,
+                                        list[SamplingParams]]] = None,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
+        lora_request: Optional[LoRARequest] = None,
+        chat_template: Optional[str] = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
+        add_generation_prompt: bool = True,
+        continue_final_message: bool = False,
+        tools: Optional[list[dict[str, Any]]] = None,
+        chat_template_kwargs: Optional[dict[str, Any]] = None,
+        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    ) -> list[RequestOutput]:
+        """
+        Generate responses for a chat conversation.
+
+        The chat conversation is converted into a text prompt using the
+        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
+        the responses.
+
+        Multi-modal inputs can be passed in the same way you would pass them
+        to the OpenAI API.
+
+        Args:
+            messages: A list of conversations or a single conversation.
+
+                - Each conversation is represented as a list of messages.
+                - Each message is a dictionary with 'role' and 'content' keys.
+
+            sampling_params: The sampling parameters for text generation.
+                If None, we use the default sampling parameters. When it
+                is a single value, it is applied to every prompt. When it
+                is a list, the list must have the same length as the
+                prompts and it is paired one by one with the prompt.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
+            lora_request: LoRA request to use for generation, if any.
+            chat_template: The template to use for structuring the chat.
+                If not provided, the model's default chat template will be used.
+            chat_template_content_format: The format to render message content.
+
+                - "string" will render the content as a string.
+                  Example: `"Who are you?"`
+                - "openai" will render the content as a list of dictionaries,
+                  similar to OpenAI schema.
+                  Example: `[{"type": "text", "text": "Who are you?"}]`
+
+            add_generation_prompt: If True, adds a generation template
+                to each message.
+            continue_final_message: If True, continues the final message in
+                the conversation instead of starting a new one. Cannot be
+                `True` if `add_generation_prompt` is also `True`.
+            chat_template_kwargs: Additional kwargs to pass to the chat
+                template.
+            mm_processor_kwargs: Multimodal processor kwarg overrides for this
+                chat request. Only used for offline requests.
+
+        Returns:
+            A list of `RequestOutput` objects containing the generated
+            responses in the same order as the input messages.
+        """
+
+        prompts = self.preprocess_chat(
+            messages=messages,
+            lora_request=lora_request,
+            chat_template=chat_template,
+            chat_template_content_format=chat_template_content_format,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=continue_final_message,
+            tools=tools,
+            chat_template_kwargs=chat_template_kwargs,
+            mm_processor_kwargs=mm_processor_kwargs,
+        )
+
         return self.generate(
             prompts,
             sampling_params=sampling_params,

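The net effect of this refactor: the chat-template rendering and multimodal preprocessing that used to live inside chat() is now exposed as preprocess_chat(), and chat() simply forwards the resulting TokensPrompt list to generate(). As the new docstring notes, the same prompts can be fed to the other LLM methods; a rough sketch of that usage, assuming an llm instance already configured for a pooling task as in the tests above:

# `llm` is assumed to be an LLM constructed with runner="pooling" /
# convert="classify", e.g. as in the sketch following the new test file.
messages = [{"role": "user",
             "content": "Answer with exactly one letter: A, B, C, D or E."}]

prompts = llm.preprocess_chat(messages)  # chat template + multimodal preprocessing only
results = llm.classify(prompts)          # reuse the tokenized prompts with a pooling API
print(results[0].outputs.probs)

# The generative path is unchanged: chat() now calls preprocess_chat() and
# passes the prompts straight to generate().
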
vllm/model_executor/model_loader/utils.py

Lines changed: 14 additions & 4 deletions
@@ -19,10 +19,11 @@
 from vllm.model_executor.layers.linear import QKVCrossParallelLinear
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.models.adapters import (as_embedding_model,
-                                                 as_reward_model,
-                                                 as_seq_cls_model)
-from vllm.model_executor.models.interfaces import SupportsQuant
+from vllm.model_executor.models.adapters import (
+    as_embedding_model, as_reward_model, as_seq_cls_model,
+    try_create_mm_pooling_model_cls)
+from vllm.model_executor.models.interfaces import (SupportsQuant,
+                                                   supports_multimodal)
 from vllm.utils import is_pin_memory_available

 logger = init_logger(__name__)
@@ -183,6 +184,15 @@ def get_model_architecture(
                 "performance may not be optimal.", arch)

     convert_type = model_config.convert_type
+    if convert_type != "none" and supports_multimodal(model_cls):
+        logger.debug_once("Detected conversion of Multi Modal model.")
+        converted = try_create_mm_pooling_model_cls(model_cls)
+        if converted is not None:
+            logger.debug_once("Creating wrapper class to forward pooler.")
+            return converted, arch
+        else:
+            logger.debug_once("Attempting direct conversion.")
+
     if convert_type == "none":
         pass
     elif convert_type == "embed":

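The new branch in get_model_architecture() only announces itself through debug logging, so the easiest way to confirm which path was taken is to raise the log level. A rough sketch follows; it assumes VLLM_LOGGING_LEVEL is the environment variable controlling vLLM's log verbosity (an assumption about the runtime environment, not something introduced by this diff) and reuses the Idefics3 arguments from the new test so that dummy weights are used.

import os

# Assumption: VLLM_LOGGING_LEVEL=DEBUG surfaces the logger.debug_once() messages.
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"

from vllm import LLM  # noqa: E402  (import after setting the env var)

llm = LLM(model="HuggingFaceM4/Idefics3-8B-Llama3",
          runner="pooling",
          convert="classify",
          load_format="dummy",   # as in test_idefics_multimodal: random weights
          max_model_len=512,
          enforce_eager=True,
          dtype="bfloat16")

# Expected debug output from the new branch:
#   "Detected conversion of Multi Modal model."
# followed by either
#   "Creating wrapper class to forward pooler."   (try_create_mm_pooling_model_cls succeeded)
# or
#   "Attempting direct conversion."               (fallback to the existing adapters)
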
0 commit comments
