Commit 95c7f50

skhorasganiTT authored and idjuricTT committed
[vLLM] Compatibility fixes for model generators after pulling Apr16-July22 upstream changes - removed legacy input processors and refactored for multi-modal models (#28406)
### Ticket
[N/A](#27285)

### Problem description
- Legacy input mappers/processors were removed from vLLM V0 (vllm-project/vllm#15686, vllm-project/vllm#10114). These changes are required to keep the existing integrated models compatible after pulling the upstream changes in tenstorrent/vllm#172.

### What's changed
- Removed legacy vLLM input processors from Llama3, Gemma3, and Qwen2.5-VL.
- Defined new multi-modal input processor classes for Llama3.2-11B-Vision (`MllamaMultiModalProcessor`) and for Gemma3 / Qwen2.5-VL (`MultiModalProcessor`), and added support for multi-modal limits for each.
- Moved the max seq len assertion for Llama8B to model initialization; `--max_model_len` must be set on the vLLM side for any model that supports less than the default max context length.
- Fixed a bug where the `create_multimodal_model` import was removed for Llama3.2-11B-Vision and broke the model (from 87b758d).

### Checklist
- [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes
- [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI with demo tests passes (if applicable)
- [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable)
- [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable)
- [x] (For models and ops writers) [Single-card demo tests](https://github.com/tenstorrent/tt-metal/actions/workflows/single-card-demo-tests.yaml) CI passes (if applicable). See the [recommended dev flow](https://github.com/tenstorrent/tt-metal/blob/main/models/docs/MODEL_ADD.md#a-recommended-dev-flow-on-github-for-adding-new-models).
- [x] [Galaxy quick](https://github.com/tenstorrent/tt-metal/actions/workflows/galaxy-quick.yaml) CI passes (if applicable)
- [x] [Galaxy demo tests, for Llama](https://github.com/tenstorrent/tt-metal/actions/workflows/galaxy-demo-tests.yaml) CI passes (if applicable, because of current Llama work)
- [x] (For runtime and ops writers) [T3000 unit tests](https://github.com/tenstorrent/tt-metal/actions/workflows/t3000-unit-tests.yaml) CI passes (if applicable, since this is run on push to main)
- [x] (For models and ops writers) [T3000 demo tests](https://github.com/tenstorrent/tt-metal/actions/workflows/t3000-demo-tests.yaml) CI passes (if applicable, since this is required for release)
- [x] New/existing tests provide coverage for changes (vLLM nightly tests: https://github.com/tenstorrent/tt-metal/actions/runs/17680447236)

---------

Signed-off-by: Salar <[email protected]>
Co-authored-by: Igor Djuric <[email protected]>
1 parent f04ca48 · commit 95c7f50
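As noted in the description, the max sequence length check for Llama8B now happens at model initialization, and any model that supports less than the default context window relies on `--max_model_len` being set on the vLLM side. A minimal sketch of what that looks like with vLLM's offline `LLM` API follows; the model id and the 64K value are placeholders rather than values taken from this commit, and the TT backend setup is omitted.

```python
# Minimal sketch (assumptions: placeholder model id and context length; TT plugin setup omitted).
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    max_model_len=65536,  # cap the context length on the vLLM side instead of asserting in the generator
)
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```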

File tree: 3 files changed (+246 −140 lines)
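The common thread across the changed files is swapping vLLM's removed `INPUT_REGISTRY.register_input_processor` hook for `MULTIMODAL_REGISTRY.register_processor`, which registers a processor class together with a `ProcessingInfo` class (declaring per-modality limits) and a dummy-inputs builder used for profiling. A rough sketch of that pattern in isolation is shown below; it reuses the `MultiModalProcessor` and `DummyInputsBuilder` helpers named in the diff, the limit values are examples only, and the class names prefixed with `Example` are hypothetical.

```python
# Rough sketch of the new registration pattern, not the exact code in this commit.
from typing import Mapping, Optional

from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VLProcessingInfo
from vllm.multimodal import MULTIMODAL_REGISTRY

from models.tt_transformers.tt.generator_vllm import DummyInputsBuilder, MultiModalProcessor


class ExampleProcessingInfo(Qwen2_5_VLProcessingInfo):
    """Declares which modalities (and how many of each) the backend accepts."""

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1, "video": 0}  # example limits only


# The decorator attaches the processor, info, and dummy-inputs classes to the model class,
# replacing the removed INPUT_REGISTRY.register_input_processor hook.
@MULTIMODAL_REGISTRY.register_processor(
    MultiModalProcessor, info=ExampleProcessingInfo, dummy_inputs=DummyInputsBuilder
)
class ExampleForConditionalGeneration:  # stand-in for the real generator class
    ...
```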

models/demos/llama3_70b_galaxy/tt/generator_vllm.py

Lines changed: 0 additions & 6 deletions
@@ -9,7 +9,6 @@
 from models.demos.llama3_70b_galaxy.tt.llama_model import TtTransformer
 from models.demos.llama3_70b_galaxy.tt.model_config import LlamaOptimizations, TtModelArgs
 from models.tt_transformers.tt.generator import create_submeshes
-from vllm.inputs import INPUT_REGISTRY


 def allocate_vllm_kv_cache(kv_cache_shape, dtype, num_layers, model: TtTransformer, tt_cache_path):
@@ -88,11 +87,6 @@ def initialize_vllm_text_transformer(
     return tt_model, model_args


-def input_processor_for_llama_text(ctx, inputs):
-    return inputs
-
-
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llama_text)
 class LlamaForCausalLM(Generator):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

models/demos/qwen25_vl/tt/generator_vllm.py

Lines changed: 18 additions & 6 deletions
@@ -4,21 +4,23 @@


 from types import SimpleNamespace
+from typing import Mapping, Optional

 import torch
 from loguru import logger
 from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VLForConditionalGeneration as Ref_Qwen2_5_VLForConditionalGeneration,
 )
-from vllm.inputs import INPUT_REGISTRY
 from vllm.model_executor.models.interfaces import SupportsMultiModal
+from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VLProcessingInfo
+from vllm.multimodal import MULTIMODAL_REGISTRY

 import ttnn
 from models.demos.qwen25_vl.tt.common import merge_vision_tokens, multimodal_rope_from_hf, preprocess_inputs_prefill
 from models.demos.qwen25_vl.tt.generator import Generator as QwenVLGenerator
 from models.demos.qwen25_vl.tt.model import DropInVisionTransformer, Transformer
 from models.demos.qwen25_vl.tt.model_config import VisionModelArgs
-from models.tt_transformers.tt.generator_vllm import input_processor_for_multimodal
+from models.tt_transformers.tt.generator_vllm import DummyInputsBuilder, MultiModalProcessor
 from models.tt_transformers.tt.model_config import DecodersPrecision, ModelArgs


@@ -89,7 +91,15 @@ def __contains__(self, key):
         return key in self.__dict__


-@INPUT_REGISTRY.register_input_processor(input_processor_for_multimodal)
+class TT_Qwen2_5_VLProcessingInfo(Qwen2_5_VLProcessingInfo):
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1, "video": 0}  # [INFO] videos are not supported yet, only supporting 1 image for now
+
+
+# TODO: Eventually replace MultiModalProcessor with vllm.model_executor.models.qwen2_5_vl::Qwen2_5_VLMultiModalProcessor
+@MULTIMODAL_REGISTRY.register_processor(
+    MultiModalProcessor, info=TT_Qwen2_5_VLProcessingInfo, dummy_inputs=DummyInputsBuilder
+)
 class Qwen2_5_VLForConditionalGeneration(QwenVLGenerator, SupportsMultiModal):
     def __init__(self, *args, **kwargs):
         self.reference_model = kwargs.pop("reference_model", None)
@@ -167,15 +177,17 @@ def prefill_forward(

         # reconstruct the inputs that Qwen2.5-VL expects
         inputs = CustomNamespace()
-        inputs.input_ids = tokens.to(images[0].attention_mask.dtype)
+        inputs.input_ids = tokens.to(images[0].attention_mask.dtype) if images[0] is not None else tokens
         inputs.attention_mask = torch.concat(
             [
                 torch.nn.functional.pad(im.attention_mask, (0, padded_seq_len - im.attention_mask.shape[-1]), value=0)
-                for im in images
+                if im is not None
+                else torch.ones_like(tokens[i : i + 1], dtype=tokens.dtype)
+                for i, im in enumerate(images)
             ],
             dim=0,
         )
-        if "pixel_values" in images[0]:
+        if images[0] is not None and "pixel_values" in images[0]:
             # we currently do not support mixed inputs of text-only users and text-image users; hence checking images[0] is enough
             inputs.pixel_values = torch.concat([im.pixel_values for im in images], dim=0)
             inputs.image_grid_thw = torch.concat([im.image_grid_thw for im in images], dim=0)
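The last hunk makes prefill tolerate users without images: a user whose entry in `images` is `None` contributes a row of ones (shaped like its padded token row) to the attention mask instead of a padded image-derived mask. Below is a standalone toy illustration of just that mask expression, with made-up shapes and a stand-in for the processed image inputs; per the code comment above, the real path does not mix text-only and image users in one batch, so the mixed toy batch here only exercises the expression itself.

```python
# Toy illustration of the mask construction in the hunk above; shapes and classes are made up.
import torch

padded_seq_len = 8
tokens = torch.ones((2, padded_seq_len), dtype=torch.long)  # two users, already padded to the same length


class FakeImageInputs:
    """Stand-in for a user's processed multi-modal inputs carrying an attention mask."""

    def __init__(self, seq_len: int):
        self.attention_mask = torch.ones((1, seq_len), dtype=torch.long)


images = [FakeImageInputs(5), None]  # user 0 has an image, user 1 is text-only

attention_mask = torch.concat(
    [
        torch.nn.functional.pad(im.attention_mask, (0, padded_seq_len - im.attention_mask.shape[-1]), value=0)
        if im is not None
        else torch.ones_like(tokens[i : i + 1], dtype=tokens.dtype)
        for i, im in enumerate(images)
    ],
    dim=0,
)
print(attention_mask)  # row 0: five 1s then zero padding; row 1: all ones
```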
