 
 
 from types import SimpleNamespace
+from typing import Mapping, Optional
 
 import torch
 from loguru import logger
 from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VLForConditionalGeneration as Ref_Qwen2_5_VLForConditionalGeneration,
 )
-from vllm.inputs import INPUT_REGISTRY
 from vllm.model_executor.models.interfaces import SupportsMultiModal
+from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VLProcessingInfo
+from vllm.multimodal import MULTIMODAL_REGISTRY
 
 import ttnn
 from models.demos.qwen25_vl.tt.common import merge_vision_tokens, multimodal_rope_from_hf, preprocess_inputs_prefill
 from models.demos.qwen25_vl.tt.generator import Generator as QwenVLGenerator
 from models.demos.qwen25_vl.tt.model import DropInVisionTransformer, Transformer
 from models.demos.qwen25_vl.tt.model_config import VisionModelArgs
-from models.tt_transformers.tt.generator_vllm import input_processor_for_multimodal
+from models.tt_transformers.tt.generator_vllm import DummyInputsBuilder, MultiModalProcessor
 from models.tt_transformers.tt.model_config import DecodersPrecision, ModelArgs
 
 
@@ -89,7 +91,15 @@ def __contains__(self, key):
         return key in self.__dict__
 
 
-@INPUT_REGISTRY.register_input_processor(input_processor_for_multimodal)
+class TT_Qwen2_5_VLProcessingInfo(Qwen2_5_VLProcessingInfo):
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1, "video": 0}  # [INFO] videos are not supported yet, only supporting 1 image for now
+
+
+# TODO: Eventually replace MultiModalProcessor with vllm.model_executor.models.qwen2_5_vl::Qwen2_5_VLMultiModalProcessor
+@MULTIMODAL_REGISTRY.register_processor(
+    MultiModalProcessor, info=TT_Qwen2_5_VLProcessingInfo, dummy_inputs=DummyInputsBuilder
+)
 class Qwen2_5_VLForConditionalGeneration(QwenVLGenerator, SupportsMultiModal):
     def __init__(self, *args, **kwargs):
         self.reference_model = kwargs.pop("reference_model", None)
@@ -167,15 +177,17 @@ def prefill_forward(
 
         # reconstruct the inputs that Qwen2.5-VL expects
         inputs = CustomNamespace()
-        inputs.input_ids = tokens.to(images[0].attention_mask.dtype)
+        inputs.input_ids = tokens.to(images[0].attention_mask.dtype) if images[0] is not None else tokens
         inputs.attention_mask = torch.concat(
             [
                 torch.nn.functional.pad(im.attention_mask, (0, padded_seq_len - im.attention_mask.shape[-1]), value=0)
-                for im in images
+                if im is not None
+                else torch.ones_like(tokens[i : i + 1], dtype=tokens.dtype)
+                for i, im in enumerate(images)
             ],
             dim=0,
         )
-        if "pixel_values" in images[0]:
+        if images[0] is not None and "pixel_values" in images[0]:
             # we currently do not support mixed inputs of text-only users and text-image users; hence checking images[0] is enough
             inputs.pixel_values = torch.concat([im.pixel_values for im in images], dim=0)
             inputs.image_grid_thw = torch.concat([im.image_grid_thw for im in images], dim=0)