Commit 83f3c3b
[Model] Refactor Phi-4-multimodal to use merged processor and support V1 (#15477)
Signed-off-by: Isotr0py <[email protected]>
Signed-off-by: DarkLight1337 <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
1 parent d9737ca commit 83f3c3b

File tree: 15 files changed, +792 −1220 lines

docs/source/models/supported_models.md

Lines changed: 1 addition & 1 deletion
@@ -1004,7 +1004,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `microsoft/Phi-4-multimodal-instruct`, etc.
   * ✅︎
   *
-  *
+  * ✅︎
 - * `PixtralForConditionalGeneration`
   * Pixtral
   * T + I<sup>+</sup>

examples/offline_inference/audio_language.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=4096,
+        max_model_len=12800,
         max_num_seqs=2,
         enable_lora=True,
         max_lora_rank=320,

examples/offline_inference/vision_language.py

Lines changed: 4 additions & 1 deletion
@@ -814,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=4096,
+        max_model_len=5120,
         max_num_seqs=2,
+        max_num_batched_tokens=12800,
         enable_lora=True,
         max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 16},
         limit_mm_per_prompt={"image": 1},
     )
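The inline note added above says that mm_processor_kwargs can also be supplied per request rather than only in EngineArgs. A minimal sketch of what that could look like (not part of this commit; the image asset, prompt text, and sampling settings are illustrative, and it assumes the prompt dict accepts a "mm_processor_kwargs" key as in current vLLM):

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Illustrative only: override dynamic_hd for a single request instead of
    # setting it at engine construction time.
    llm = LLM(model="microsoft/Phi-4-multimodal-instruct",
              trust_remote_code=True,
              max_model_len=5120)
    image = ImageAsset("cherry_blossom").pil_image
    outputs = llm.generate(
        {
            "prompt": "<|user|>\n<|image_1|>\nDescribe the image.<|end|>\n<|assistant|>\n",
            "multi_modal_data": {"image": image},
            "mm_processor_kwargs": {"dynamic_hd": 16},
        },
        SamplingParams(max_tokens=64),
    )
    print(outputs[0].outputs[0].text)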

examples/offline_inference/vision_language_multi_image.py

Lines changed: 3 additions & 1 deletion
@@ -503,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=10000,
+        max_model_len=4096,
         max_num_seqs=2,
         limit_mm_per_prompt={"image": len(image_urls)},
         enable_lora=True,
         max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 4},
     )
 
     placeholders = "".join(f"<|image_{i}|>"
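For reference, a small sketch (not from the diff) of what the truncated placeholder construction above evaluates to; the image count of 3 and the exact loop form are illustrative:

    # Illustration only: one numbered placeholder per input image URL.
    placeholders = "".join(f"<|image_{i}|>" for i in range(1, 3 + 1))
    # -> "<|image_1|><|image_2|><|image_3|>"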

requirements/docs.txt

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ transformers
 mistral_common >= 1.5.4
 aiohttp
 starlette
+scipy
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

tests/models/decoder_only/audio_language/test_ultravox.py

Lines changed: 18 additions & 9 deletions
@@ -1,14 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-from typing import Optional
+from typing import Any, Optional
 
 import numpy as np
 import pytest
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer
 
-from vllm.multimodal.audio import resample_audio
+from vllm.multimodal.audio import resample_audio_librosa
 from vllm.sequence import SampleLogprobs
 
 from ....conftest import HfRunner, VllmRunner
@@ -43,6 +43,18 @@ def audio(request):
     return AudioAsset(request.param)
 
 
+def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
+    """Convert kwargs to CLI args."""
+    args = []
+    for key, value in params_kwargs.items():
+        if isinstance(value, bool):
+            if value:
+                args.append(f"--{key.replace('_','-')}")
+        else:
+            args.append(f"--{key.replace('_','-')}={value}")
+    return args
+
+
 @pytest.fixture(params=[
     pytest.param({}, marks=pytest.mark.cpu_model),
     pytest.param(CHUNKED_PREFILL_KWARGS),
@@ -52,10 +64,7 @@ def server(request, audio_assets):
         "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
         "--limit-mm-per-prompt",
         json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
-    ] + [
-        f"--{key.replace('_','-')}={value}"
-        for key, value in request.param.items()
-    ]
+    ] + params_kwargs_to_cli_args(request.param)
 
     with RemoteOpenAIServer(MODEL_NAME,
                             args,
@@ -136,9 +145,9 @@ def run_test(
             [hf_prompt],
             max_tokens,
             num_logprobs=num_logprobs,
-            audios=[(resample_audio(audio[0],
-                                    orig_sr=audio[1],
-                                    target_sr=16000), 16000)])
+            audios=[(resample_audio_librosa(audio[0],
+                                            orig_sr=audio[1],
+                                            target_sr=16000), 16000)])
         for _, hf_prompt, audio in prompts_and_audios
     ]
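As a quick illustration (not in the diff) of what the new helper emits, assuming for the sake of example chunked-prefill-style kwargs that mix a boolean flag with a valued option:

    # Boolean True becomes a bare flag; other values become --key=value.
    params_kwargs_to_cli_args({"enable_chunked_prefill": True, "max_num_seqs": 2})
    # -> ["--enable-chunked-prefill", "--max-num-seqs=2"]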

tests/models/decoder_only/vision_language/test_phi4mm.py

Lines changed: 3 additions & 3 deletions
@@ -181,7 +181,7 @@ def patch_hf_processor(*args,
     ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [4096])
+@pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@@ -225,7 +225,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
     ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [25600])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@@ -258,7 +258,7 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
 
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
     "nvidia/NVLM-D-72B",
     "google/paligemma-3b-mix-224",
     "google/paligemma2-3b-ft-docci-448",
+    "microsoft/Phi-4-multimodal-instruct",
     "mistralai/Pixtral-12B-2409",
     "mistral-community/pixtral-12b",
     "Qwen/Qwen-VL-Chat",
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for phi4mm's multimodal preprocessing kwargs."""
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
+# yapf: disable
+@pytest.mark.parametrize(
+    ("mm_processor_kwargs", "expected_toks_per_img"),
+    [
+        ({"dynamic_hd": 4}, 1329),
+        ({"dynamic_hd": 16}, 4433),
+        # the default num_crops of phi-4-multimodal is 36
+        ({}, 9585),
+    ])
+# yapf: enable
+@pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+    image_assets: _ImageAssets,
+    model_id: str,
+    mm_processor_kwargs: dict[str, int],
+    expected_toks_per_img: int,
+    num_imgs: int,
+    kwargs_on_init: bool,
+):
+    """Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly."""
+    # Avoid initializing CUDA early
+    from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID
+
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+
+    # Build the image str / prompt based on the number of images we pass
+    img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
+    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
+
+    image_size = ctx.get_hf_config(
+    ).embd_layer["image_embd_layer"]["crop_size"]
+    dummy_image_size = (image_size * 7, image_size * 7)
+    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+    mm_data = {"image": [dummy_image] * num_imgs}
+
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Ensure we have the right number of placeholders per num_crops size
+    img_tok_count = processed_inputs["prompt_token_ids"].count(
+        _IMAGE_PLACEHOLDER_TOKEN_ID)
+    assert img_tok_count == expected_toks_per_img * num_imgs

vllm/entrypoints/chat_utils.py

Lines changed: 2 additions & 5 deletions
@@ -482,11 +482,8 @@ def _placeholder_str(self, modality: ModalityStr,
         if modality in ("image", "image_embeds"):
             if model_type == "chatglm":
                 return "<|begin_of_image|><|endoftext|><|end_of_image|>"
-            if model_type == "phi3_v":
-                # Workaround since this token is not defined in the tokenizer
+            if model_type in ("phi3_v", "phi4mm"):
                 return f"<|image_{current_count}|>"
-            if model_type == "phi4mm":
-                return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
             if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
@@ -522,7 +519,7 @@ def _placeholder_str(self, modality: ModalityStr,
         if model_type == "ultravox":
             return "<|audio|>"
         if model_type == "phi4mm":
-            return "<|endoftext11|>" # 200011 (see vocab.json in hf model)
+            return f"<|audio_{current_count}|>"
         if model_type in ("qwen2_audio", "qwen2_5_omni"):
             return (f"Audio {current_count}: "
                     f"<|audio_bos|><|AUDIO|><|audio_eos|>")
