Commit 83f3c3b
[Model] Refactor Phi-4-multimodal to use merged processor and support V1 (#15477)
Signed-off-by: Isotr0py <[email protected]>
Signed-off-by: DarkLight1337 <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
1 parent d9737ca commit 83f3c3b

File tree: 15 files changed, +792 −1220 lines

docs/source/models/supported_models.md

Lines changed: 1 addition & 1 deletion
@@ -1004,7 +1004,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `microsoft/Phi-4-multimodal-instruct`, etc.
   * ✅︎
   *
-  *
+  * ✅︎
 - * `PixtralForConditionalGeneration`
   * Pixtral
   * T + I<sup>+</sup>

examples/offline_inference/audio_language.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=4096,
+        max_model_len=12800,
         max_num_seqs=2,
         enable_lora=True,
         max_lora_rank=320,

examples/offline_inference/vision_language.py

Lines changed: 4 additions & 1 deletion
@@ -814,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=4096,
+        max_model_len=5120,
         max_num_seqs=2,
+        max_num_batched_tokens=12800,
         enable_lora=True,
         max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 16},
         limit_mm_per_prompt={"image": 1},
     )
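The inline note added above says that mm_processor_kwargs can also be supplied per request rather than only in EngineArgs. A minimal sketch of what that could look like (not part of this commit; the image asset, prompt text, and sampling settings are illustrative, and it assumes the prompt dict accepts a "mm_processor_kwargs" key as in current vLLM):

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Illustrative only: override dynamic_hd for a single request instead of
    # setting it at engine construction time.
    llm = LLM(model="microsoft/Phi-4-multimodal-instruct",
              trust_remote_code=True,
              max_model_len=5120)
    image = ImageAsset("cherry_blossom").pil_image
    outputs = llm.generate(
        {
            "prompt": "<|user|>\n<|image_1|>\nDescribe the image.<|end|>\n<|assistant|>\n",
            "multi_modal_data": {"image": image},
            "mm_processor_kwargs": {"dynamic_hd": 16},
        },
        SamplingParams(max_tokens=64),
    )
    print(outputs[0].outputs[0].text)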

examples/offline_inference/vision_language_multi_image.py

Lines changed: 3 additions & 1 deletion
@@ -503,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=10000,
+        max_model_len=4096,
         max_num_seqs=2,
         limit_mm_per_prompt={"image": len(image_urls)},
         enable_lora=True,
         max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 4},
     )
 
     placeholders = "".join(f"<|image_{i}|>"
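For reference, a small sketch (not from the diff) of what the truncated placeholder construction above evaluates to; the image count of 3 and the exact loop form are illustrative:

    # Illustration only: one numbered placeholder per input image URL.
    placeholders = "".join(f"<|image_{i}|>" for i in range(1, 3 + 1))
    # -> "<|image_1|><|image_2|><|image_3|>"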

requirements/docs.txt

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ transformers
 mistral_common >= 1.5.4
 aiohttp
 starlette
+scipy
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

tests/models/decoder_only/audio_language/test_ultravox.py

Lines changed: 18 additions & 9 deletions
@@ -1,14 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-from typing import Optional
+from typing import Any, Optional
 
 import numpy as np
 import pytest
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer
 
-from vllm.multimodal.audio import resample_audio
+from vllm.multimodal.audio import resample_audio_librosa
 from vllm.sequence import SampleLogprobs
 
 from ....conftest import HfRunner, VllmRunner
@@ -43,6 +43,18 @@ def audio(request):
     return AudioAsset(request.param)
 
 
+def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
+    """Convert kwargs to CLI args."""
+    args = []
+    for key, value in params_kwargs.items():
+        if isinstance(value, bool):
+            if value:
+                args.append(f"--{key.replace('_','-')}")
+        else:
+            args.append(f"--{key.replace('_','-')}={value}")
+    return args
+
+
 @pytest.fixture(params=[
     pytest.param({}, marks=pytest.mark.cpu_model),
     pytest.param(CHUNKED_PREFILL_KWARGS),
@@ -52,10 +64,7 @@ def server(request, audio_assets):
         "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
         "--limit-mm-per-prompt",
         json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
-    ] + [
-        f"--{key.replace('_','-')}={value}"
-        for key, value in request.param.items()
-    ]
+    ] + params_kwargs_to_cli_args(request.param)
 
     with RemoteOpenAIServer(MODEL_NAME,
                             args,
@@ -136,9 +145,9 @@ def run_test(
             [hf_prompt],
             max_tokens,
             num_logprobs=num_logprobs,
-            audios=[(resample_audio(audio[0],
-                                    orig_sr=audio[1],
-                                    target_sr=16000), 16000)])
+            audios=[(resample_audio_librosa(audio[0],
+                                            orig_sr=audio[1],
+                                            target_sr=16000), 16000)])
         for _, hf_prompt, audio in prompts_and_audios
     ]
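As a quick illustration (not in the diff) of what the new helper emits, assuming for the sake of example chunked-prefill-style kwargs that mix a boolean flag with a valued option:

    # Boolean True becomes a bare flag; other values become --key=value.
    params_kwargs_to_cli_args({"enable_chunked_prefill": True, "max_num_seqs": 2})
    # -> ["--enable-chunked-prefill", "--max-num-seqs=2"]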

tests/models/decoder_only/vision_language/test_phi4mm.py

Lines changed: 3 additions & 3 deletions
@@ -181,7 +181,7 @@ def patch_hf_processor(*args,
     ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [4096])
+@pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@@ -225,7 +225,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
     ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [25600])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@@ -258,7 +258,7 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
 
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
     "nvidia/NVLM-D-72B",
     "google/paligemma-3b-mix-224",
     "google/paligemma2-3b-ft-docci-448",
+    "microsoft/Phi-4-multimodal-instruct",
     "mistralai/Pixtral-12B-2409",
     "mistral-community/pixtral-12b",
     "Qwen/Qwen-VL-Chat",
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for phi4mm's multimodal preprocessing kwargs."""
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
+# yapf: disable
+@pytest.mark.parametrize(
+    ("mm_processor_kwargs", "expected_toks_per_img"),
+    [
+        ({"dynamic_hd": 4}, 1329),
+        ({"dynamic_hd": 16}, 4433),
+        # the default num_crops of phi-4-multimodal is 36
+        ({}, 9585),
+    ])
+# yapf: enable
+@pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+    image_assets: _ImageAssets,
+    model_id: str,
+    mm_processor_kwargs: dict[str, int],
+    expected_toks_per_img: int,
+    num_imgs: int,
+    kwargs_on_init: bool,
+):
+    """Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly."""
+    # Avoid initializing CUDA early
+    from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID
+
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+
+    # Build the image str / prompt based on the number of images we pass
+    img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
+    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
+
+    image_size = ctx.get_hf_config(
+    ).embd_layer["image_embd_layer"]["crop_size"]
+    dummy_image_size = (image_size * 7, image_size * 7)
+    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+    mm_data = {"image": [dummy_image] * num_imgs}
+
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Ensure we have the right number of placeholders per num_crops size
+    img_tok_count = processed_inputs["prompt_token_ids"].count(
+        _IMAGE_PLACEHOLDER_TOKEN_ID)
+    assert img_tok_count == expected_toks_per_img * num_imgs

vllm/entrypoints/chat_utils.py

Lines changed: 2 additions & 5 deletions
@@ -482,11 +482,8 @@ def _placeholder_str(self, modality: ModalityStr,
         if modality in ("image", "image_embeds"):
             if model_type == "chatglm":
                 return "<|begin_of_image|><|endoftext|><|end_of_image|>"
-            if model_type == "phi3_v":
-                # Workaround since this token is not defined in the tokenizer
+            if model_type in ("phi3_v", "phi4mm"):
                 return f"<|image_{current_count}|>"
-            if model_type == "phi4mm":
-                return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
             if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
@@ -522,7 +519,7 @@ def _placeholder_str(self, modality: ModalityStr,
         if model_type == "ultravox":
             return "<|audio|>"
         if model_type == "phi4mm":
-            return "<|endoftext11|>" # 200011 (see vocab.json in hf model)
+            return f"<|audio_{current_count}|>"
         if model_type in ("qwen2_audio", "qwen2_5_omni"):
             return (f"Audio {current_count}: "
                     f"<|audio_bos|><|AUDIO|><|audio_eos|>")
