|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +"""Tests for phi4mm's multimodal preprocessing kwargs.""" |
| 3 | +import pytest |
| 4 | + |
| 5 | +from vllm.multimodal import MULTIMODAL_REGISTRY |
| 6 | + |
| 7 | +from ....conftest import _ImageAssets |
| 8 | +from ...utils import build_model_context |
| 9 | + |
| 10 | + |
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
# yapf: disable
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
    [
        ({"dynamic_hd": 4}, 1329),
        ({"dynamic_hd": 16}, 4433),
        # {} exercises the model default (dynamic_hd=36 for Phi-4-multimodal)
        ({}, 9585),
    ])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    image_assets: _ImageAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, int],
    expected_toks_per_img: int,
    num_imgs: int,
    kwargs_on_init: bool,
):
    """Ensure Phi4MMMultiModalProcessor handles ``dynamic_hd`` properly.

    The override is exercised both at processor-construction time and at
    ``apply()`` time; either way the prompt must contain exactly
    ``expected_toks_per_img`` image-placeholder tokens per image.
    """
    # Imported inside the test body to avoid initializing CUDA early.
    from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    # The kwargs go to exactly one place: the constructor above, or apply().
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # One numbered <|image_N|> placeholder per image (1-indexed).
    img_str = "".join(f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1))
    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"

    crop_size = (
        ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"])
    # 7x7 crops' worth of pixels so dynamic_hd actually kicks in.
    resized = image_assets[0].pil_image.resize((crop_size * 7, crop_size * 7))
    mm_data = {"image": [resized] * num_imgs}

    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)

    # Count placeholder tokens; must scale linearly with the image count.
    placeholder_count = processed_inputs["prompt_token_ids"].count(
        _IMAGE_PLACEHOLDER_TOKEN_ID)
    assert placeholder_count == expected_toks_per_img * num_imgs
0 commit comments