|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
| 3 | + |
| 4 | +import pytest |
| 5 | + |
| 6 | +from vllm.assets.image import ImageAsset |
| 7 | +from vllm.assets.video import VideoAsset |
| 8 | +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig |
| 9 | +from vllm.platforms.interface import UnspecifiedPlatform |
| 10 | +from vllm.sampling_params import SamplingParams |
| 11 | +from vllm.v1.engine import processor as processor_mod |
| 12 | +from vllm.v1.engine.processor import Processor |
| 13 | + |
# Media fixtures shared by the tests below. Each asset is resolved once, at
# module import time, rather than per test.
cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
| 17 | + |
| 18 | + |
| 19 | +# Mock processor for testing |
def _mk_processor(monkeypatch,
                  *,
                  mm_cache_gb: float = 4.0,
                  enable_prefix_caching: bool = True) -> Processor:
    """
    Build a ``Processor`` on top of stubbed-out configuration objects.

    The config hooks listed below are replaced through ``monkeypatch`` so the
    instance can be constructed in a unit test without accessing external
    resources; pytest undoes the patches when the requesting test finishes.

    Args:
        monkeypatch: The pytest ``monkeypatch`` fixture of the calling test.
        mm_cache_gb: Value used for ``mm_processor_cache_gb`` on both the
            ``ModelConfig`` and the stand-in multimodal config.
        enable_prefix_caching: Forwarded to ``CacheConfig``.

    Returns:
        A ``Processor`` created with ``tokenizer=None``.
    """
    # (target, attribute, replacement) triples, applied in this exact order.
    stubs = (
        (ModelConfig, "try_get_generation_config", lambda self: {}),
        (ModelConfig, "__post_init__", lambda self: None),
        (UnspecifiedPlatform, "is_async_output_supported",
         classmethod(lambda cls, enforce_eager: True)),
        (ModelConfig, "verify_async_output_proc",
         lambda self, parallel_config, speculative_config, device_config:
         None),
        (ModelConfig, "verify_with_parallel_config",
         lambda self, parallel_config: None),
        (processor_mod, "processor_cache_from_config",
         lambda vllm_config, mm_registry: None),
        (VllmConfig, "__post_init__", lambda self: None),
    )
    for target, attr_name, replacement in stubs:
        # raising=True: fail loudly if a patched attribute no longer exists.
        monkeypatch.setattr(target, attr_name, replacement, raising=True)

    class _MockMMConfig:
        """Bare-bones multimodal config exposing only the cache size."""

        def __init__(self, gb: float):
            self.mm_processor_cache_gb = gb

    model_config = ModelConfig(
        skip_tokenizer_init=True,
        max_model_len=128,
        mm_processor_cache_gb=mm_cache_gb,
        generation_config="vllm",
        tokenizer="dummy",
    )
    # Processor.process_inputs reads multimodal_config, so attach a minimal
    # stand-in for it.
    model_config.multimodal_config = _MockMMConfig(
        mm_cache_gb)  # type: ignore[attr-defined]

    cache_config = CacheConfig(enable_prefix_caching=enable_prefix_caching)
    device_config = DeviceConfig(device="cpu")
    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        device_config=device_config,
    )

    # tokenizer=None is intentional: InputPreprocessor handles None when
    # skip_tokenizer_init is True.
    return Processor(vllm_config, tokenizer=None)  # type: ignore[arg-type]
| 85 | + |
| 86 | + |
def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
    """Fewer UUIDs than data items for a modality must raise ValueError."""
    processor = _mk_processor(monkeypatch)

    images = [cherry_pil_image, stop_pil_image]
    # Deliberately one entry short of the two images above.
    image_uuids = ["hash_cherry"]

    prompt = {
        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": images
        },
        "multi_modal_uuids": {
            "image": image_uuids
        },
    }

    expected_message = "must have same length as data"
    with pytest.raises(ValueError, match=expected_message):
        processor.process_inputs(
            request_id="req-1",
            prompt=prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )
| 107 | + |
| 108 | + |
def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
    """UUIDs covering only some of the data modalities must raise."""
    processor = _mk_processor(monkeypatch)

    # Data carries two modalities ...
    mm_data = {
        "image": [cherry_pil_image],
        "video": [baby_reading_np_ndarrays]
    }
    # ... but UUIDs are supplied for the image only; the absent video entry
    # is what should trigger the error.
    mm_uuids = {"image": ["hash_cherry"]}

    prompt = {
        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": mm_data,
        "multi_modal_uuids": mm_uuids,
    }

    expected_message = "must be provided if multi_modal_data"
    with pytest.raises(ValueError, match=expected_message):
        processor.process_inputs(
            request_id="req-2",
            prompt=prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )
| 132 | + |
| 133 | + |
@pytest.mark.parametrize(
    ("mm_cache_gb", "enable_prefix_caching"),
    [
        (4.0, True),  # default behavior
        (4.0, False),  # prefix caching disabled
        (0.0, True),  # processor cache disabled
    ],
)
def test_multi_modal_uuids_accepts_none_and_passes_through(
        monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
    """
    User UUIDs containing ``None`` entries reach ``preprocess`` unchanged.

    Checked for each parametrized cache configuration above.
    """
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=mm_cache_gb,
                              enable_prefix_caching=enable_prefix_caching)

    # Filled in by the stand-in below with the overrides handed to
    # InputPreprocessor.preprocess.
    captured: dict[str, object] = {}

    def recording_preprocess(prompt,
                             *,
                             tokenization_kwargs=None,
                             lora_request=None,
                             mm_hash_overrides=None):
        captured["mm_hash_overrides"] = mm_hash_overrides
        # Smallest processed-input payload for the decoder-only flow.
        return {"type": "token", "prompt_token_ids": [1]}

    # Patch only this instance's preprocessor, not the class.
    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        recording_preprocess,
                        raising=True)

    # Same payload for every configuration: two images (the first without a
    # UUID) plus one video whose UUID entry is None.
    mm_uuids = {"image": [None, "hash_stop"], "video": None}
    mm_data = {
        "image": [cherry_pil_image, stop_pil_image],
        "video": baby_reading_np_ndarrays,
    }
    prompt = {
        "prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
        "multi_modal_data": mm_data,
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id="req-3",
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    assert captured["mm_hash_overrides"] == mm_uuids
| 184 | + |
| 185 | + |
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    """
    With the processor cache at 0 and prefix caching off, overrides derived
    from the request id are expected instead of the user-supplied UUIDs.
    """
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=0.0,
                              enable_prefix_caching=False)

    # Filled in by the stand-in below with what preprocess received.
    captured: dict[str, object] = {}

    def recording_preprocess(prompt,
                             *,
                             tokenization_kwargs=None,
                             lora_request=None,
                             mm_hash_overrides=None):
        captured["mm_hash_overrides"] = mm_hash_overrides
        return {"type": "token", "prompt_token_ids": [1]}

    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        recording_preprocess,
                        raising=True)

    request_id = "req-42"
    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
    mm_data = {
        "image": [cherry_pil_image, stop_pil_image],
        "video": baby_reading_np_ndarrays,
    }
    prompt = {
        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": mm_data,
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id=request_id,
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    # Overrides are keyed by request id, modality and item index; none of the
    # user-provided hashes should appear.
    expected_overrides = {
        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
        "video": [f"{request_id}-video-0"],
    }
    assert captured["mm_hash_overrides"] == expected_overrides
0 commit comments