
Commit 749be00

[Core][Multimodal] Allow passing multi_modal_uuids as multimodal identifiers. (#23394)
Signed-off-by: Roger Wang <[email protected]>
1 parent 5b8077b commit 749be00

File tree

10 files changed: +455 -54 lines changed


docs/features/multimodal_inputs.md

Lines changed: 35 additions & 0 deletions
@@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
- `prompt`: The prompt should follow the format that is documented on HuggingFace.
- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].

### Stable UUIDs for Caching (multi_modal_uuids)

When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.

??? code

    ```python
    from vllm import LLM
    from PIL import Image

    # Qwen2.5-VL example with two images
    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")

    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
    img_a = Image.open("/path/to/a.jpg")
    img_b = Image.open("/path/to/b.jpg")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": [img_a, img_b]},
        # Provide stable IDs for caching.
        # Requirements (matched by this example):
        # - Include every modality present in multi_modal_data.
        # - For lists, provide the same number of entries.
        # - Use None to fall back to content hashing for that item.
        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
    })

    for o in outputs:
        print(o.outputs[0].text)
    ```

!!! warning
    If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
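The situation the warning describes can be sketched as follows, assuming the engine in your vLLM version exposes `mm_processor_cache_gb` and `enable_prefix_caching` as constructor arguments (the model name and image path are placeholders):

```python
from vllm import LLM
from PIL import Image

# With the multimodal processor cache sized to 0 GiB and prefix caching
# disabled, user-provided multi_modal_uuids are accepted but ignored;
# identifiers are derived internally instead.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=0,
    enable_prefix_caching=False,
)

outputs = llm.generate({
    "prompt": "USER: <image>\nDescribe the image.\nASSISTANT:",
    "multi_modal_data": {"image": [Image.open("/path/to/a.jpg")]},
    # Accepted, but has no caching effect in this configuration.
    "multi_modal_uuids": {"image": ["sku-1234-a"]},
})
```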
### Image Inputs

You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,229 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.platforms.interface import UnspecifiedPlatform
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import processor as processor_mod
from vllm.v1.engine.processor import Processor

cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays


# Mock processor for testing
def _mk_processor(monkeypatch,
                  *,
                  mm_cache_gb: float = 4.0,
                  enable_prefix_caching: bool = True) -> Processor:
    """
    Create a Processor instance with minimal configuration suitable for unit
    tests without accessing external resources.
    """
    monkeypatch.setattr(ModelConfig,
                        "try_get_generation_config",
                        lambda self: {},
                        raising=True)
    monkeypatch.setattr(ModelConfig,
                        "__post_init__",
                        lambda self: None,
                        raising=True)
    monkeypatch.setattr(UnspecifiedPlatform,
                        "is_async_output_supported",
                        classmethod(lambda cls, enforce_eager: True),
                        raising=True)
    monkeypatch.setattr(
        ModelConfig,
        "verify_async_output_proc",
        lambda self, parallel_config, speculative_config, device_config: None,
        raising=True)
    monkeypatch.setattr(ModelConfig,
                        "verify_with_parallel_config",
                        lambda self, parallel_config: None,
                        raising=True)
    monkeypatch.setattr(processor_mod,
                        "processor_cache_from_config",
                        lambda vllm_config, mm_registry: None,
                        raising=True)

    monkeypatch.setattr(VllmConfig,
                        "__post_init__",
                        lambda self: None,
                        raising=True)

    model_config = ModelConfig(
        skip_tokenizer_init=True,
        max_model_len=128,
        mm_processor_cache_gb=mm_cache_gb,
        generation_config="vllm",
        tokenizer="dummy",
    )

    # Minimal multimodal_config to satisfy references in
    # Processor.process_inputs.
    class _MockMMConfig:

        def __init__(self, gb: float):
            self.mm_processor_cache_gb = gb

    model_config.multimodal_config = _MockMMConfig(
        mm_cache_gb)  # type: ignore[attr-defined]
    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
        device_config=DeviceConfig(device="cpu"),
    )

    # Pass tokenizer=None; InputPreprocessor handles None when
    # skip_tokenizer_init is True.
    return Processor(vllm_config, tokenizer=None)  # type: ignore[arg-type]


def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
    processor = _mk_processor(monkeypatch)

    prompt = {
        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image]
        },
        # Mismatch: 2 items but only 1 uuid provided
        "multi_modal_uuids": {
            "image": ["hash_cherry"]
        },
    }

    with pytest.raises(ValueError, match="must have same length as data"):
        processor.process_inputs(
            request_id="req-1",
            prompt=prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )


def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
    processor = _mk_processor(monkeypatch)

    prompt = {
        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
        # Two modalities provided in data
        "multi_modal_data": {
            "image": [cherry_pil_image],
            "video": [baby_reading_np_ndarrays]
        },
        # Only image uuids provided; video missing should raise
        "multi_modal_uuids": {
            "image": ["hash_cherry"]
        },
    }

    with pytest.raises(ValueError,
                       match="must be provided if multi_modal_data"):
        processor.process_inputs(
            request_id="req-2",
            prompt=prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )


@pytest.mark.parametrize(
    "mm_cache_gb, enable_prefix_caching",
    [
        (4.0, True),  # default behavior
        (4.0, False),  # prefix caching disabled
        (0.0, True),  # processor cache disabled
    ],
)
def test_multi_modal_uuids_accepts_none_and_passes_through(
        monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=mm_cache_gb,
                              enable_prefix_caching=enable_prefix_caching)

    # Capture the overrides passed to InputPreprocessor.preprocess
    captured: dict[str, object] = {}

    def fake_preprocess(prompt,
                        *,
                        tokenization_kwargs=None,
                        lora_request=None,
                        mm_hash_overrides=None):
        captured["mm_hash_overrides"] = mm_hash_overrides
        # Minimal processed inputs for decoder-only flow
        return {"type": "token", "prompt_token_ids": [1]}

    # Monkeypatch only the bound preprocess method on this instance
    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        fake_preprocess,
                        raising=True)

    # Use a consistent two-image scenario across all configurations
    mm_uuids = {"image": [None, "hash_stop"], "video": None}
    prompt = {
        "prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image],
            "video": baby_reading_np_ndarrays,
        },
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id="req-3",
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    assert captured["mm_hash_overrides"] == mm_uuids


def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    # When both processor cache is 0 and prefix caching disabled, the
    # processor builds overrides from request id instead of using user UUIDs.
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=0.0,
                              enable_prefix_caching=False)

    captured: dict[str, object] = {}

    def fake_preprocess(prompt,
                        *,
                        tokenization_kwargs=None,
                        lora_request=None,
                        mm_hash_overrides=None):
        captured["mm_hash_overrides"] = mm_hash_overrides
        return {"type": "token", "prompt_token_ids": [1]}

    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        fake_preprocess,
                        raising=True)

    request_id = "req-42"
    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
    prompt = {
        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image],
            "video": baby_reading_np_ndarrays,
        },
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id=request_id,
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    # Expect request-id-based overrides are passed through
    assert captured["mm_hash_overrides"] == {
        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
        "video": [f"{request_id}-video-0"],
    }
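The last test pins down the shape of the fallback identifiers used when both caches are off: one ID per item, derived from the request id, the modality, and the item index. A minimal sketch of that scheme (the helper name `request_id_overrides` is invented here for illustration; the real construction happens inside the processor):

```python
from typing import Any


def request_id_overrides(request_id: str,
                         mm_data: dict[str, Any]) -> dict[str, list[str]]:
    """Build per-item IDs of the form '<request_id>-<modality>-<index>'.

    Illustrative only: a single non-list item counts as one entry for its
    modality, matching the expected value asserted in the last test above.
    """
    overrides: dict[str, list[str]] = {}
    for modality, items in mm_data.items():
        num_items = len(items) if isinstance(items, list) else 1
        overrides[modality] = [
            f"{request_id}-{modality}-{i}" for i in range(num_items)
        ]
    return overrides


assert request_id_overrides("req-42", {
    "image": [object(), object()],
    "video": object(),
}) == {
    "image": ["req-42-image-0", "req-42-image-1"],
    "video": ["req-42-video-0"],
}
```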

vllm/entrypoints/openai/serving_engine.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import (  # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
-    MultiModalDataDict)
+    MultiModalDataDict, MultiModalUUIDDict)
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import BeamSearchParams, SamplingParams
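For orientation, the newly imported `MultiModalUUIDDict` maps each modality to either a single ID or a list of per-item IDs, with `None` meaning "hash this item's content". A rough sketch of values consistent with the usages in this commit (the alias below is illustrative; the real definition lives in `vllm.multimodal.inputs`):

```python
from typing import Optional, Union

# Illustrative shape only; see vllm.multimodal.inputs for the real alias.
MultiModalUUIDDictSketch = dict[str, Union[Optional[str], list[Optional[str]]]]

uuids: MultiModalUUIDDictSketch = {
    "image": ["sku-1234-a", None],  # one entry per image; None => content hash
    "video": "hash_video",          # a single item may be given a plain string
}
```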

vllm/inputs/data.py

Lines changed: 19 additions & 1 deletion
@@ -7,7 +7,8 @@
 from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar
 
 if TYPE_CHECKING:
-    from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs
+    from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalInputs,
+                                        MultiModalUUIDDict)
 
 
 class TextPrompt(TypedDict):
@@ -30,6 +31,15 @@ class TextPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
+    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
+    """
+    Optional user-specified UUIDs for multimodal items, mapped by modality.
+    Lists must match the number of items per modality and may contain `None`.
+    For `None` entries, the hasher will compute IDs automatically; non-None
+    entries override the default hashes for caching, and MUST be unique per
+    multimodal item.
+    """
+
     cache_salt: NotRequired[str]
     """
     Optional cache salt to be used for prefix caching.
@@ -59,6 +69,14 @@ class TokensPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
+    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
+    """
+    Optional user-specified UUIDs for multimodal items, mapped by modality.
+    Lists must match the number of items per modality and may contain `None`.
+    For `None` entries, the hasher will compute IDs automatically; non-None
+    entries override the default hashes for caching.
+    """
+
     cache_salt: NotRequired[str]
     """
     Optional cache salt to be used for prefix caching.
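As a usage note, the new field slots into the existing prompt TypedDicts alongside `multi_modal_data`. A small sketch with `TextPrompt` (the image paths and ID strings are placeholders):

```python
from PIL import Image

from vllm.inputs.data import TextPrompt

img_a = Image.open("/path/to/a.jpg")
img_b = Image.open("/path/to/b.jpg")

prompt = TextPrompt(
    prompt="USER: <image><image>\nCompare the two images.\nASSISTANT:",
    multi_modal_data={"image": [img_a, img_b]},
    # One entry per item; None falls back to content hashing for that item.
    multi_modal_uuids={"image": ["catalog-item-001", None]},
)
```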
