Commit 377d10b

[VLM][Bugfix] Pass processor kwargs properly on init (vllm-project#13516)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 52ce14d commit 377d10b
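
The title refers to `mm_processor_kwargs` supplied when the engine is built: per the tests below, overrides given at init should now behave the same as overrides given per request. A minimal sketch of the two delivery paths, assuming the standard `LLM` API and one of the H2OVL checkpoints named in the diffs (the prompt text and `image` variable in the commented-out path are illustrative only):

from vllm import LLM

# Path exercised by this fix: set the overrides once, at engine init.
llm = LLM(
    model="h2oai/h2ovl-mississippi-2b",
    trust_remote_code=True,
    mm_processor_kwargs={"max_dynamic_patch": 4},
)

# Alternative path: attach the same overrides to each request instead.
# outputs = llm.generate({
#     "prompt": "<image>\nDescribe the image.",
#     "multi_modal_data": {"image": image},
#     "mm_processor_kwargs": {"max_dynamic_patch": 4},
# })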

Large commits have some content hidden by default.

44 files changed: +675, -453 lines

examples/offline_inference/vision_language_multi_image.py

Lines changed: 1 addition & 0 deletions
@@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=8192,
         limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
     )

     placeholders = "\n".join(f"Image-{i}: <image>\n"
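
The example above now sets max_dynamic_patch once, when the engine is created, instead of attaching it to every request. A rough, self-contained sketch of how such an engine is queried with several images (file paths, prompt wording, and sampling settings are illustrative, not taken from the example script):

from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="h2oai/h2ovl-mississippi-2b",
    trust_remote_code=True,
    max_model_len=8192,
    limit_mm_per_prompt={"image": 2},
    mm_processor_kwargs={"max_dynamic_patch": 4},
)

# Two images per prompt, matching limit_mm_per_prompt above.
images = [Image.open("image-1.jpg"), Image.open("image-2.jpg")]

outputs = llm.generate(
    {
        "prompt": "Image-1: <image>\nImage-2: <image>\nDescribe the two images.",
        "multi_modal_data": {"image": images},
    },
    SamplingParams(temperature=0.0, max_tokens=128),
)
print(outputs[0].outputs[0].text)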

tests/models/multimodal/processing/test_common.py

Lines changed: 2 additions & 5 deletions
@@ -10,7 +10,7 @@
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.processing import ProcessingCache
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
@@ -42,10 +42,7 @@ def _test_processing_correctness(
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
     ctx = InputProcessingContext(
         model_config,
-        tokenizer=cached_get_tokenizer(
-            model_config.tokenizer,
-            trust_remote_code=model_info.trust_remote_code,
-        ),
+        tokenizer=cached_tokenizer_from_config(model_config),
     )
     # Ensure that it can fit all of the data
     cache = ProcessingCache(capacity=1 << 30)
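
The test files in this commit switch from cached_get_tokenizer to cached_tokenizer_from_config, which derives the tokenizer settings from the ModelConfig instead of having each call site repeat the tokenizer name and trust_remote_code. A rough, hypothetical re-implementation for illustration only (the real helper lives in vllm.transformers_utils.tokenizer, also caches its result, and may accept more options):

from vllm.config import ModelConfig
from vllm.transformers_utils.tokenizer import get_tokenizer


def tokenizer_from_config_sketch(model_config: ModelConfig):
    # Illustrative only: read the tokenizer fields off the model config so
    # callers no longer thread tokenizer name / trust_remote_code through.
    return get_tokenizer(
        model_config.tokenizer,
        tokenizer_mode=model_config.tokenizer_mode,
        trust_remote_code=model_config.trust_remote_code,
        revision=model_config.tokenizer_revision,
    )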
tests/models/multimodal/processing/test_h2ovl.py

Lines changed: 131 additions & 94 deletions
@@ -1,17 +1,118 @@
 # SPDX-License-Identifier: Apache-2.0
 """Tests for H2OVL's multimodal preprocessing kwargs."""
-from typing import Optional
+from typing import Mapping, Optional

 import pytest
+from PIL import Image
+from transformers import PretrainedConfig

 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

 from ....conftest import _ImageAssets
 from ...utils import build_model_context


+def _get_expected_num_patches(
+    config: PretrainedConfig,
+    image: Image.Image,
+    num_imgs: int,
+    min_num: int,
+    max_num: int,
+):
+    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
+                                                  get_h2ovl_target_ratios)
+
+    width, height = image.size
+
+    # Calculate the expected number of blocks
+    if num_imgs == 1 and config.use_msac:
+        # First pass
+        blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
+            orig_width=width,
+            orig_height=height,
+            target_ratios=get_h2ovl_target_ratios(
+                min_num=1,
+                max_num=max_num,
+                prior_aspect_ratio=None,
+            ),
+            image_size=config.vision_config.image_size,
+            use_thumbnail=False,  # Thumbnail is handled separately
+        )
+
+        # Second pass
+        blocks2, _, _, _ = calculate_h2ovl_targets(
+            orig_width=width,
+            orig_height=height,
+            target_ratios=get_h2ovl_target_ratios(
+                min_num=3,
+                max_num=max_num,
+                prior_aspect_ratio=aspect_ratio,
+            ),
+            image_size=config.vision_config.image_size,
+            use_thumbnail=False,
+        )
+
+        # Add thumbnail if use_thumbnail is True and total_blocks > 1
+        if config.use_thumbnail:
+            blocks1 += 1 if blocks1 > 1 else 0
+            blocks2 += 1 if blocks2 > 1 else 0
+
+        # Total blocks is the sum of blocks from both passes minus
+        # overlapping
+        total_blocks = blocks1 + blocks2 - 1
+
+        return total_blocks
+
+    blocks, _, _, _ = calculate_h2ovl_targets(
+        orig_width=width,
+        orig_height=height,
+        target_ratios=get_h2ovl_target_ratios(
+            min_num,
+            max_num,
+            prior_aspect_ratio=None,
+        ),
+        image_size=config.vision_config.image_size,
+        use_thumbnail=False,
+    )
+    expected_num_patches = blocks
+
+    if config.use_thumbnail and expected_num_patches > 1:
+        expected_num_patches += 1
+
+    return expected_num_patches
+
+
+def _run_check(
+    processor: BaseMultiModalProcessor,
+    images: list[Image.Image],
+    min_num: int,
+    max_num: int,
+    mm_processor_kwargs: Mapping[str, object],
+):
+    tokenizer = processor.info.get_tokenizer()
+    config = processor.info.get_hf_config()
+
+    mm_data = {"image": images}
+
+    total_expected_num_patches = sum(
+        _get_expected_num_patches(config, image, len(images), min_num, max_num)
+        for image in images)
+
+    processed_inputs = processor.apply("<image>" * len(images), mm_data,
+                                       mm_processor_kwargs)
+
+    # Ensure we have the right number of placeholders per num_crops size
+    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+
+    assert img_tok_count == 256 * total_expected_num_patches
+    assert pixel_shape[0] == total_expected_num_patches
+
+
 @pytest.mark.parametrize("model_id", [
     "h2oai/h2ovl-mississippi-800m",
     "h2oai/h2ovl-mississippi-2b",
@@ -25,118 +126,54 @@
         [1.0, 1.0, 1.0],
         # Multi-scale
         [0.25, 0.5, 1.0],
+        [4.0, 2.0, 1.0],
     ],
 )
-@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
+@pytest.mark.parametrize(
+    ("min_dynamic_patch", "max_dynamic_patch"),
+    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
+)
 @pytest.mark.parametrize("dynamic_image_size", [True, False])
-@pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
     model_id: str,
     image_assets: _ImageAssets,
     size_factors: list[int],
+    min_dynamic_patch: int,
     max_dynamic_patch: int,
     dynamic_image_size: Optional[bool],
-    num_imgs: int,
+    kwargs_on_init: bool,
 ):
-    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
-                                                  get_h2ovl_target_ratios)
+    mm_processor_kwargs = {
+        "min_dynamic_patch": min_dynamic_patch,
+        "max_dynamic_patch": max_dynamic_patch,
+        "dynamic_image_size": dynamic_image_size,
+    }

     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         trust_remote_code=True,
-        mm_processor_kwargs=None,
-        limit_mm_per_prompt={"image": num_imgs},
-    )
-    tokenizer = cached_get_tokenizer(
-        ctx.model_config.tokenizer,
-        trust_remote_code=ctx.model_config.trust_remote_code,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+        limit_mm_per_prompt={"image": len(size_factors)},
     )
+    tokenizer = cached_tokenizer_from_config(ctx.model_config)
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
         tokenizer=tokenizer,
     )
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

-    config = processor.info.get_hf_config()
-    use_msac = config.use_msac
-
-    mm_processor_kwargs = {
-        "max_dynamic_patch": max_dynamic_patch,
-    }
-    if dynamic_image_size is not None:
-        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
-
-    min_num = config.min_dynamic_patch
+    min_num = min_dynamic_patch if dynamic_image_size else 1
     max_num = max_dynamic_patch if dynamic_image_size else 1

-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-
-    for asset in image_assets:
-        for factor in size_factors:
-            image = rescale_image_size(asset.pil_image, factor)
-            mm_data = {"image": [image] * num_imgs}
-
-            width, height = image.size
-
-            # Calculate the expected number of blocks
-            if num_imgs == 1 and use_msac:
-                # First pass
-                blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
-                    orig_width=width,
-                    orig_height=height,
-                    target_ratios=get_h2ovl_target_ratios(
-                        min_num,
-                        max_num,
-                        prior_aspect_ratio=None,
-                    ),
-                    image_size=config.vision_config.image_size,
-                    use_thumbnail=False,  # Thumbnail is handled separately
-                )
-
-                # Second pass
-                blocks2, _, _, _ = calculate_h2ovl_targets(
-                    orig_width=width,
-                    orig_height=height,
-                    target_ratios=get_h2ovl_target_ratios(
-                        min_num,
-                        max_num,
-                        prior_aspect_ratio=aspect_ratio,
-                    ),
-                    image_size=config.vision_config.image_size,
-                    use_thumbnail=False,
-                )
-
-                # Add thumbnail if use_thumbnail is True and total_blocks > 1
-                if config.use_thumbnail:
-                    blocks1 += 1 if blocks1 > 1 else 0
-                    blocks2 += 1 if blocks2 > 1 else 0
-
-                # Total blocks is the sum of blocks from both passes minus
-                # overlapping
-                total_blocks = blocks1 + blocks2 - 1
-
-                expected_num_patches = total_blocks
-            else:
-                blocks, _, _, _ = calculate_h2ovl_targets(
-                    orig_width=width,
-                    orig_height=height,
-                    target_ratios=get_h2ovl_target_ratios(
-                        min_num,
-                        max_num,
-                        prior_aspect_ratio=None,
-                    ),
-                    image_size=config.vision_config.image_size,
-                    use_thumbnail=False,
-                )
-                expected_num_patches = blocks
-
-            if config.use_thumbnail and expected_num_patches != 1:
-                expected_num_patches += 1
-
-            processed_inputs = processor.apply(prompt, mm_data,
-                                               mm_processor_kwargs)
-            pixel_shape = (
-                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
-
-            assert pixel_shape[0] == expected_num_patches * num_imgs
+    _run_check(
+        processor,
+        [
+            rescale_image_size(image_assets[0].pil_image, f)
+            for f in size_factors
+        ],
+        min_num,
+        max_num,
+        hf_processor_mm_kwargs,
+    )
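
The new kwargs_on_init parametrization drives the same overrides down two routes: folded into the model config when the context is built, or handed to processor.apply per call, and _run_check then asserts both routes yield the same patch count (with 256 <IMG_CONTEXT> tokens per patch). A tiny illustrative helper, not part of the repo, that captures the branching the test relies on:

def pick_kwargs(kwargs_on_init: bool, mm_processor_kwargs: dict) -> tuple:
    """Illustrative only: split overrides between init-time and call-time."""
    # If the overrides went in at init, nothing extra is passed to apply();
    # otherwise the full dict travels with each call.
    init_kwargs = mm_processor_kwargs if kwargs_on_init else None
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
    return init_kwargs, hf_processor_mm_kwargs


assert pick_kwargs(True, {"max_dynamic_patch": 4}) == ({"max_dynamic_patch": 4}, {})
assert pick_kwargs(False, {"max_dynamic_patch": 4}) == (None, {"max_dynamic_patch": 4})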

tests/models/multimodal/processing/test_idefics3.py

Lines changed: 16 additions & 8 deletions
@@ -4,7 +4,7 @@
 from transformers import Idefics3Config

 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

 from ....conftest import _ImageAssets
 from ...utils import build_model_context
@@ -22,9 +22,15 @@
 ])
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_override(image_assets: _ImageAssets, model: str,
-                            mm_processor_kwargs: dict[str, object],
-                            expected_toks_per_img: int, num_imgs: int):
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+    image_assets: _ImageAssets,
+    model: str,
+    mm_processor_kwargs: dict[str, object],
+    expected_toks_per_img: int,
+    num_imgs: int,
+    kwargs_on_init: bool,
+):
     """Ensure input_processor_for_idefics3 handles num_crops properly."""
     # Same as the previous test - don't initialize mm_processor_kwargs
     # in this test and assume that the kwargs will be correctly expanded by
@@ -33,15 +39,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
         model_name=model,
         tokenizer_name=model,
         trust_remote_code=True,
-        mm_processor_kwargs=None,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    tokenizer = cached_tokenizer_from_config(ctx.model_config)
     processor = MULTIMODAL_REGISTRY.create_processor(
         ctx.model_config,
         tokenizer=tokenizer,
     )
-    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass
     placeholders = "<image>" if num_imgs == 1 else "\n".join(
@@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
     dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
     mm_data = {"image": [dummy_image] * num_imgs}

-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
     # Ensure the placeholders format are correct
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
     assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
         "input_ids"][0]
