|
1 | 1 | # SPDX-License-Identifier: Apache-2.0 |
2 | 2 | """Tests for H2OVL's multimodal preprocessing kwargs.""" |
3 | | -from typing import Optional |
| 3 | +from typing import Mapping, Optional |
4 | 4 |
|
5 | 5 | import pytest |
| 6 | +from PIL import Image |
| 7 | +from transformers import PretrainedConfig |
6 | 8 |
|
7 | 9 | from vllm.multimodal import MULTIMODAL_REGISTRY |
8 | 10 | from vllm.multimodal.image import rescale_image_size |
9 | | -from vllm.multimodal.utils import cached_get_tokenizer |
| 11 | +from vllm.multimodal.processing import BaseMultiModalProcessor |
| 12 | +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config |
10 | 13 |
|
11 | 14 | from ....conftest import _ImageAssets |
12 | 15 | from ...utils import build_model_context |
13 | 16 |
|
14 | 17 |
|
| 18 | +def _get_expected_num_patches( |
| 19 | + config: PretrainedConfig, |
| 20 | + image: Image.Image, |
| 21 | + num_imgs: int, |
| 22 | + min_num: int, |
| 23 | + max_num: int, |
| 24 | +): |
| 25 | + from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, |
| 26 | + get_h2ovl_target_ratios) |
| 27 | + |
| 28 | + width, height = image.size |
| 29 | + |
| 30 | + # Calculate the expected number of blocks |
| 31 | + if num_imgs == 1 and config.use_msac: |
| 32 | + # First pass |
| 33 | + blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( |
| 34 | + orig_width=width, |
| 35 | + orig_height=height, |
| 36 | + target_ratios=get_h2ovl_target_ratios( |
| 37 | + min_num=1, |
| 38 | + max_num=max_num, |
| 39 | + prior_aspect_ratio=None, |
| 40 | + ), |
| 41 | + image_size=config.vision_config.image_size, |
| 42 | + use_thumbnail=False, # Thumbnail is handled separately |
| 43 | + ) |
| 44 | + |
| 45 | + # Second pass |
| 46 | + blocks2, _, _, _ = calculate_h2ovl_targets( |
| 47 | + orig_width=width, |
| 48 | + orig_height=height, |
| 49 | + target_ratios=get_h2ovl_target_ratios( |
| 50 | + min_num=3, |
| 51 | + max_num=max_num, |
| 52 | + prior_aspect_ratio=aspect_ratio, |
| 53 | + ), |
| 54 | + image_size=config.vision_config.image_size, |
| 55 | + use_thumbnail=False, |
| 56 | + ) |
| 57 | + |
| 58 | + # If use_thumbnail is True, add a thumbnail to each pass with > 1 block |
| 59 | + if config.use_thumbnail: |
| 60 | + blocks1 += 1 if blocks1 > 1 else 0 |
| 61 | + blocks2 += 1 if blocks2 > 1 else 0 |
| 62 | + |
| 63 | + # Total blocks is the sum of blocks from both passes, minus |
| 64 | + # one for the block that overlaps between them |
| 65 | + total_blocks = blocks1 + blocks2 - 1 |
| 66 | + |
| 67 | + return total_blocks |
| 68 | + |
| 69 | + blocks, _, _, _ = calculate_h2ovl_targets( |
| 70 | + orig_width=width, |
| 71 | + orig_height=height, |
| 72 | + target_ratios=get_h2ovl_target_ratios( |
| 73 | + min_num, |
| 74 | + max_num, |
| 75 | + prior_aspect_ratio=None, |
| 76 | + ), |
| 77 | + image_size=config.vision_config.image_size, |
| 78 | + use_thumbnail=False, |
| 79 | + ) |
| 80 | + expected_num_patches = blocks |
| 81 | + |
| 82 | + if config.use_thumbnail and expected_num_patches > 1: |
| 83 | + expected_num_patches += 1 |
| 84 | + |
| 85 | + return expected_num_patches |
| 86 | + |
| 87 | + |
| 88 | +def _run_check( |
| 89 | + processor: BaseMultiModalProcessor, |
| 90 | + images: list[Image.Image], |
| 91 | + min_num: int, |
| 92 | + max_num: int, |
| 93 | + mm_processor_kwargs: Mapping[str, object], |
| 94 | +): |
| 95 | + tokenizer = processor.info.get_tokenizer() |
| 96 | + config = processor.info.get_hf_config() |
| 97 | + |
| 98 | + mm_data = {"image": images} |
| 99 | + |
| 100 | + total_expected_num_patches = sum( |
| 101 | + _get_expected_num_patches(config, image, len(images), min_num, max_num) |
| 102 | + for image in images) |
| 103 | + |
| 104 | + processed_inputs = processor.apply("<image>" * len(images), mm_data, |
| 105 | + mm_processor_kwargs) |
| 106 | + |
| 107 | + # Ensure we have the right number of placeholders (256 per expected patch) |
| 108 | + image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>") |
| 109 | + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) |
| 110 | + pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape |
| 111 | + |
| 112 | + assert img_tok_count == 256 * total_expected_num_patches |
| 113 | + assert pixel_shape[0] == total_expected_num_patches |
| 114 | + |
| 115 | + |
15 | 116 | @pytest.mark.parametrize("model_id", [ |
16 | 117 | "h2oai/h2ovl-mississippi-800m", |
17 | 118 | "h2oai/h2ovl-mississippi-2b", |
|
25 | 126 | [1.0, 1.0, 1.0], |
26 | 127 | # Multi-scale |
27 | 128 | [0.25, 0.5, 1.0], |
| 129 | + [4.0, 2.0, 1.0], |
28 | 130 | ], |
29 | 131 | ) |
30 | | -@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8]) |
| 132 | +@pytest.mark.parametrize( |
| 133 | + ("min_dynamic_patch", "max_dynamic_patch"), |
| 134 | + [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)], |
| 135 | +) |
31 | 136 | @pytest.mark.parametrize("dynamic_image_size", [True, False]) |
32 | | -@pytest.mark.parametrize("num_imgs", [1, 2]) |
| 137 | +@pytest.mark.parametrize("kwargs_on_init", [True, False]) |
33 | 138 | def test_processor_override( |
34 | 139 | model_id: str, |
35 | 140 | image_assets: _ImageAssets, |
36 | 141 | size_factors: list[int], |
| 142 | + min_dynamic_patch: int, |
37 | 143 | max_dynamic_patch: int, |
38 | 144 | dynamic_image_size: Optional[bool], |
39 | | - num_imgs: int, |
| 145 | + kwargs_on_init: bool, |
40 | 146 | ): |
41 | | - from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, |
42 | | - get_h2ovl_target_ratios) |
| 147 | + mm_processor_kwargs = { |
| 148 | + "min_dynamic_patch": min_dynamic_patch, |
| 149 | + "max_dynamic_patch": max_dynamic_patch, |
| 150 | + "dynamic_image_size": dynamic_image_size, |
| 151 | + } |
43 | 152 |
|
44 | 153 | ctx = build_model_context( |
45 | 154 | model_name=model_id, |
46 | 155 | tokenizer_name=model_id, |
47 | 156 | trust_remote_code=True, |
48 | | - mm_processor_kwargs=None, |
49 | | - limit_mm_per_prompt={"image": num_imgs}, |
50 | | - ) |
51 | | - tokenizer = cached_get_tokenizer( |
52 | | - ctx.model_config.tokenizer, |
53 | | - trust_remote_code=ctx.model_config.trust_remote_code, |
| 157 | + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, |
| 158 | + limit_mm_per_prompt={"image": len(size_factors)}, |
54 | 159 | ) |
| 160 | + tokenizer = cached_tokenizer_from_config(ctx.model_config) |
55 | 161 | processor = MULTIMODAL_REGISTRY.create_processor( |
56 | 162 | ctx.model_config, |
57 | 163 | tokenizer=tokenizer, |
58 | 164 | ) |
| 165 | + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs |
59 | 166 |
|
60 | | - config = processor.info.get_hf_config() |
61 | | - use_msac = config.use_msac |
62 | | - |
63 | | - mm_processor_kwargs = { |
64 | | - "max_dynamic_patch": max_dynamic_patch, |
65 | | - } |
66 | | - if dynamic_image_size is not None: |
67 | | - mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size |
68 | | - |
69 | | - min_num = config.min_dynamic_patch |
| 167 | + min_num = min_dynamic_patch if dynamic_image_size else 1 |
70 | 168 | max_num = max_dynamic_patch if dynamic_image_size else 1 |
71 | 169 |
|
72 | | - # Build the image str / prompt based on the number of images we pass |
73 | | - prompt = "<image>" * num_imgs |
74 | | - |
75 | | - for asset in image_assets: |
76 | | - for factor in size_factors: |
77 | | - image = rescale_image_size(asset.pil_image, factor) |
78 | | - mm_data = {"image": [image] * num_imgs} |
79 | | - |
80 | | - width, height = image.size |
81 | | - |
82 | | - # Calculate the expected number of blocks |
83 | | - if num_imgs == 1 and use_msac: |
84 | | - # First pass |
85 | | - blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( |
86 | | - orig_width=width, |
87 | | - orig_height=height, |
88 | | - target_ratios=get_h2ovl_target_ratios( |
89 | | - min_num, |
90 | | - max_num, |
91 | | - prior_aspect_ratio=None, |
92 | | - ), |
93 | | - image_size=config.vision_config.image_size, |
94 | | - use_thumbnail=False, # Thumbnail is handled separately |
95 | | - ) |
96 | | - |
97 | | - # Second pass |
98 | | - blocks2, _, _, _ = calculate_h2ovl_targets( |
99 | | - orig_width=width, |
100 | | - orig_height=height, |
101 | | - target_ratios=get_h2ovl_target_ratios( |
102 | | - min_num, |
103 | | - max_num, |
104 | | - prior_aspect_ratio=aspect_ratio, |
105 | | - ), |
106 | | - image_size=config.vision_config.image_size, |
107 | | - use_thumbnail=False, |
108 | | - ) |
109 | | - |
110 | | - # Add thumbnail if use_thumbnail is True and total_blocks > 1 |
111 | | - if config.use_thumbnail: |
112 | | - blocks1 += 1 if blocks1 > 1 else 0 |
113 | | - blocks2 += 1 if blocks2 > 1 else 0 |
114 | | - |
115 | | - # Total blocks is the sum of blocks from both passes minus |
116 | | - # overlapping |
117 | | - total_blocks = blocks1 + blocks2 - 1 |
118 | | - |
119 | | - expected_num_patches = total_blocks |
120 | | - else: |
121 | | - blocks, _, _, _ = calculate_h2ovl_targets( |
122 | | - orig_width=width, |
123 | | - orig_height=height, |
124 | | - target_ratios=get_h2ovl_target_ratios( |
125 | | - min_num, |
126 | | - max_num, |
127 | | - prior_aspect_ratio=None, |
128 | | - ), |
129 | | - image_size=config.vision_config.image_size, |
130 | | - use_thumbnail=False, |
131 | | - ) |
132 | | - expected_num_patches = blocks |
133 | | - |
134 | | - if config.use_thumbnail and expected_num_patches != 1: |
135 | | - expected_num_patches += 1 |
136 | | - |
137 | | - processed_inputs = processor.apply(prompt, mm_data, |
138 | | - mm_processor_kwargs) |
139 | | - pixel_shape = ( |
140 | | - processed_inputs["mm_kwargs"]["pixel_values_flat"].shape) |
141 | | - |
142 | | - assert pixel_shape[0] == expected_num_patches * num_imgs |
| 170 | + _run_check( |
| 171 | + processor, |
| 172 | + [ |
| 173 | + rescale_image_size(image_assets[0].pil_image, f) |
| 174 | + for f in size_factors |
| 175 | + ], |
| 176 | + min_num, |
| 177 | + max_num, |
| 178 | + hf_processor_mm_kwargs, |
| 179 | + ) |
0 commit comments