14 | 14 |
15 | 15 | pytestmark = pytest.mark.vlm
16 | 16 |
   | 17 | +
   | 18 | +class NestedInputs(UserDict):
   | 19 | +
   | 20 | +    def __init__(self, model_inputs: BatchFeature):
   | 21 | +        super().__init__({"model_inputs": model_inputs})
   | 22 | +
   | 23 | +        self.model_inputs = model_inputs
   | 24 | +
   | 25 | +    def to(self, device: torch.types.Device):
   | 26 | +        return NestedInputs(self.model_inputs.to(device))
   | 27 | +
   | 28 | +
17 | 29 | # The image token is placed before "user" on purpose so that the test can pass
18 | 30 | HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
19 | 31 |     "stop_sign":
23 | 35 |     "cherry_blossom":
24 | 36 |     "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
25 | 37 |     "(<image>./</image>)\nWhat is the season?<|eot_id|>" \
26 |    | -    "<|start_header_id|>assistant<|end_header_id|>\n\n"
   | 38 | +    "<|start_header_id|>assistant<|end_header_id|>\n\n",
27 | 39 | })
28 | 40 |
29 | 41 | models = ["openbmb/MiniCPM-Llama3-V-2_5"]
@@ -94,22 +106,10 @@ def run_test(
 94 | 106 |     ]
 95 | 107 |
 96 | 108 |     with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
 97 |     | -
 98 |     | -        class NestedInputs(UserDict):
 99 |     | -
100 |     | -            def __init__(self, model_inputs: BatchFeature):
101 |     | -                super().__init__({"model_inputs": model_inputs})
102 |     | -
103 |     | -                self.model_inputs = model_inputs
104 |     | -
105 |     | -            def to(self, device: torch.types.Device):
106 |     | -                return NestedInputs(self.model_inputs.to(device))
107 |     | -
108 | 109 |         hf_processor = hf_model.processor
109 | 110 |         hf_model.processor = lambda **kw: NestedInputs(
110 | 111 |             hf_processor(**kw)  # type: ignore
111 | 112 |         )
112 |     | -
113 | 113 |         hf_outputs_per_image = [
114 | 114 |             hf_model.generate_greedy_logprobs_limit(prompts,
115 | 115 |                                                     max_tokens,
@@ -161,3 +161,123 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
161 | 161 |         num_logprobs=num_logprobs,
162 | 162 |         tensor_parallel_size=1,
163 | 163 |     )
    | 164 | +
    | 165 | +
    | 166 | +HF_MULTIIMAGE_IMAGE_PROMPT = \
    | 167 | +    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
    | 168 | +    "(<image>./</image>)\n(<image>./</image>)\n" \
    | 169 | +    "Describe these images.<|eot_id|>" \
    | 170 | +    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    | 171 | +
    | 172 | +
    | 173 | +def run_multi_image_test(
    | 174 | +    hf_runner: Type[HfRunner],
    | 175 | +    vllm_runner: Type[VllmRunner],
    | 176 | +    image_assets: _ImageAssets,
    | 177 | +    model: str,
    | 178 | +    *,
    | 179 | +    size_factors: List[float],
    | 180 | +    dtype: str,
    | 181 | +    max_tokens: int,
    | 182 | +    num_logprobs: int,
    | 183 | +    tensor_parallel_size: int,
    | 184 | +    distributed_executor_backend: Optional[str] = None,
    | 185 | +):
    | 186 | +    """Inference result should be the same between hf and vllm.
    | 187 | +
    | 188 | +    All the image fixtures for the test are under tests/images.
    | 189 | +    For the huggingface runner, we provide the PIL images as input.
    | 190 | +    For the vllm runner, we provide MultiModalDataDict objects
    | 191 | +    and the corresponding vision language config as input.
    | 192 | +    Note, the text input is also adjusted to abide by the vllm contract.
    | 193 | +    The text output is sanitized so that it can be compared with hf.
    | 194 | +    """
    | 195 | +    images = [asset.pil_image for asset in image_assets]
    | 196 | +
    | 197 | +    inputs_per_case = [
    | 198 | +        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
    | 199 | +         [[rescale_image_size(image, factor) for image in images]
    | 200 | +          for factor in size_factors])
    | 201 | +    ]
    | 202 | +
    | 203 | +    # NOTE: take care of the order. run vLLM first, and then run HF.
    | 204 | +    # vLLM needs a fresh new process without cuda initialization.
    | 205 | +    # if we run HF first, the cuda initialization will be done and it
    | 206 | +    # will hurt multiprocessing backend with fork method (the default method).
    | 207 | +
    | 208 | +    # max_model_len should be greater than image_feature_size
    | 209 | +    with vllm_runner(model,
    | 210 | +                     max_model_len=4096,
    | 211 | +                     max_num_seqs=1,
    | 212 | +                     dtype=dtype,
    | 213 | +                     tensor_parallel_size=tensor_parallel_size,
    | 214 | +                     distributed_executor_backend=distributed_executor_backend,
    | 215 | +                     enforce_eager=True) as vllm_model:
    | 216 | +        tokenizer = vllm_model.model.get_tokenizer()
    | 217 | +        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    | 218 | +        vllm_outputs_per_case = [
    | 219 | +            vllm_model.generate_greedy_logprobs(prompts,
    | 220 | +                                                max_tokens,
    | 221 | +                                                num_logprobs=num_logprobs,
    | 222 | +                                                images=images,
    | 223 | +                                                stop_token_ids=stop_token_ids)
    | 224 | +            for prompts, images in inputs_per_case
    | 225 | +        ]
    | 226 | +
    | 227 | +    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
    | 228 | +        hf_processor = hf_model.processor
    | 229 | +        hf_model.processor = lambda **kw: NestedInputs(
    | 230 | +            hf_processor(**kw)  # type: ignore
    | 231 | +        )
    | 232 | +        hf_outputs_per_case = [
    | 233 | +            hf_model.generate_greedy_logprobs_limit(prompts,
    | 234 | +                                                    max_tokens,
    | 235 | +                                                    num_logprobs=num_logprobs,
    | 236 | +                                                    images=images,
    | 237 | +                                                    tokenizer=tokenizer)
    | 238 | +            for prompts, images in inputs_per_case
    | 239 | +        ]
    | 240 | +
    | 241 | +    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
    | 242 | +                                        vllm_outputs_per_case):
    | 243 | +        check_logprobs_close(
    | 244 | +            outputs_0_lst=[
    | 245 | +                trunc_hf_output(hf_output) for hf_output in hf_outputs
    | 246 | +            ],
    | 247 | +            outputs_1_lst=vllm_outputs,
    | 248 | +            name_0="hf",
    | 249 | +            name_1="vllm",
    | 250 | +        )
    | 251 | +
    | 252 | +
    | 253 | +@pytest.mark.parametrize("model", models)
    | 254 | +@pytest.mark.parametrize(
    | 255 | +    "size_factors",
    | 256 | +    [
    | 257 | +        # No image
    | 258 | +        [],
    | 259 | +        # Single-scale
    | 260 | +        [1.0],
    | 261 | +        # Single-scale, batched
    | 262 | +        [1.0, 1.0, 1.0],
    | 263 | +        # Multi-scale
    | 264 | +        [0.25, 0.5, 1.0],
    | 265 | +    ],
    | 266 | +)
    | 267 | +@pytest.mark.parametrize("dtype", [target_dtype])
    | 268 | +@pytest.mark.parametrize("max_tokens", [128])
    | 269 | +@pytest.mark.parametrize("num_logprobs", [5])
    | 270 | +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
    | 271 | +                             size_factors, dtype: str, max_tokens: int,
    | 272 | +                             num_logprobs: int) -> None:
    | 273 | +    run_multi_image_test(
    | 274 | +        hf_runner,
    | 275 | +        vllm_runner,
    | 276 | +        image_assets,
    | 277 | +        model,
    | 278 | +        size_factors=size_factors,
    | 279 | +        dtype=dtype,
    | 280 | +        max_tokens=max_tokens,
    | 281 | +        num_logprobs=num_logprobs,
    | 282 | +        tensor_parallel_size=1,
    | 283 | +    )
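As a side note on the change: the NestedInputs helper is hoisted to module level so that both run_test and the new run_multi_image_test can monkey-patch hf_model.processor with it. It simply nests the HF processor output under a "model_inputs" key while still letting the runner move everything to a device in one call. Below is a minimal, self-contained sketch of that same pattern; the BatchFeature contents are dummy values made up for illustration and stand in for the real MiniCPM-Llama3-V-2_5 processor output.

    import torch
    from collections import UserDict

    from transformers import BatchFeature


    class NestedInputs(UserDict):
        """Wrap processor output under a "model_inputs" key."""

        def __init__(self, model_inputs: BatchFeature):
            super().__init__({"model_inputs": model_inputs})
            self.model_inputs = model_inputs

        def to(self, device: torch.types.Device):
            # Forward device placement to the wrapped BatchFeature so callers
            # can move the whole structure with a single .to(device).
            return NestedInputs(self.model_inputs.to(device))


    # Dummy processor output for illustration only.
    features = BatchFeature({"input_ids": torch.zeros(1, 8, dtype=torch.long)})
    nested = NestedInputs(features).to("cpu")
    print(nested["model_inputs"]["input_ids"].shape)  # torch.Size([1, 8])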