|
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY

from ...utils import build_model_context
| 11 | + |
@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize("num_frames", [32, 128])
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
def test_processor_override(
    model_id: str,
    expected_toks_per_frame: int,
    expected_grid_t: int,
    fps: int,
    num_frames: int,
):
    """Ensure GLM4vMultiModalProcessor can handle video frames properly."""
    # Build the multimodal processor for the model under test, allowing a
    # single video per prompt.
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    tokenizer = processor.info.get_tokenizer()

    # Prompt containing exactly one video placeholder, plus the video frames
    # and their metadata (with the sampling fps injected).
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
    asset = VideoAsset(name="baby_reading", num_frames=num_frames)
    frames, metadata = asset.np_ndarrays, asset.metadata
    metadata["fps"] = fps
    mm_data = {"video": [(frames, metadata)]}

    mm_kwargs_override = {"fps": fps}
    processed_inputs = processor.apply(prompt, mm_data, mm_kwargs_override)

    # Resolve the id of the video placeholder token from the HF processor.
    hf_processor = processor.info.get_hf_processor(**mm_kwargs_override)
    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)

    # The temporal grid size must match the requested fps, and the number of
    # emitted video tokens must scale with it.
    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
    num_video_toks = processed_inputs["prompt_token_ids"].count(video_token_id)

    assert grid_t == expected_grid_t
    assert num_video_toks == expected_toks_per_frame * grid_t