Commit cc2a1a5

Isotr0py authored and epwalsh committed
[Bugfix] Fix glm4.1v video inference issue (vllm-project#22067)
Signed-off-by: Isotr0py <[email protected]>
1 parent 4f09520 commit cc2a1a5

File tree

2 files changed: +53 −6 lines changed
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.assets.video import VideoAsset
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
+@pytest.mark.parametrize("expected_toks_per_frame", [299])
+@pytest.mark.parametrize("num_frames", [32, 128])
+@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
+def test_processor_override(
+    model_id: str,
+    expected_toks_per_frame: int,
+    expected_grid_t: int,
+    fps: int,
+    num_frames: int,
+):
+    """Ensure GLM4vMultiModalProcessor can handle video frames properly."""
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = processor.info.get_tokenizer()
+    hf_processor_mm_kwargs = {"fps": fps}
+
+    # Build the prompt for the single video we pass
+    video_assets = VideoAsset(name="baby_reading", num_frames=num_frames)
+    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    video, metadata = video_assets.np_ndarrays, video_assets.metadata
+    metadata["fps"] = fps
+    mm_data = {"video": [(video, metadata)]}
+
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Ensure we have the right number of video placeholder tokens
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
+    video_tok_count = processed_inputs["prompt_token_ids"].count(
+        video_token_id)
+    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
+
+    assert grid_t == expected_grid_t
+    assert video_tok_count == expected_toks_per_frame * grid_t
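
The fps/grid_t parametrization above encodes the expected behavior after the fix: sampling this test clip at 1 fps should yield 5 temporal patches and at 2 fps should yield 10, each expanding to 299 placeholder tokens, regardless of whether 32 or 128 raw frames are decoded. A quick sanity check of that arithmetic (plain Python, no vLLM required; the implied ~5-second clip length is inferred from the parametrized values, not stated in the diff):

# Hedged sanity check of the parametrization above: pure arithmetic.
expected_toks_per_frame = 299
for fps, expected_grid_t in [(1, 5), (2, 10)]:
    total_video_tokens = expected_toks_per_frame * expected_grid_t
    print(f"fps={fps}: grid_t={expected_grid_t}, tokens={total_video_tokens}")
# fps=1: grid_t=5, tokens=1495
# fps=2: grid_t=10, tokens=2990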

vllm/model_executor/models/glm4_1v.py

Lines changed: 2 additions & 6 deletions
@@ -937,7 +937,7 @@ def _get_video_second_idx(self, metadata: dict[str, Any],
                               total_frames: int) -> list[int]:
         video_processor = self.get_video_processor()
 
-        video_fps = metadata.get("fps", 2.0)
+        video_fps = metadata.get("fps", video_processor.fps)
         meta_frames = metadata.get("total_num_frames", total_frames)
         max_frame_idx = meta_frames - 1
         duration = metadata.get("duration",
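
The one-line change above replaces a hardcoded 2.0 fps fallback with the video processor's own configured rate, so videos whose metadata lacks an "fps" entry are timed consistently with how the processor actually samples frames. A minimal sketch of the before/after behavior (FakeVideoProcessor and its fps value are illustrative stand-ins, not the real vLLM/HF classes):

# Hedged sketch of the fallback change; values are illustrative.
class FakeVideoProcessor:
    fps = 1.0  # hypothetical configured sampling rate

video_processor = FakeVideoProcessor()
metadata = {"total_num_frames": 150}  # no "fps" key supplied

old_fps = metadata.get("fps", 2.0)                  # always 2.0 (hardcoded)
new_fps = metadata.get("fps", video_processor.fps)  # follows processor config
assert (old_fps, new_fps) == (2.0, 1.0)
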
@@ -1120,11 +1120,7 @@ def _call_hf_processor(
                     video_placeholder,
                 )
 
-                grid_t = len(video_outputs["video_grid_thw"])
-                _, grid_h, grid_w = video_outputs["video_grid_thw"][0]
-                grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
-
-                video_grid_thw_lst.append(grid_thw)
+                video_grid_thw_lst.append(video_outputs["video_grid_thw"])
                 pixel_values_videos_lst.append(
                     video_outputs["pixel_values_videos"])
                 video_outputs = dict(
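
The deleted block rebuilt the (t, h, w) grid by taking grid_t = len(video_grid_thw), which counts rows of the tensor rather than temporal patches. Judging by the test added in this commit (grid_t of 5 or 10 read from row 0), the HF processor already returns the full grid as a (num_videos, 3) tensor, so the reconstruction collapsed grid_t to the number of videos. A hedged sketch of the failure mode under that shape assumption:

import torch

# Assumed processor output for one video with 10 temporal patches:
# shape (num_videos, 3), rows of [grid_t, grid_h, grid_w].
video_grid_thw = torch.tensor([[10, 22, 40]])

# Old code: grid_t taken from the row count (number of videos), not from
# the temporal entry, so a single video always yielded grid_t == 1.
grid_t = len(video_grid_thw)                      # 1, but real grid_t is 10
_, grid_h, grid_w = video_grid_thw[0]
wrong = torch.tensor([[grid_t, grid_h, grid_w]])  # tensor([[ 1, 22, 40]])

# New code: forward the processor's tensor unchanged.
right = video_grid_thw                            # tensor([[10, 22, 40]])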
