|
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY

from ...utils import build_model_context
| 11 | + |
@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize("num_frames", [32, 128])
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
def test_processor_override(
    model_id: str,
    expected_toks_per_frame: int,
    expected_grid_t: int,
    fps: int,
    num_frames: int,
):
    """Ensure GLM4vMultiModalProcessor can handle video frames properly."""
    # Build the multimodal processor for the model under test, allowing a
    # single video per prompt.
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    tokenizer = processor.info.get_tokenizer()

    # Prompt containing exactly one video placeholder, plus the video frames
    # and their metadata (with the sampling fps injected).
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
    asset = VideoAsset(name="baby_reading", num_frames=num_frames)
    frames, metadata = asset.np_ndarrays, asset.metadata
    metadata["fps"] = fps
    mm_data = {"video": [(frames, metadata)]}

    mm_kwargs_override = {"fps": fps}
    processed_inputs = processor.apply(prompt, mm_data, mm_kwargs_override)

    # Resolve the id of the video placeholder token from the HF processor.
    hf_processor = processor.info.get_hf_processor(**mm_kwargs_override)
    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)

    # The temporal grid size must match the requested fps, and the number of
    # emitted video tokens must scale with it.
    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
    num_video_toks = processed_inputs["prompt_token_ids"].count(video_token_id)

    assert grid_t == expected_grid_t
    assert num_video_toks == expected_toks_per_frame * grid_t