[Frontend] Implement robust video frame recovery for corrupted videos (vllm-project#29197)

vSeamar · web-flow · commit 6f351548b258 · 2026-01-07T01:13:24.000Z
Signed-off-by: cmartinez <cmartinez@roblox.com>
Signed-off-by: vSeamar <cmartinez@roblox.com>
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
@@ -689,6 +689,31 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
     export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
     ```
 
+#### Video Frame Recovery
+
+For improved robustness when processing potentially corrupted or truncated video files, vLLM supports optional frame recovery using a dynamic window forward-scan approach. When enabled, if a target frame fails to load during sequential reading, the next successfully grabbed frame (before the next target frame) will be used in its place.
+
+To enable video frame recovery, pass the `frame_recovery` parameter via `--media-io-kwargs`:
+
+```bash
+# Example: Enable frame recovery
+vllm serve Qwen/Qwen3-VL-30B-A3B-Instruct \
+  --media-io-kwargs '{"video": {"frame_recovery": true}}'
+```
+
+**Parameters:**
+
+- `frame_recovery`: Boolean flag to enable forward-scan recovery. When `true`, failed frames are recovered using the next available frame within the dynamic window (up to the next target frame). Default is `false`.
+
+**How it works:**
+
+1. The system reads frames sequentially
+2. If a target frame fails to grab, it's marked as "failed"
+3. The next successfully grabbed frame (before reaching the next target) is used to recover the failed frame
+4. This approach handles both mid-video corruption and end-of-video truncation
+
+Works with common video formats like MP4 when using OpenCV backends.
+
 #### Custom RGBA Background Color
 
 To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
@@ -299,3 +299,212 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
         frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
         np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
         assert metadata_missing["video_backend"] == "test_video_backend_override_2"
+
+
+# ============================================================================
+# Frame Recovery Tests
+# ============================================================================
+
+
+def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that frame recovery correctly uses the next valid frame when
+    target frames fail to load.
+
+    Uses corrupted.mp4 and mocks VideoCapture.grab() to fail on specific
+    frame indices (in addition to the real corruption at frame 17), then
+    verifies recovery produces more frames.
+    """
+    import cv2
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        # Load corrupted.mp4 (26 frames, frame 17 is genuinely corrupted)
+        video_path = ASSETS_DIR / "corrupted.mp4"
+        with open(video_path, "rb") as f:
+            video_data = f.read()
+
+        # Simulate additional failures on frames 3 and 10
+        # (in addition to the real corruption at frame 17)
+        fail_on_frames = {3, 10}
+
+        # Store original VideoCapture class
+        original_video_capture = cv2.VideoCapture
+
+        class MockVideoCapture:
+            """Wrapper that simulates grab() failures on specific frames."""
+
+            def __init__(self, *args, **kwargs):
+                self._cap = original_video_capture(*args, **kwargs)
+                self._current_frame = -1
+
+            def grab(self):
+                self._current_frame += 1
+                if self._current_frame in fail_on_frames:
+                    return False  # Simulate failure
+                return self._cap.grab()
+
+            def retrieve(self):
+                return self._cap.retrieve()
+
+            def get(self, prop):
+                return self._cap.get(prop)
+
+            def isOpened(self):
+                return self._cap.isOpened()
+
+            def release(self):
+                return self._cap.release()
+
+        # Patch cv2.VideoCapture
+        m.setattr(cv2, "VideoCapture", MockVideoCapture)
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+
+        # Use num_frames=8 which samples: [0, 3, 7, 10, 14, 17, 21, 25]
+        # Frame 3: mocked failure, recovery window [3, 7) -> use frame 4
+        # Frame 10: mocked failure, recovery window [10, 14) -> use frame 11
+        # Frame 17: real corruption, recovery window [17, 21) -> use frame 18
+
+        # Test WITHOUT recovery - should have fewer frames due to failures
+        frames_no_recovery, meta_no = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=False
+        )
+
+        # Test WITH recovery - should recover using next valid frames
+        frames_with_recovery, meta_yes = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=True
+        )
+
+        # With recovery should have MORE frames than without
+        # Without: 5 frames (3, 10, 17 all fail)
+        # With: 8 frames (all recovered)
+        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
+            f"Recovery should produce more frames. "
+            f"Without: {frames_no_recovery.shape[0]}, "
+            f"With: {frames_with_recovery.shape[0]}"
+        )
+
+        # Verify metadata consistency
+        assert frames_no_recovery.shape[0] == len(meta_no["frames_indices"])
+        assert frames_with_recovery.shape[0] == len(meta_yes["frames_indices"])
+
+        # Verify temporal order is preserved
+        assert meta_yes["frames_indices"] == sorted(meta_yes["frames_indices"])
+
+
+def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test frame recovery with an actual corrupted video file using sparse sampling.
+
+    This test uses corrupted.mp4 which has genuine H.264 codec errors on
+    frame 17. With num_frames=8, the target frames are [0, 3, 7, 10, 14, 17, 21, 25].
+    Frame 17 is corrupted but frames 18-20 are readable, so recovery can use
+    frame 18 to fill in for the failed frame 17.
+
+    This test verifies:
+    1. Without recovery: frame 17 is skipped (7 frames loaded)
+    2. With recovery: frame 18 fills in for frame 17 (8 frames loaded)
+    3. Recovery produces MORE frames than without recovery
+    4. Metadata is consistent with loaded frames
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+
+        # Use num_frames=8 which makes frame 17 a target with recovery window [17, 21)
+        # Target frames: [0, 3, 7, 10, 14, 17, 21, 25]
+        # Frame 17 is corrupted, but frames 18-20 are readable for recovery
+
+        # Test without recovery - frame 17 will be skipped
+        frames_no_recovery, meta_no_recovery = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=False
+        )
+
+        # Test with recovery - frame 18 should fill in for frame 17
+        frames_with_recovery, meta_with_recovery = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=True
+        )
+
+        # Verify metadata consistency for both modes
+        assert frames_no_recovery.shape[0] == len(meta_no_recovery["frames_indices"]), (
+            "Frame count must match indices without recovery"
+        )
+        assert frames_with_recovery.shape[0] == len(
+            meta_with_recovery["frames_indices"]
+        ), "Frame count must match indices with recovery"
+
+        # KEY ASSERTION: Recovery should produce MORE frames than without recovery
+        # Without recovery: 7 frames (frame 17 skipped)
+        # With recovery: 8 frames (frame 18 used for frame 17)
+        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
+            f"Recovery should produce more frames with sparse sampling. "
+            f"Got {frames_with_recovery.shape[0]} with recovery vs "
+            f"{frames_no_recovery.shape[0]} without"
+        )
+
+        # Verify we got all 8 requested frames with recovery
+        assert frames_with_recovery.shape[0] == 8, (
+            f"With recovery, should load all 8 requested frames. "
+            f"Got {frames_with_recovery.shape[0]}"
+        )
+
+        # Verify the video metadata is correct
+        expected_total_frames = 26
+        assert meta_with_recovery["total_num_frames"] == expected_total_frames, (
+            f"Expected {expected_total_frames} total frames in metadata"
+        )
+
+
+def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that frame_recovery works with the dynamic video backend.
+
+    The dynamic backend samples frames based on fps/duration rather than
+    loading all frames. This test verifies recovery works in that context.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
+
+        # Test without recovery
+        frames_no_recovery, meta_no = loader.load_bytes(
+            video_data, fps=2, max_duration=10, frame_recovery=False
+        )
+
+        # Test with frame_recovery enabled
+        frames_with_recovery, meta_with = loader.load_bytes(
+            video_data, fps=2, max_duration=10, frame_recovery=True
+        )
+
+        # Verify basic properties
+        assert frames_no_recovery.shape[0] > 0, (
+            "Should load some frames without recovery"
+        )
+        assert frames_with_recovery.shape[0] > 0, (
+            "Should load some frames with recovery"
+        )
+        assert "do_sample_frames" in meta_with
+        assert meta_with["do_sample_frames"] is False  # Dynamic backend always False
+        assert frames_with_recovery.shape[0] == len(meta_with["frames_indices"])
+
+        # Key assertion: recovery should help when corrupted frames are sampled
+        # We expect recovery to produce at least as many frames as without recovery
+        assert frames_with_recovery.shape[0] >= frames_no_recovery.shape[0], (
+            f"Recovery should produce at least as many frames. "
+            f"Got {frames_with_recovery.shape[0]} with recovery vs "
+            f"{frames_no_recovery.shape[0]} without"
+        )
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
@@ -867,7 +867,7 @@ def generate_synthetic_video(
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
         fps = 30  # frames per second
 
-        with NamedTemporaryFile(suffix=".mp4", delete_on_close=False) as temp_file:
+        with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
             temp_path = temp_file.name
 
             # Create video writer
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py