Skip to content

Commit 6f35154

Browse files
authored
[Frontend] Implement robust video frame recovery for corrupted videos (vllm-project#29197)
Signed-off-by: cmartinez <cmartinez@roblox.com> Signed-off-by: vSeamar <cmartinez@roblox.com>
1 parent 364a8bc commit 6f35154

File tree

4 files changed

+421
-15
lines changed

4 files changed

+421
-15
lines changed

docs/features/multimodal_inputs.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,31 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
689689
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
690690
```
691691

692+
#### Video Frame Recovery
693+
694+
For improved robustness when processing potentially corrupted or truncated video files, vLLM supports optional frame recovery using a dynamic-window forward-scan approach. When enabled, if a target frame fails to load during sequential reading, the next successfully grabbed frame that comes before the next target frame is used in its place.
695+
696+
To enable video frame recovery, pass the `frame_recovery` parameter via `--media-io-kwargs`:
697+
698+
```bash
699+
# Example: Enable frame recovery
700+
vllm serve Qwen/Qwen3-VL-30B-A3B-Instruct \
701+
--media-io-kwargs '{"video": {"frame_recovery": true}}'
702+
```
703+
704+
**Parameters:**
705+
706+
- `frame_recovery`: Boolean flag to enable forward-scan recovery. When `true`, a target frame that fails to load is replaced by the next frame that loads successfully within the dynamic window (from the failed frame up to, but not including, the next target frame). Default is `false`.
707+
708+
**How it works:**
709+
710+
1. The system reads frames sequentially
711+
2. If grabbing a target frame fails, that frame is marked as "failed"
712+
3. The next successfully grabbed frame (before reaching the next target) is used to recover the failed frame
713+
4. This approach handles both mid-video corruption and end-of-video truncation
714+
715+
Works with common video formats like MP4 when using OpenCV backends.
716+
692717
#### Custom RGBA Background Color
693718

694719
To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:

tests/multimodal/test_video.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,3 +299,212 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
299299
frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
300300
np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
301301
assert metadata_missing["video_backend"] == "test_video_backend_override_2"
302+
303+
304+
# ============================================================================
305+
# Frame Recovery Tests
306+
# ============================================================================
307+
308+
309+
def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
    """
    Test that frame recovery correctly uses the next valid frame when
    target frames fail to load.

    Uses corrupted.mp4 and wraps cv2.VideoCapture so that grab() reports
    failure on specific frame indices (in addition to the real corruption
    at frame 17), then verifies that recovery produces more frames than
    loading without recovery.
    """
    import cv2

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        # Load corrupted.mp4 (26 frames, frame 17 is genuinely corrupted)
        video_path = ASSETS_DIR / "corrupted.mp4"
        with open(video_path, "rb") as f:
            video_data = f.read()

        # Simulate additional failures on frames 3 and 10
        # (in addition to the real corruption at frame 17)
        fail_on_frames = {3, 10}

        # Keep a handle on the real class; the wrapper below delegates to it.
        original_video_capture = cv2.VideoCapture

        class MockVideoCapture:
            """Wrapper that simulates grab() failures on specific frames."""

            def __init__(self, *args, **kwargs):
                self._cap = original_video_capture(*args, **kwargs)
                # Index of the frame most recently passed to grab();
                # -1 means grab() has not been called yet.
                self._current_frame = -1

            def grab(self):
                self._current_frame += 1
                if self._current_frame in fail_on_frames:
                    # Still consume the underlying frame so that the
                    # wrapper's index stays aligned with the real stream.
                    # Otherwise every later index would be shifted and the
                    # genuine corruption at frame 17 would no longer land
                    # on a sampled target index.
                    self._cap.grab()
                    return False  # Simulate failure
                return self._cap.grab()

            def retrieve(self):
                return self._cap.retrieve()

            def get(self, prop):
                return self._cap.get(prop)

            def isOpened(self):
                return self._cap.isOpened()

            def release(self):
                return self._cap.release()

        # Patch cv2.VideoCapture for the duration of this context only
        m.setattr(cv2, "VideoCapture", MockVideoCapture)

        loader = VIDEO_LOADER_REGISTRY.load("opencv")

        # Use num_frames=8 which samples: [0, 3, 7, 10, 14, 17, 21, 25]
        # Frame 3: mocked failure, recovery window [3, 7) -> use frame 4
        # Frame 10: mocked failure, recovery window [10, 14) -> use frame 11
        # Frame 17: real corruption, recovery window [17, 21) -> use frame 18

        # Test WITHOUT recovery - should have fewer frames due to failures
        frames_no_recovery, meta_no = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=False
        )

        # Test WITH recovery - should recover using next valid frames
        frames_with_recovery, meta_yes = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=True
        )

        # With recovery should have MORE frames than without
        # Expected without: 5 frames (3, 10, 17 all fail)
        # Expected with: 8 frames (all recovered)
        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
            f"Recovery should produce more frames. "
            f"Without: {frames_no_recovery.shape[0]}, "
            f"With: {frames_with_recovery.shape[0]}"
        )

        # Verify metadata consistency
        assert frames_no_recovery.shape[0] == len(meta_no["frames_indices"])
        assert frames_with_recovery.shape[0] == len(meta_yes["frames_indices"])

        # Verify temporal order is preserved
        assert meta_yes["frames_indices"] == sorted(meta_yes["frames_indices"])
395+
396+
397+
def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
    """
    Exercise frame recovery against a genuinely corrupted video file.

    corrupted.mp4 has real H.264 codec errors on frame 17. Requesting
    num_frames=8 samples the targets [0, 3, 7, 10, 14, 17, 21, 25], so
    frame 17 is a target whose recovery window is [17, 21). Frames 18-20
    are readable, which lets frame 18 stand in for the failed frame 17.

    Checks, in order:
    1. Frame count equals the number of reported indices in both modes
    2. Recovery yields strictly more frames than loading without it
       (7 frames without recovery vs. 8 with recovery)
    3. Recovery yields all 8 requested frames
    4. The reported total frame count of the video is 26
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        with open(ASSETS_DIR / "corrupted.mp4", "rb") as f:
            video_data = f.read()

        loader = VIDEO_LOADER_REGISTRY.load("opencv")

        # Same sparse request twice; only the recovery flag differs.
        plain_frames, plain_meta = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=False
        )
        recovered_frames, recovered_meta = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=True
        )

        plain_count = plain_frames.shape[0]
        recovered_count = recovered_frames.shape[0]

        # Frames and their reported indices must line up in both modes.
        assert plain_count == len(plain_meta["frames_indices"]), (
            "Frame count must match indices without recovery"
        )
        assert recovered_count == len(recovered_meta["frames_indices"]), (
            "Frame count must match indices with recovery"
        )

        # KEY ASSERTION: the recovered load has strictly more frames,
        # because frame 17 is skipped without recovery.
        assert recovered_count > plain_count, (
            f"Recovery should produce more frames with sparse sampling. "
            f"Got {recovered_count} with recovery vs "
            f"{plain_count} without"
        )

        # Every requested frame is present once recovery is enabled.
        assert recovered_count == 8, (
            f"With recovery, should load all 8 requested frames. "
            f"Got {recovered_count}"
        )

        # The video-level metadata still reports the full frame count.
        expected_total_frames = 26
        assert recovered_meta["total_num_frames"] == expected_total_frames, (
            f"Expected {expected_total_frames} total frames in metadata"
        )
464+
465+
466+
def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
    """
    Check that frame_recovery is accepted by the dynamic video backend.

    The "opencv_dynamic" loader is driven by fps / max_duration arguments
    instead of a fixed num_frames. This test loads corrupted.mp4 with and
    without recovery and verifies that recovery never yields fewer frames.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        with open(ASSETS_DIR / "corrupted.mp4", "rb") as f:
            video_data = f.read()

        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")

        # Identical sampling arguments; only the recovery flag differs.
        plain_frames, _plain_meta = loader.load_bytes(
            video_data, fps=2, max_duration=10, frame_recovery=False
        )
        recovered_frames, recovered_meta = loader.load_bytes(
            video_data, fps=2, max_duration=10, frame_recovery=True
        )

        plain_count = plain_frames.shape[0]
        recovered_count = recovered_frames.shape[0]

        # Both modes must load at least one frame.
        assert plain_count > 0, (
            "Should load some frames without recovery"
        )
        assert recovered_count > 0, (
            "Should load some frames with recovery"
        )

        # The dynamic backend always reports do_sample_frames as False.
        assert "do_sample_frames" in recovered_meta
        assert recovered_meta["do_sample_frames"] is False
        assert recovered_count == len(recovered_meta["frames_indices"])

        # Key assertion: recovery may add frames when a corrupted frame is
        # sampled, but it must never reduce the number of loaded frames.
        assert recovered_count >= plain_count, (
            f"Recovery should produce at least as many frames. "
            f"Got {recovered_count} with recovery vs "
            f"{plain_count} without"
        )

vllm/benchmarks/datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -867,7 +867,7 @@ def generate_synthetic_video(
867867
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
868868
fps = 30 # frames per second
869869

870-
with NamedTemporaryFile(suffix=".mp4", delete_on_close=False) as temp_file:
870+
with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
871871
temp_path = temp_file.name
872872

873873
# Create video writer

0 commit comments

Comments
 (0)