Skip to content

Commit 2ecf666

Browse files
authored
Add tutorial on seek_mode (#497)
1 parent 76d4d10 commit 2ecf666

File tree

3 files changed

+198
-4
lines changed

3 files changed

+198
-4
lines changed

docs/source/glossary.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ Glossary
1717
A scan corresponds to an entire pass over a video file, with the purpose
1818
of retrieving metadata about the different streams and frames. **It does
1919
not involve decoding**, so it is a lot cheaper than decoding the file.
20+
The :class:`~torchcodec.decoders.VideoDecoder` performs a scan when using
21+
``seek_mode="exact"``, and doesn't scan when using
22+
``seek_mode="approximate"``.
2023

2124
clips
2225
A clip is a sequence of frames, usually in :term:`pts` order. The frames

examples/approximate_mode.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
===================================================================
9+
Exact vs Approximate seek mode: Performance and accuracy comparison
10+
===================================================================
11+
12+
In this example, we will describe the ``seek_mode`` parameter of the
13+
:class:`~torchcodec.decoders.VideoDecoder` class.
14+
This parameter offers a trade-off between the speed of the
15+
:class:`~torchcodec.decoders.VideoDecoder` creation and the seeking
16+
accuracy of the retrieved frames (i.e. in approximate mode, requesting the
17+
``i``'th frame may not necessarily return frame ``i``).
18+
"""
19+
20+
# %%
21+
# First, a bit of boilerplate: we'll download a short video from the web, and
22+
# use the ffmpeg CLI to repeat it 100 times. We'll end up with two videos: a
23+
# short video of approximately 13s and a long one of about 20 mins.
24+
# You can ignore that part and jump right below to :ref:`perf_creation`.
25+
26+
27+
import torch
28+
import requests
29+
import tempfile
30+
from pathlib import Path
31+
import shutil
32+
import subprocess
33+
from time import perf_counter_ns
34+
35+
36+
# Video source: https://www.pexels.com/video/dog-eating-854132/
37+
# License: CC0. Author: Coverr.
38+
url = "https://videos.pexels.com/video-files/854132/854132-sd_640_360_25fps.mp4"
39+
response = requests.get(url, headers={"User-Agent": ""})
40+
if response.status_code != 200:
41+
raise RuntimeError(f"Failed to download video. {response.status_code = }.")
42+
43+
temp_dir = tempfile.mkdtemp()
44+
short_video_path = Path(temp_dir) / "short_video.mp4"
45+
with open(short_video_path, 'wb') as f:
46+
for chunk in response.iter_content():
47+
f.write(chunk)
48+
49+
long_video_path = Path(temp_dir) / "long_video.mp4"
50+
ffmpeg_command = [
51+
"ffmpeg",
52+
"-stream_loop", "99", # repeat video 100 times
53+
"-i", f"{short_video_path}",
54+
"-c", "copy",
55+
f"{long_video_path}"
56+
]
57+
subprocess.run(ffmpeg_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
58+
59+
from torchcodec.decoders import VideoDecoder
60+
print(f"Short video duration: {VideoDecoder(short_video_path).metadata.duration_seconds} seconds")
61+
print(f"Long video duration: {VideoDecoder(long_video_path).metadata.duration_seconds / 60} minutes")
62+
63+
# %%
64+
# .. _perf_creation:
65+
#
66+
# Performance: ``VideoDecoder`` creation
67+
# --------------------------------------
68+
#
69+
# In terms of performance, the ``seek_mode`` parameter ultimately affects the
70+
# **creation** of a :class:`~torchcodec.decoders.VideoDecoder` object. The
71+
# longer the video, the higher the performance gain.
72+
73+
74+
def bench(f, average_over=50, warmup=2, **f_kwargs):
75+
76+
for _ in range(warmup):
77+
f(**f_kwargs)
78+
79+
times = []
80+
for _ in range(average_over):
81+
start = perf_counter_ns()
82+
f(**f_kwargs)
83+
end = perf_counter_ns()
84+
times.append(end - start)
85+
86+
times = torch.tensor(times) * 1e-6 # ns to ms
87+
std = times.std().item()
88+
med = times.median().item()
89+
print(f"{med = :.2f}ms +- {std:.2f}")
90+
91+
92+
print("Creating a VideoDecoder object with seek_mode='exact' on a short video:")
93+
bench(VideoDecoder, source=short_video_path, seek_mode="exact")
94+
print("Creating a VideoDecoder object with seek_mode='approximate' on a short video:")
95+
bench(VideoDecoder, source=short_video_path, seek_mode="approximate")
96+
print()
97+
print("Creating a VideoDecoder object with seek_mode='exact' on a long video:")
98+
bench(VideoDecoder, source=long_video_path, seek_mode="exact")
99+
print("Creating a VideoDecoder object with seek_mode='approximate' on a long video:")
100+
bench(VideoDecoder, source=long_video_path, seek_mode="approximate")
101+
102+
# %%
103+
# Performance: frame decoding and clip sampling
104+
# ---------------------------------------------
105+
#
106+
# Strictly speaking, the ``seek_mode`` parameter only affects the performance of
107+
# the :class:`~torchcodec.decoders.VideoDecoder` creation. It does not have a
108+
# direct effect on the performance of frame decoding or sampling. **However**,
109+
# because frame decoding and sampling patterns typically involve the creation of
110+
# the :class:`~torchcodec.decoders.VideoDecoder` (one per video), ``seek_mode``
111+
# may very well end up affecting the performance of decoding and samplers. For
112+
# example:
113+
114+
from torchcodec import samplers
115+
116+
117+
def sample_clips(seek_mode):
118+
return samplers.clips_at_random_indices(
119+
decoder=VideoDecoder(
120+
source=long_video_path,
121+
seek_mode=seek_mode
122+
),
123+
num_clips=5,
124+
num_frames_per_clip=2,
125+
)
126+
127+
128+
print("Sampling clips with seek_mode='exact':")
129+
bench(sample_clips, seek_mode="exact")
130+
print("Sampling clips with seek_mode='approximate':")
131+
bench(sample_clips, seek_mode="approximate")
132+
133+
# %%
134+
# Accuracy: Metadata and frame retrieval
135+
# --------------------------------------
136+
#
137+
# We've seen that using ``seek_mode="approximate"`` can significantly speed up
138+
# the :class:`~torchcodec.decoders.VideoDecoder` creation. The price to pay for
139+
# that is that seeking won't always be as accurate as with
140+
# ``seek_mode="exact"``. It can also affect the exactness of the metadata.
141+
#
142+
# However, in a lot of cases, you'll find that there will be no accuracy
143+
# difference between the two modes, which means that ``seek_mode="approximate"``
144+
# is a net win:
145+
146+
print("Metadata of short video with seek_mode='exact':")
147+
print(VideoDecoder(short_video_path, seek_mode="exact").metadata)
148+
print("Metadata of short video with seek_mode='approximate':")
149+
print(VideoDecoder(short_video_path, seek_mode="approximate").metadata)
150+
151+
exact_decoder = VideoDecoder(short_video_path, seek_mode="exact")
152+
approx_decoder = VideoDecoder(short_video_path, seek_mode="approximate")
153+
for i in range(len(exact_decoder)):
154+
torch.testing.assert_close(
155+
exact_decoder.get_frame_at(i).data,
156+
approx_decoder.get_frame_at(i).data,
157+
atol=0, rtol=0,
158+
)
159+
print("Frame seeking is the same for this video!")
160+
161+
# %%
162+
# What is this doing under the hood?
163+
# ----------------------------------
164+
#
165+
# With ``seek_mode="exact"``, the :class:`~torchcodec.decoders.VideoDecoder`
166+
# performs a :term:`scan` when it is instantiated. The scan doesn't involve
167+
# decoding, but processes an entire file to infer more accurate metadata (like
168+
# duration), and also builds an internal index of frames and key-frames. This
169+
# internal index is potentially more accurate than the one in the file's
170+
# headers, which leads to more accurate seeking behavior.
171+
# Without the scan, TorchCodec relies only on the metadata contained in the
172+
# file, which may not always be as accurate.
173+
#
174+
# Which mode should I use?
175+
# ------------------------
176+
#
177+
# The general rule of thumb is as follows:
178+
#
179+
# - If you really care about exactness of frame seeking, use "exact".
180+
# - If you can sacrifice exactness of seeking for speed, which is usually the
181+
# case when doing clip sampling, use "approximate".
182+
# - If your videos don't have variable framerate and their metadata is correct,
183+
# then "approximate" mode is a net win: it will be just as accurate as the
184+
# "exact" mode while still being significantly faster.
185+
186+
# %%
187+
shutil.rmtree(temp_dir)
188+
# %%

src/torchcodec/decoders/_video_decoder.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
class VideoDecoder:
2323
"""A single-stream video decoder.
2424
25-
This decoder always performs a :term:`scan` of the video.
2625
2726
Args:
2827
source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the video.
@@ -51,9 +50,13 @@ class VideoDecoder:
5150
Default: 1.
5251
device (str or torch.device, optional): The device to use for decoding. Default: "cpu".
5352
seek_mode (str, optional): Determines if frame access will be "exact" or
54-
"approximate". Exact guarantees that requesting frame i will always return frame i,
55-
but doing so requires an initial :term:`scan` of the file. Approximate is faster as it avoids scanning the
56-
file, but less accurate as it uses the file's metadata to calculate where i probably is. Default: "exact".
53+
"approximate". Exact guarantees that requesting frame i will always
54+
return frame i, but doing so requires an initial :term:`scan` of the
55+
file. Approximate is faster as it avoids scanning the file, but less
56+
accurate as it uses the file's metadata to calculate where i
57+
probably is. Default: "exact".
58+
Read more about this parameter in:
59+
:ref:`sphx_glr_generated_examples_approximate_mode.py`
5760
5861
5962
Attributes:

0 commit comments

Comments
 (0)