|
| 1 | +# Copyright (c) Meta Platforms, Inc. and affiliates. |
| 2 | +# All rights reserved. |
| 3 | +# |
| 4 | +# This source code is licensed under the BSD-style license found in the |
| 5 | +# LICENSE file in the root directory of this source tree. |
| 6 | + |
| 7 | +""" |
| 8 | +=================================================================== |
| 9 | +Exact vs Approximate seek mode: Performance and accuracy comparison |
| 10 | +=================================================================== |
| 11 | +
|
| 12 | +In this example, we will describe the ``seek_mode`` parameter of the |
| 13 | +:class:`~torchcodec.decoders.VideoDecoder` class. |
| 14 | +This parameter offers a trade-off between the speed of the |
| 15 | +:class:`~torchcodec.decoders.VideoDecoder` creation and the seeking |
| 16 | +accuracy of the retrieved frames (i.e. in approximate mode, requesting the |
| 17 | +``i``'th frame may not necessarily return frame ``i``). |
| 18 | +""" |
| 19 | + |
| 20 | +# %% |
| 21 | +# First, a bit of boilerplate: we'll download a short video from the web, and |
| 22 | +# use the ffmpeg CLI to repeat it 100 times. We'll end up with two videos: a |
| 23 | +# short video of approximately 13s and a long one of about 20 mins. |
| 24 | +# You can ignore that part and jump right below to :ref:`perf_creation`. |
| 25 | + |
| 26 | + |
| 27 | +import torch |
| 28 | +import requests |
| 29 | +import tempfile |
| 30 | +from pathlib import Path |
| 31 | +import shutil |
| 32 | +import subprocess |
| 33 | +from time import perf_counter_ns |
| 34 | + |
| 35 | + |
# Video source: https://www.pexels.com/video/dog-eating-854132/
# License: CC0. Author: Coverr.
url = "https://videos.pexels.com/video-files/854132/854132-sd_640_360_25fps.mp4"
response = requests.get(url, headers={"User-Agent": ""})
if response.status_code != 200:
    raise RuntimeError(f"Failed to download video. {response.status_code = }.")

# Both videos are stored in a scratch directory created for this example.
temp_dir = tempfile.mkdtemp()
short_video_path = Path(temp_dir) / "short_video.mp4"
with open(short_video_path, 'wb') as f:
    # The request is not streamed, so the whole body is already in memory:
    # write it in a single call. ``iter_content()`` defaults to
    # ``chunk_size=1``, which would issue one ``write`` per byte.
    f.write(response.content)

# Build the long video by concatenating the short one with itself, copying
# the streams as-is ("-c copy") instead of re-encoding them.
long_video_path = Path(temp_dir) / "long_video.mp4"
ffmpeg_command = [
    "ffmpeg",
    "-stream_loop", "99",  # repeat video 100 times
    "-i", f"{short_video_path}",
    "-c", "copy",
    f"{long_video_path}"
]
# check=True makes a failing ffmpeg invocation raise CalledProcessError.
subprocess.run(ffmpeg_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

from torchcodec.decoders import VideoDecoder
print(f"Short video duration: {VideoDecoder(short_video_path).metadata.duration_seconds} seconds")
print(f"Long video duration: {VideoDecoder(long_video_path).metadata.duration_seconds / 60} minutes")
| 62 | + |
| 63 | +# %% |
| 64 | +# .. _perf_creation: |
| 65 | +# |
| 66 | +# Performance: ``VideoDecoder`` creation |
| 67 | +# -------------------------------------- |
| 68 | +# |
| 69 | +# In terms of performance, the ``seek_mode`` parameter ultimately affects the |
| 70 | +# **creation** of a :class:`~torchcodec.decoders.VideoDecoder` object. The |
| 71 | +# longer the video, the higher the performance gain. |
| 72 | + |
| 73 | + |
def bench(f, average_over=50, warmup=2, **f_kwargs):
    """Time repeated calls to ``f`` and print the median and std in ms.

    ``f(**f_kwargs)`` is first called ``warmup`` times without being timed,
    then ``average_over`` more times with every call timed individually using
    ``perf_counter_ns``. A single summary line is printed; nothing is
    returned.

    Args:
        f: Callable to benchmark.
        average_over: Number of timed calls.
        warmup: Number of untimed calls made before measuring.
        **f_kwargs: Keyword arguments forwarded to every call of ``f``.
    """
    # Untimed calls made before any measurement is taken.
    for _ in range(warmup):
        f(**f_kwargs)

    durations_ns = []
    for _ in range(average_over):
        tic = perf_counter_ns()
        f(**f_kwargs)
        toc = perf_counter_ns()
        durations_ns.append(toc - tic)

    durations_ms = torch.tensor(durations_ns) * 1e-6  # ns to ms
    med = durations_ms.median().item()
    std = durations_ms.std().item()
    print(f"{med = :.2f}ms +- {std:.2f}")
| 90 | + |
| 91 | + |
# Time the construction of a decoder in each seek mode: first on the short
# video, then (after a blank separator line) on the long one.
for banner, mode in (
    ("Creating a VideoDecoder object with seek_mode='exact' on a short video:", "exact"),
    ("Creating a VideoDecoder object with seek_mode='approximate' on a short video:", "approximate"),
):
    print(banner)
    bench(VideoDecoder, source=short_video_path, seek_mode=mode)

print()

for banner, mode in (
    ("Creating a VideoDecoder object with seek_mode='exact' on a long video:", "exact"),
    ("Creating a VideoDecoder object with seek_mode='approximate' on a long video:", "approximate"),
):
    print(banner)
    bench(VideoDecoder, source=long_video_path, seek_mode=mode)
| 101 | + |
| 102 | +# %% |
| 103 | +# Performance: frame decoding and clip sampling |
| 104 | +# --------------------------------------------- |
| 105 | +# |
| 106 | +# Strictly speaking the ``seek_mode`` parameter only affects the performance of |
| 107 | +# the :class:`~torchcodec.decoders.VideoDecoder` creation. It does not have a |
| 108 | +# direct effect on the performance of frame decoding or sampling. **However**, |
| 109 | +# because frame decoding and sampling patterns typically involve the creation of |
| 110 | +# the :class:`~torchcodec.decoders.VideoDecoder` (one per video), ``seek_mode`` |
| 111 | +# may very well end up affecting the performance of decoding and samplers. For |
| 112 | +# example: |
| 113 | + |
| 114 | +from torchcodec import samplers |
| 115 | + |
| 116 | + |
def sample_clips(seek_mode):
    """Create a decoder for the long video and sample random clips from it.

    A new ``VideoDecoder`` is constructed on every call, so timing this
    function covers the decoder creation in the requested ``seek_mode`` as
    well as the sampling itself.

    Args:
        seek_mode: Value forwarded to ``VideoDecoder(seek_mode=...)``.

    Returns:
        The result of ``samplers.clips_at_random_indices`` for 5 clips of
        2 frames each.
    """
    long_video_decoder = VideoDecoder(source=long_video_path, seek_mode=seek_mode)
    return samplers.clips_at_random_indices(
        decoder=long_video_decoder,
        num_clips=5,
        num_frames_per_clip=2,
    )
| 126 | + |
| 127 | + |
# Time clip sampling (decoder creation included) in each seek mode.
for banner, mode in (
    ("Sampling clips with seek_mode='exact':", "exact"),
    ("Sampling clips with seek_mode='approximate':", "approximate"),
):
    print(banner)
    bench(sample_clips, seek_mode=mode)
| 132 | + |
| 133 | +# %% |
| 134 | +# Accuracy: Metadata and frame retrieval |
| 135 | +# -------------------------------------- |
| 136 | +# |
| 137 | +# We've seen that using ``seek_mode="approximate"`` can significantly speed up |
| 138 | +# the :class:`~torchcodec.decoders.VideoDecoder` creation. The price to pay for |
| 139 | +# that is that seeking won't always be as accurate as with |
| 140 | +# ``seek_mode="exact"``. It can also affect the exactness of the metadata. |
| 141 | +# |
| 142 | +# However, in a lot of cases, you'll find that there will be no accuracy |
| 143 | +# difference between the two modes, which means that ``seek_mode="approximate"`` |
| 144 | +# is a net win: |
| 145 | + |
# Print the metadata reported by a decoder created in each seek mode.
for banner, mode in (
    ("Metadata of short video with seek_mode='exact':", "exact"),
    ("Metadata of short video with seek_mode='approximate':", "approximate"),
):
    print(banner)
    print(VideoDecoder(short_video_path, seek_mode=mode).metadata)

# Fetch every frame index from both decoders and compare the results.
exact_decoder = VideoDecoder(short_video_path, seek_mode="exact")
approx_decoder = VideoDecoder(short_video_path, seek_mode="approximate")
num_frames = len(exact_decoder)
for frame_index in range(num_frames):
    exact_pixels = exact_decoder.get_frame_at(frame_index).data
    approx_pixels = approx_decoder.get_frame_at(frame_index).data
    # Zero tolerances: the two frames must match exactly.
    torch.testing.assert_close(exact_pixels, approx_pixels, atol=0, rtol=0)
print("Frame seeking is the same for this video!")
| 160 | + |
| 161 | +# %% |
| 162 | +# What is this doing under the hood? |
| 163 | +# ---------------------------------- |
| 164 | +# |
| 165 | +# With ``seek_mode="exact"``, the :class:`~torchcodec.decoders.VideoDecoder` |
| 166 | +# performs a :term:`scan` when it is instantiated. The scan doesn't involve |
| 167 | +# decoding, but processes an entire file to infer more accurate metadata (like |
| 168 | +# duration), and also builds an internal index of frames and key-frames. This |
| 169 | +# internal index is potentially more accurate than the one in the file's |
| 170 | +# headers, which leads to more accurate seeking behavior. |
| 171 | +# Without the scan, TorchCodec relies only on the metadata contained in the |
| 172 | +# file, which may not always be as accurate. |
| 173 | +# |
| 174 | +# Which mode should I use? |
| 175 | +# ------------------------ |
| 176 | +# |
| 177 | +# The general rule of thumb is as follows: |
| 178 | +# |
| 179 | +# - If you really care about exactness of frame seeking, use "exact". |
| 180 | +# - If you can sacrifice exactness of seeking for speed, which is usually the |
| 181 | +# case when doing clip sampling, use "approximate". |
| 182 | +# - If your videos don't have variable framerate and their metadata is correct, |
| 183 | +# then "approximate" mode is a net win: it will be just as accurate as the |
| 184 | +# "exact" mode while still being significantly faster. |
| 185 | + |
| 186 | +# %% |
shutil.rmtree(temp_dir)  # delete the scratch directory holding both videos
| 188 | +# %% |
0 commit comments