
Commit 329e130

Merge branch 'main' of github.com:pytorch/torchcodec into fallback-expose
2 parents: 0c7cafb + 262c457

File tree: 3 files changed, +335 / -161 lines
Lines changed: 164 additions & 0 deletions (new file)
import math
from argparse import ArgumentParser
from pathlib import Path
from time import perf_counter_ns

import torch
from torch import Tensor
from torchcodec._core import add_video_stream, create_from_file, get_frames_by_pts
from torchcodec.decoders import VideoDecoder
from torchvision.transforms import v2

DEFAULT_NUM_EXP = 20


def bench(f, *args, num_exp=DEFAULT_NUM_EXP, warmup=1) -> Tensor:
    # Untimed warmup iterations.
    for _ in range(warmup):
        f(*args)

    # Time each run in nanoseconds.
    times = []
    for _ in range(num_exp):
        start = perf_counter_ns()
        f(*args)
        end = perf_counter_ns()
        times.append(end - start)
    return torch.tensor(times).float()


def report_stats(times: Tensor, unit: str = "ms", prefix: str = "") -> None:
    # Conversion factor from nanoseconds to the requested unit.
    mul = {
        "ns": 1,
        "µs": 1e-3,
        "ms": 1e-6,
        "s": 1e-9,
    }[unit]
    times = times * mul
    std = times.std().item()
    med = times.median().item()
    mean = times.mean().item()
    min = times.min().item()
    max = times.max().item()
    print(
        f"{prefix:<45} {med = :.2f}, {mean = :.2f} +- {std:.2f}, {min = :.2f}, {max = :.2f} - in {unit}"
    )


def torchvision_resize(
    path: Path, pts_seconds: list[float], dims: tuple[int, int]
) -> Tensor:
    # Decode at full resolution, then resize with torchvision.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder)
    raw_frames, *_ = get_frames_by_pts(decoder, timestamps=pts_seconds)
    return v2.functional.resize(raw_frames, size=dims)


def torchvision_crop(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], x: int, y: int
) -> Tensor:
    # Decode at full resolution, then crop with torchvision.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder)
    raw_frames, *_ = get_frames_by_pts(decoder, timestamps=pts_seconds)
    return v2.functional.crop(raw_frames, top=y, left=x, height=dims[0], width=dims[1])


def decoder_native_resize(
    path: Path, pts_seconds: list[float], dims: tuple[int, int]
) -> Tensor:
    # Resize inside the decoder via transform_specs.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder, transform_specs=f"resize, {dims[0]}, {dims[1]}")
    return get_frames_by_pts(decoder, timestamps=pts_seconds)[0]


def decoder_native_crop(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], x: int, y: int
) -> Tensor:
    # Crop inside the decoder via transform_specs.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder, transform_specs=f"crop, {dims[0]}, {dims[1]}, {x}, {y}")
    return get_frames_by_pts(decoder, timestamps=pts_seconds)[0]


def main():
    parser = ArgumentParser()
    parser.add_argument("--path", type=str, help="path to file", required=True)
    parser.add_argument(
        "--num-exp",
        type=int,
        default=DEFAULT_NUM_EXP,
        help="number of runs to average over",
    )

    args = parser.parse_args()
    path = Path(args.path)

    metadata = VideoDecoder(path).metadata
    duration = metadata.duration_seconds

    print(
        f"Benchmarking {path.name}, duration: {duration}, codec: {metadata.codec}, averaging over {args.num_exp} runs:"
    )

    input_height = metadata.height
    input_width = metadata.width
    fraction_of_total_frames_to_sample = [0.005, 0.01, 0.05, 0.1]
    fraction_of_input_dimensions = [0.5, 0.25, 0.125]

    for num_fraction in fraction_of_total_frames_to_sample:
        num_frames_to_sample = math.ceil(metadata.num_frames * num_fraction)
        print(
            f"Sampling {num_fraction * 100}%, {num_frames_to_sample}, of {metadata.num_frames} frames"
        )
        # Timestamps spaced uniformly over the video's duration.
        uniform_timestamps = [
            i * duration / num_frames_to_sample for i in range(num_frames_to_sample)
        ]

        for dims_fraction in fraction_of_input_dimensions:
            dims = (int(input_height * dims_fraction), int(input_width * dims_fraction))

            times = bench(
                torchvision_resize, path, uniform_timestamps, dims, num_exp=args.num_exp
            )
            report_stats(times, prefix=f"torchvision_resize({dims})")

            times = bench(
                decoder_native_resize,
                path,
                uniform_timestamps,
                dims,
                num_exp=args.num_exp,
            )
            report_stats(times, prefix=f"decoder_native_resize({dims})")
            print()

            center_x = (input_height - dims[0]) // 2
            center_y = (input_width - dims[1]) // 2
            times = bench(
                torchvision_crop,
                path,
                uniform_timestamps,
                dims,
                center_x,
                center_y,
                num_exp=args.num_exp,
            )
            report_stats(
                times, prefix=f"torchvision_crop({dims}, {center_x}, {center_y})"
            )

            times = bench(
                decoder_native_crop,
                path,
                uniform_timestamps,
                dims,
                center_x,
                center_y,
                num_exp=args.num_exp,
            )
            report_stats(
                times, prefix=f"decoder_native_crop({dims}, {center_x}, {center_y})"
            )
            print()


if __name__ == "__main__":
    main()
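
The decoder-native variants above push the resize or crop into the decoder itself through the transform_specs string, while the torchvision variants decode full frames first and transform them afterwards. As a minimal sketch of the decoder-native path, assuming the "resize, height, width" spec format used above (the video path, timestamps, and printed shape are placeholder assumptions, not part of this commit):

from torchcodec._core import add_video_stream, create_from_file, get_frames_by_pts

# Decode three frames that come back already resized to 540x960,
# using the same spec format benchmarked above.
decoder = create_from_file("video.mp4", seek_mode="approximate")  # placeholder path
add_video_stream(decoder, transform_specs="resize, 540, 960")
frames, *_ = get_frames_by_pts(decoder, timestamps=[0.0, 1.0, 2.0])
print(frames.shape)  # e.g. torch.Size([3, 3, 540, 960]) for an NCHW layout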

test/generate_reference_resources.py

Lines changed: 47 additions & 46 deletions
@@ -6,23 +6,20 @@
 
 import subprocess
 from pathlib import Path
+from typing import Optional
 
 import numpy as np
 
 import torch
 from PIL import Image
 
-from .utils import sanitize_filtergraph_expression
+from .utils import AV1_VIDEO, H265_VIDEO, NASA_VIDEO, TestVideo
 
 # Run this script to update the resources used in unit tests. The resources are all derived
 # from source media already checked into the repo.
 
-SCRIPT_DIR = Path(__file__).resolve().parent
-TORCHCODEC_PATH = SCRIPT_DIR.parent
-RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
 
-
-def convert_image_to_tensor(image_path):
+def convert_image_to_tensor(image_path: str) -> None:
     image_path = Path(image_path)
     if not image_path.exists():
         return
@@ -37,37 +34,56 @@ def convert_image_to_tensor(image_path):
     image_path.unlink()
 
 
-def get_frame_by_index(video_path, frame, output_path, stream, filters=None):
+def generate_frame_by_index(
+    video: TestVideo,
+    *,
+    frame_index: int,
+    stream_index: int,
+    filters: Optional[str] = None,
+) -> None:
+    # Note that we are using 0-based index naming. As a result, we are
+    # generating files one-by-one, giving the actual file name that we want.
+    # ffmpeg does have an option to generate multiple files for us, but it uses
+    # 1-based indexing. We can't use 1-based indexing because we want to match
+    # the 0-based indexing in our tests.
+    base_path = video.get_base_path_by_index(
+        frame_index, stream_index=stream_index, filters=filters
+    )
+    output_bmp = f"{base_path}.bmp"
+
     # Note that we have an explicit format conversion to rgb24 in our filtergraph specification,
     # which always happens BEFORE any of the filters that we receive as input. We do this to
     # ensure that the color conversion happens BEFORE the filters, matching the behavior of the
     # torchcodec filtergraph implementation.
     #
     # Not doing this would result in the color conversion happening AFTER the filters, which
     # would result in different color values for the same frame.
-    filtergraph = f"select='eq(n\\,{frame})',format=rgb24"
+    filtergraph = f"select='eq(n\\,{frame_index})',format=rgb24"
     if filters is not None:
         filtergraph = filtergraph + f",{filters}"
 
     cmd = [
         "ffmpeg",
         "-y",
         "-i",
-        video_path,
+        video.path,
         "-map",
-        f"0:{stream}",
+        f"0:{stream_index}",
         "-vf",
         filtergraph,
         "-fps_mode",
         "passthrough",
         "-update",
         "1",
-        output_path,
+        output_bmp,
     ]
     subprocess.run(cmd, check=True)
+    convert_image_to_tensor(output_bmp)
 
 
-def get_frame_by_timestamp(video_path, timestamp, output_path):
+def generate_frame_by_timestamp(
+    video_path: str, timestamp: float, output_path: str
+) -> None:
     cmd = [
         "ffmpeg",
         "-y",
@@ -80,40 +96,32 @@ def get_frame_by_timestamp(video_path, timestamp, output_path):
         output_path,
     ]
     subprocess.run(cmd, check=True)
+    convert_image_to_tensor(output_path)
 
 
 def generate_nasa_13013_references():
-    VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4"
-
     # Note: The naming scheme used here must match the naming scheme used to load
     # tensors in ./utils.py.
-    STREAMS = [0, 3]
-    FRAMES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389]
-    for stream in STREAMS:
-        for frame in FRAMES:
-            # Note that we are using 0-based index naming. Asking ffmpeg to number output
-            # frames would result in 1-based index naming. We enforce 0-based index naming
-            # so that the name of reference frames matches the index when accessing that
-            # frame in the Python decoder.
-            output_bmp = f"{VIDEO_PATH}.stream{stream}.frame{frame:06d}.bmp"
-            get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=stream)
-            convert_image_to_tensor(output_bmp)
+    streams = [0, 3]
+    frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389]
+    for stream in streams:
+        for frame in frames:
+            generate_frame_by_index(NASA_VIDEO, frame_index=frame, stream_index=stream)
 
     # Extract individual frames at specific timestamps, including the last frame of the video.
     seek_timestamp = [6.0, 6.1, 10.0, 12.979633]
     timestamp_name = [f"{seek_timestamp:06f}" for seek_timestamp in seek_timestamp]
     for timestamp, name in zip(seek_timestamp, timestamp_name):
-        output_bmp = f"{VIDEO_PATH}.time{name}.bmp"
-        get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp)
-        convert_image_to_tensor(output_bmp)
+        output_bmp = f"{NASA_VIDEO.path}.time{name}.bmp"
+        generate_frame_by_timestamp(NASA_VIDEO.path, timestamp, output_bmp)
 
     # Extract frames with specific filters. We have tests that assume these exact filters.
-    FRAMES = [0, 15, 200, 389]
+    frames = [0, 15, 200, 389]
     crop_filter = "crop=300:200:50:35:exact=1"
-    for frame in FRAMES:
-        output_bmp = f"{VIDEO_PATH}.{sanitize_filtergraph_expression(crop_filter)}.stream3.frame{frame:06d}.bmp"
-        get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter)
-        convert_image_to_tensor(output_bmp)
+    for frame in frames:
+        generate_frame_by_index(
+            NASA_VIDEO, frame_index=frame, stream_index=3, filters=crop_filter
+        )
 
 
 def generate_h265_video_references():
@@ -122,25 +130,18 @@
     # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
     # ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y
     # Note that this video only has 1 stream, at index 0.
-    VIDEO_PATH = RESOURCES_DIR / "h265_video.mp4"
-    FRAMES = [5]
-    for frame in FRAMES:
-        output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp"
-        get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
-        convert_image_to_tensor(output_bmp)
+    frames = [5]
+    for frame in frames:
+        generate_frame_by_index(H265_VIDEO, frame_index=frame, stream_index=0)
 
 
 def generate_av1_video_references():
    # This video was generated by running the following:
    # ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv
    # Note that this video only has 1 stream, at index 0.
-    VIDEO_PATH = RESOURCES_DIR / "av1_video.mkv"
-    FRAMES = [10]
-
-    for frame in FRAMES:
-        output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp"
-        get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
-        convert_image_to_tensor(output_bmp)
+    frames = [10]
+    for frame in frames:
+        generate_frame_by_index(AV1_VIDEO, frame_index=frame, stream_index=0)
 
 
 def main():
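
For illustration, here is a sketch of the argv list that generate_frame_by_index assembles for frame 15 of stream 3 with the crop filter above. The input path follows the removed RESOURCES_DIR convention, and the output name is a placeholder, since the real one now comes from video.get_base_path_by_index:

# Hypothetical expansion for frame_index=15, stream_index=3,
# filters="crop=300:200:50:35:exact=1"; paths are placeholders.
filtergraph = "select='eq(n\\,15)',format=rgb24,crop=300:200:50:35:exact=1"
cmd = [
    "ffmpeg",
    "-y",
    "-i", "test/resources/nasa_13013.mp4",
    "-map", "0:3",  # pick stream index 3 of the first input
    "-vf", filtergraph,  # rgb24 conversion runs before the crop
    "-fps_mode", "passthrough",
    "-update", "1",  # write a single image rather than a numbered sequence
    "output.bmp",  # placeholder for {base_path}.bmp
]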
