From c749aea4714bc752f6ac62d680c506458d573139 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:28:35 -0700 Subject: [PATCH 1/4] Refactor resource generation --- test/generate_reference_resources.py | 95 +++++------ test/utils.py | 236 ++++++++++++++------------- 2 files changed, 170 insertions(+), 161 deletions(-) diff --git a/test/generate_reference_resources.py b/test/generate_reference_resources.py index fe515ebde..ccc94d614 100644 --- a/test/generate_reference_resources.py +++ b/test/generate_reference_resources.py @@ -6,23 +6,20 @@ import subprocess from pathlib import Path +from typing import Optional import numpy as np import torch from PIL import Image -from .utils import sanitize_filtergraph_expression +from .utils import AV1_VIDEO, H265_VIDEO, NASA_VIDEO, TestVideo # Run this script to update the resources used in unit tests. The resources are all derived # from source media already checked into the repo. -SCRIPT_DIR = Path(__file__).resolve().parent -TORCHCODEC_PATH = SCRIPT_DIR.parent -RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources" - -def convert_image_to_tensor(image_path): +def convert_image_to_tensor(image_path: str) -> None: image_path = Path(image_path) if not image_path.exists(): return @@ -37,7 +34,23 @@ def convert_image_to_tensor(image_path): image_path.unlink() -def get_frame_by_index(video_path, frame, output_path, stream, filters=None): +def generate_frame_by_index( + video: TestVideo, + *, + frame_index: int, + stream_index: int, + filters: Optional[str] = None, +) -> None: + # Note that we are using 0-based index naming. As a result, we are + # generating files one-by-one, giving the actual file name that we want. + # ffmpeg does have an option to generate multiple files for us, but it uses + # 1-based indexing. We can't use 1-based indexing because we want to match + # the 0-based indexing in our tests. 
+ base_path = video.get_base_path_by_index( + frame_index, stream_index=stream_index, filters=filters + ) + output_bmp = f"{base_path}.bmp" + # Note that we have an explicit format conversion to rgb24 in our filtergraph specification, # which always happens BEFORE any of the filters that we receive as input. We do this to # ensure that the color conversion happens BEFORE the filters, matching the behavior of the @@ -45,7 +58,7 @@ def get_frame_by_index(video_path, frame, output_path, stream, filters=None): # # Not doing this would result in the color conversion happening AFTER the filters, which # would result in different color values for the same frame. - filtergraph = f"select='eq(n\\,{frame})',format=rgb24" + filtergraph = f"select='eq(n\\,{frame_index})',format=rgb24" if filters is not None: filtergraph = filtergraph + f",{filters}" @@ -53,21 +66,24 @@ "ffmpeg", "-y", "-i", - video_path, + video.path, "-map", - f"0:{stream}", + f"0:{stream_index}", "-vf", filtergraph, "-fps_mode", "passthrough", "-update", "1", - output_path, + output_bmp, ] subprocess.run(cmd, check=True) + convert_image_to_tensor(output_bmp) -def get_frame_by_timestamp(video_path, timestamp, output_path): +def generate_frame_by_timestamp( + video_path: str, timestamp: float, output_path: str +) -> None: cmd = [ "ffmpeg", "-y", @@ -80,40 +96,34 @@ def generate_nasa_13013_references(): output_path, ] subprocess.run(cmd, check=True) + convert_image_to_tensor(output_path) def generate_nasa_13013_references(): - VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4" - # Note: The naming scheme used here must match the naming scheme used to load # tensors in ./utils.py. - STREAMS = [0, 3] - FRAMES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] - for stream in STREAMS: - for frame in FRAMES: - # Note that we are using 0-based index naming. 
Asking ffmpeg to number output - # frames would result in 1-based index naming. We enforce 0-based index naming - # so that the name of reference frames matches the index when accessing that - # frame in the Python decoder. - output_bmp = f"{VIDEO_PATH}.stream{stream}.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=stream) - convert_image_to_tensor(output_bmp) + streams = [0, 3] + frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] + for stream in streams: + for frame in frames: + output_bmp = generate_frame_by_index( + NASA_VIDEO, frame_index=frame, stream_index=stream + ) # Extract individual frames at specific timestamps, including the last frame of the video. seek_timestamp = [6.0, 6.1, 10.0, 12.979633] timestamp_name = [f"{seek_timestamp:06f}" for seek_timestamp in seek_timestamp] for timestamp, name in zip(seek_timestamp, timestamp_name): - output_bmp = f"{VIDEO_PATH}.time{name}.bmp" - get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp) - convert_image_to_tensor(output_bmp) + output_bmp = f"{NASA_VIDEO.path}.time{name}.bmp" + generate_frame_by_timestamp(NASA_VIDEO.path, timestamp, output_bmp) # Extract frames with specific filters. We have tests that assume these exact filters. 
- FRAMES = [0, 15, 200, 389] + frames = [0, 15, 200, 389] crop_filter = "crop=300:200:50:35:exact=1" - for frame in FRAMES: - output_bmp = f"{VIDEO_PATH}.{sanitize_filtergraph_expression(crop_filter)}.stream3.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter) - convert_image_to_tensor(output_bmp) + for frame in frames: + generate_frame_by_index( + NASA_VIDEO, frame_index=frame, stream_index=3, filters=crop_filter + ) def generate_h265_video_references(): @@ -122,25 +132,18 @@ def generate_h265_video_references(): # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz # ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y # Note that this video only has 1 stream, at index 0. - VIDEO_PATH = RESOURCES_DIR / "h265_video.mp4" - FRAMES = [5] - for frame in FRAMES: - output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0) - convert_image_to_tensor(output_bmp) + frames = [5] + for frame in frames: + generate_frame_by_index(H265_VIDEO, frame_index=frame, stream_index=0) def generate_av1_video_references(): # This video was generated by running the following: # ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv # Note that this video only has 1 stream, at index 0. 
- VIDEO_PATH = RESOURCES_DIR / "av1_video.mkv" - FRAMES = [10] - - for frame in FRAMES: - output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0) - convert_image_to_tensor(output_bmp) + frames = [10] + for frame in frames: + generate_frame_by_index(AV1_VIDEO, frame_index=frame, stream_index=0) def main(): diff --git a/test/utils.py b/test/utils.py index b59681b37..2d3ab82a8 100644 --- a/test/utils.py +++ b/test/utils.py @@ -371,6 +371,17 @@ def empty_duration_seconds(self) -> torch.Tensor: class TestVideo(TestContainerFile): """Base class for the *video* streams of a video container""" + def get_base_path_by_index( + self, idx: int, *, stream_index: int, filters: Optional[str] = None + ) -> pathlib.Path: + stream_and_frame = f"stream{stream_index}.frame{idx:06d}" + if filters is not None: + full_name = f"{self.filename}.{sanitize_filtergraph_expression(filters)}.{stream_and_frame}" + else: + full_name = f"{self.filename}.{stream_and_frame}" + + return _get_file_path(full_name) + def get_frame_data_by_index( self, idx: int, @@ -381,14 +392,8 @@ def get_frame_data_by_index( if stream_index is None: stream_index = self.default_stream_index - stream_and_frame = f"stream{stream_index}.frame{idx:06d}" - if filters is not None: - full_name = f"{self.filename}.{sanitize_filtergraph_expression(filters)}.{stream_and_frame}.pt" - else: - full_name = f"{self.filename}.{stream_and_frame}.pt" - - file_path = _get_file_path(full_name) - return torch.load(file_path, weights_only=True).permute(2, 0, 1) + tensor_file_path = f"{self.get_base_path_by_index(idx, stream_index=stream_index, filters=filters)}.pt" + return torch.load(tensor_file_path, weights_only=True).permute(2, 0, 1) def get_frame_data_by_range( self, @@ -485,6 +490,114 @@ def get_empty_chw_tensor(self, *, stream_index: int) -> torch.Tensor: ) +H265_VIDEO = TestVideo( + filename="h265_video.mp4", + default_stream_index=0, + # This metadata is extracted 
manually. + # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/h265_video.mp4 > out.json + stream_infos={ + 0: TestVideoStreamInfo(width=128, height=128, num_color_channels=3), + }, + frames={ + 0: { + 6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1), + }, + }, +) + +AV1_VIDEO = TestVideo( + filename="av1_video.mkv", + default_stream_index=0, + # This metadata is extracted manually. + # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/av1_video.mkv > out.json + stream_infos={ + 0: TestVideoStreamInfo(width=640, height=360, num_color_channels=3), + }, + frames={ + 0: { + 10: TestFrameInfo(pts_seconds=0.400000, duration_seconds=0.040000), + }, + }, +) + + +# This is a BT.709 full range video, generated with: +# ffmpeg -f lavfi -i testsrc2=duration=1:size=1920x720:rate=30 \ +# -c:v libx264 -pix_fmt yuv420p -color_primaries bt709 -color_trc bt709 \ +# -colorspace bt709 -color_range pc bt709_full_range.mp4 +# +# We can confirm the color space and color range with: +# ffprobe -v quiet -select_streams v:0 -show_entries stream=color_space,color_transfer,color_primaries,color_range -of default=noprint_wrappers=1 test/resources/bt709_full_range.mp4 +# color_range=pc +# color_space=bt709 +# color_transfer=bt709 +# color_primaries=bt709 +BT709_FULL_RANGE = TestVideo( + filename="bt709_full_range.mp4", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=duration=2:size=1280x720:rate=30 -c:v libx264 -profile:v baseline -level 3.1 -pix_fmt yuv420p -b:v 2500k -r 30 -movflags +faststart output_720p_2s.mp4 +TEST_SRC_2_720P = TestVideo( + filename="testsrc2.mp4", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) +# ffmpeg -f lavfi -i 
testsrc2=duration=10:size=1280x720:rate=30 -c:v libx265 -crf 23 -preset medium output.mp4 +TEST_SRC_2_720P_H265 = TestVideo( + filename="testsrc2_h265.mp4", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm +TEST_SRC_2_720P_VP9 = TestVideo( + filename="testsrc2_vp9.webm", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm +TEST_SRC_2_720P_VP8 = TestVideo( + filename="testsrc2_vp8.webm", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi +TEST_SRC_2_720P_MPEG4 = TestVideo( + filename="testsrc2_mpeg4.avi", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + + +def supports_approximate_mode(asset: TestVideo) -> bool: + # Those are missing the `duration` field so they fail in approximate mode (on all devices). + # TODO: we should address this, see + # https://github.com/meta-pytorch/torchcodec/issues/945 + return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8) + + @dataclass class TestAudio(TestContainerFile): """Base class for the *audio* streams of a container (potentially a video), @@ -698,110 +811,3 @@ def sample_format(self) -> str: ) }, ) - -H265_VIDEO = TestVideo( - filename="h265_video.mp4", - default_stream_index=0, - # This metadata is extracted manually. 
- # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/h265_video.mp4 > out.json - stream_infos={ - 0: TestVideoStreamInfo(width=128, height=128, num_color_channels=3), - }, - frames={ - 0: { - 6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1), - }, - }, -) - -AV1_VIDEO = TestVideo( - filename="av1_video.mkv", - default_stream_index=0, - # This metadata is extracted manually. - # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/av1_video.mkv > out.json - stream_infos={ - 0: TestVideoStreamInfo(width=640, height=360, num_color_channels=3), - }, - frames={ - 0: { - 10: TestFrameInfo(pts_seconds=0.400000, duration_seconds=0.040000), - }, - }, -) - - -# This is a BT.709 full range video, generated with: -# ffmpeg -f lavfi -i testsrc2=duration=1:size=1920x720:rate=30 \ -# -c:v libx264 -pix_fmt yuv420p -color_primaries bt709 -color_trc bt709 \ -# -colorspace bt709 -color_range pc bt709_full_range.mp4 -# -# We can confirm the color space and color range with: -# ffprobe -v quiet -select_streams v:0 -show_entries stream=color_space,color_transfer,color_primaries,color_range -of default=noprint_wrappers=1 test/resources/bt709_full_range.mp4 -# color_range=pc -# color_space=bt709 -# color_transfer=bt709 -# color_primaries=bt709 -BT709_FULL_RANGE = TestVideo( - filename="bt709_full_range.mp4", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=duration=2:size=1280x720:rate=30 -c:v libx264 -profile:v baseline -level 3.1 -pix_fmt yuv420p -b:v 2500k -r 30 -movflags +faststart output_720p_2s.mp4 -TEST_SRC_2_720P = TestVideo( - filename="testsrc2.mp4", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) -# ffmpeg -f lavfi -i 
testsrc2=duration=10:size=1280x720:rate=30 -c:v libx265 -crf 23 -preset medium output.mp4 -TEST_SRC_2_720P_H265 = TestVideo( - filename="testsrc2_h265.mp4", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm -TEST_SRC_2_720P_VP9 = TestVideo( - filename="testsrc2_vp9.webm", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm -TEST_SRC_2_720P_VP8 = TestVideo( - filename="testsrc2_vp8.webm", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi -TEST_SRC_2_720P_MPEG4 = TestVideo( - filename="testsrc2_mpeg4.avi", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - - -def supports_approximate_mode(asset: TestVideo) -> bool: - # Those are missing the `duration` field so they fail in approximate mode (on all devices). 
- # TODO: we should address this, see - # https://github.com/meta-pytorch/torchcodec/issues/945 - return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8) From 9bb3a25ed3eeac19a5b28e1a123a325a1fa81bc2 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:31:33 -0700 Subject: [PATCH 2/4] No return value --- test/generate_reference_resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/generate_reference_resources.py b/test/generate_reference_resources.py index ccc94d614..5c5a71e00 100644 --- a/test/generate_reference_resources.py +++ b/test/generate_reference_resources.py @@ -106,7 +106,7 @@ def generate_nasa_13013_references(): frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] for stream in streams: for frame in frames: - output_bmp = generate_frame_by_index( + generate_frame_by_index( NASA_VIDEO, frame_index=frame, stream_index=stream ) From f4d5f448b44b40c3fc22acee3622ef99170f3e4d Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:36:07 -0700 Subject: [PATCH 3/4] Lint --- test/generate_reference_resources.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/generate_reference_resources.py b/test/generate_reference_resources.py index 5c5a71e00..953fb996e 100644 --- a/test/generate_reference_resources.py +++ b/test/generate_reference_resources.py @@ -106,9 +106,7 @@ def generate_nasa_13013_references(): frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] for stream in streams: for frame in frames: - generate_frame_by_index( - NASA_VIDEO, frame_index=frame, stream_index=stream - ) + generate_frame_by_index(NASA_VIDEO, frame_index=frame, stream_index=stream) # Extract individual frames at specific timestamps, including the last frame of the video. 
seek_timestamp = [6.0, 6.1, 10.0, 12.979633] From ec22aa82d5c85b83b7af43298b2dd21d88093d4d Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:43:18 -0700 Subject: [PATCH 4/4] Breakup string --- test/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/utils.py b/test/utils.py index 2d3ab82a8..cbd6a5bf4 100644 --- a/test/utils.py +++ b/test/utils.py @@ -392,7 +392,10 @@ def get_frame_data_by_index( if stream_index is None: stream_index = self.default_stream_index - tensor_file_path = f"{self.get_base_path_by_index(idx, stream_index=stream_index, filters=filters)}.pt" + base_path = self.get_base_path_by_index( + idx, stream_index=stream_index, filters=filters + ) + tensor_file_path = f"{base_path}.pt" return torch.load(tensor_file_path, weights_only=True).permute(2, 0, 1) def get_frame_data_by_range(