From c749aea4714bc752f6ac62d680c506458d573139 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:28:35 -0700 Subject: [PATCH 1/4] Refactor resource generation --- test/generate_reference_resources.py | 95 +++++------ test/utils.py | 236 ++++++++++++++------------- 2 files changed, 170 insertions(+), 161 deletions(-) diff --git a/test/generate_reference_resources.py b/test/generate_reference_resources.py index fe515ebde..ccc94d614 100644 --- a/test/generate_reference_resources.py +++ b/test/generate_reference_resources.py @@ -6,23 +6,20 @@ import subprocess from pathlib import Path +from typing import Optional import numpy as np import torch from PIL import Image -from .utils import sanitize_filtergraph_expression +from .utils import AV1_VIDEO, H265_VIDEO, NASA_VIDEO, TestVideo # Run this script to update the resources used in unit tests. The resources are all derived # from source media already checked into the repo. -SCRIPT_DIR = Path(__file__).resolve().parent -TORCHCODEC_PATH = SCRIPT_DIR.parent -RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources" - -def convert_image_to_tensor(image_path): +def convert_image_to_tensor(image_path: str) -> None: image_path = Path(image_path) if not image_path.exists(): return @@ -37,7 +34,23 @@ def convert_image_to_tensor(image_path): image_path.unlink() -def get_frame_by_index(video_path, frame, output_path, stream, filters=None): +def generate_frame_by_index( + video: TestVideo, + *, + frame_index: int, + stream_index: int, + filters: Optional[str] = None, +) -> None: + # Note that we are using 0-based index naming. As a result, we are + # generating files one-by-one, giving the actual file name that we want. + # ffmpeg does have an option to generate multiple files for us, but it uses + # 1-based indexing. We can't use 1-based indexing because we want to match + # the 0-based indexing in our tests. 
+ base_path = video.get_base_path_by_index( + frame_index, stream_index=stream_index, filters=filters + ) + output_bmp = f"{base_path}.bmp" + # Note that we have an explicit format conversion to rgb24 in our filtergraph specification, # which always happens BEFORE any of the filters that we receive as input. We do this to # ensure that the color conversion happens BEFORE the filters, matching the behavior of the @@ -45,7 +58,7 @@ def get_frame_by_index(video_path, frame, output_path, stream, filters=None): # # Not doing this would result in the color conversion happening AFTER the filters, which # would result in different color values for the same frame. - filtergraph = f"select='eq(n\\,{frame})',format=rgb24" + filtergraph = f"select='eq(n\\,{frame_index})',format=rgb24" if filters is not None: filtergraph = filtergraph + f",{filters}" @@ -53,21 +66,24 @@ "ffmpeg", "-y", "-i", - video_path, + video.path, "-map", - f"0:{stream}", + f"0:{stream_index}", "-vf", filtergraph, "-fps_mode", "passthrough", "-update", "1", - output_path, + output_bmp, ] subprocess.run(cmd, check=True) + convert_image_to_tensor(output_bmp) -def get_frame_by_timestamp(video_path, timestamp, output_path): +def generate_frame_by_timestamp( + video_path: str, timestamp: float, output_path: str +) -> None: cmd = [ "ffmpeg", "-y", @@ -80,40 +96,34 @@ def generate_nasa_13013_references(): output_path, ] subprocess.run(cmd, check=True) + convert_image_to_tensor(output_path) def generate_nasa_13013_references(): - VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4" - # Note: The naming scheme used here must match the naming scheme used to load # tensors in ./utils.py. - STREAMS = [0, 3] - FRAMES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] - for stream in STREAMS: - for frame in FRAMES: - # Note that we are using 0-based index naming. 
Asking ffmpeg to number output - # frames would result in 1-based index naming. We enforce 0-based index naming - # so that the name of reference frames matches the index when accessing that - # frame in the Python decoder. - output_bmp = f"{VIDEO_PATH}.stream{stream}.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=stream) - convert_image_to_tensor(output_bmp) + streams = [0, 3] + frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] + for stream in streams: + for frame in frames: + output_bmp = generate_frame_by_index( + NASA_VIDEO, frame_index=frame, stream_index=stream + ) # Extract individual frames at specific timestamps, including the last frame of the video. seek_timestamp = [6.0, 6.1, 10.0, 12.979633] timestamp_name = [f"{seek_timestamp:06f}" for seek_timestamp in seek_timestamp] for timestamp, name in zip(seek_timestamp, timestamp_name): - output_bmp = f"{VIDEO_PATH}.time{name}.bmp" - get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp) - convert_image_to_tensor(output_bmp) + output_bmp = f"{NASA_VIDEO.path}.time{name}.bmp" + generate_frame_by_timestamp(NASA_VIDEO.path, timestamp, output_bmp) # Extract frames with specific filters. We have tests that assume these exact filters. 
- FRAMES = [0, 15, 200, 389] + frames = [0, 15, 200, 389] crop_filter = "crop=300:200:50:35:exact=1" - for frame in FRAMES: - output_bmp = f"{VIDEO_PATH}.{sanitize_filtergraph_expression(crop_filter)}.stream3.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter) - convert_image_to_tensor(output_bmp) + for frame in frames: + generate_frame_by_index( + NASA_VIDEO, frame_index=frame, stream_index=3, filters=crop_filter + ) def generate_h265_video_references(): @@ -122,25 +132,18 @@ def generate_h265_video_references(): # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz # ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y # Note that this video only has 1 stream, at index 0. - VIDEO_PATH = RESOURCES_DIR / "h265_video.mp4" - FRAMES = [5] - for frame in FRAMES: - output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0) - convert_image_to_tensor(output_bmp) + frames = [5] + for frame in frames: + generate_frame_by_index(H265_VIDEO, frame_index=frame, stream_index=0) def generate_av1_video_references(): # This video was generated by running the following: # ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv # Note that this video only has 1 stream, at index 0. 
- VIDEO_PATH = RESOURCES_DIR / "av1_video.mkv" - FRAMES = [10] - - for frame in FRAMES: - output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp" - get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0) - convert_image_to_tensor(output_bmp) + frames = [10] + for frame in frames: + generate_frame_by_index(AV1_VIDEO, frame_index=frame, stream_index=0) def main(): diff --git a/test/utils.py b/test/utils.py index b59681b37..2d3ab82a8 100644 --- a/test/utils.py +++ b/test/utils.py @@ -371,6 +371,17 @@ def empty_duration_seconds(self) -> torch.Tensor: class TestVideo(TestContainerFile): """Base class for the *video* streams of a video container""" + def get_base_path_by_index( + self, idx: int, *, stream_index: int, filters: Optional[str] = None + ) -> pathlib.Path: + stream_and_frame = f"stream{stream_index}.frame{idx:06d}" + if filters is not None: + full_name = f"{self.filename}.{sanitize_filtergraph_expression(filters)}.{stream_and_frame}" + else: + full_name = f"{self.filename}.{stream_and_frame}" + + return _get_file_path(full_name) + def get_frame_data_by_index( self, idx: int, @@ -381,14 +392,8 @@ def get_frame_data_by_index( if stream_index is None: stream_index = self.default_stream_index - stream_and_frame = f"stream{stream_index}.frame{idx:06d}" - if filters is not None: - full_name = f"{self.filename}.{sanitize_filtergraph_expression(filters)}.{stream_and_frame}.pt" - else: - full_name = f"{self.filename}.{stream_and_frame}.pt" - - file_path = _get_file_path(full_name) - return torch.load(file_path, weights_only=True).permute(2, 0, 1) + tensor_file_path = f"{self.get_base_path_by_index(idx, stream_index=stream_index, filters=filters)}.pt" + return torch.load(tensor_file_path, weights_only=True).permute(2, 0, 1) def get_frame_data_by_range( self, @@ -485,6 +490,114 @@ def get_empty_chw_tensor(self, *, stream_index: int) -> torch.Tensor: ) +H265_VIDEO = TestVideo( + filename="h265_video.mp4", + default_stream_index=0, + # This metadata is extracted 
manually. + # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/h265_video.mp4 > out.json + stream_infos={ + 0: TestVideoStreamInfo(width=128, height=128, num_color_channels=3), + }, + frames={ + 0: { + 6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1), + }, + }, +) + +AV1_VIDEO = TestVideo( + filename="av1_video.mkv", + default_stream_index=0, + # This metadata is extracted manually. + # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/av1_video.mkv > out.json + stream_infos={ + 0: TestVideoStreamInfo(width=640, height=360, num_color_channels=3), + }, + frames={ + 0: { + 10: TestFrameInfo(pts_seconds=0.400000, duration_seconds=0.040000), + }, + }, +) + + +# This is a BT.709 full range video, generated with: +# ffmpeg -f lavfi -i testsrc2=duration=1:size=1920x720:rate=30 \ +# -c:v libx264 -pix_fmt yuv420p -color_primaries bt709 -color_trc bt709 \ +# -colorspace bt709 -color_range pc bt709_full_range.mp4 +# +# We can confirm the color space and color range with: +# ffprobe -v quiet -select_streams v:0 -show_entries stream=color_space,color_transfer,color_primaries,color_range -of default=noprint_wrappers=1 test/resources/bt709_full_range.mp4 +# color_range=pc +# color_space=bt709 +# color_transfer=bt709 +# color_primaries=bt709 +BT709_FULL_RANGE = TestVideo( + filename="bt709_full_range.mp4", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=duration=2:size=1280x720:rate=30 -c:v libx264 -profile:v baseline -level 3.1 -pix_fmt yuv420p -b:v 2500k -r 30 -movflags +faststart output_720p_2s.mp4 +TEST_SRC_2_720P = TestVideo( + filename="testsrc2.mp4", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) +# ffmpeg -f lavfi -i 
testsrc2=duration=10:size=1280x720:rate=30 -c:v libx265 -crf 23 -preset medium output.mp4 +TEST_SRC_2_720P_H265 = TestVideo( + filename="testsrc2_h265.mp4", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm +TEST_SRC_2_720P_VP9 = TestVideo( + filename="testsrc2_vp9.webm", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm +TEST_SRC_2_720P_VP8 = TestVideo( + filename="testsrc2_vp8.webm", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + +# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi +TEST_SRC_2_720P_MPEG4 = TestVideo( + filename="testsrc2_mpeg4.avi", + default_stream_index=0, + stream_infos={ + 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), + }, + frames={0: {}}, # Not needed for now +) + + +def supports_approximate_mode(asset: TestVideo) -> bool: + # Those are missing the `duration` field so they fail in approximate mode (on all devices). + # TODO: we should address this, see + # https://github.com/meta-pytorch/torchcodec/issues/945 + return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8) + + @dataclass class TestAudio(TestContainerFile): """Base class for the *audio* streams of a container (potentially a video), @@ -698,110 +811,3 @@ def sample_format(self) -> str: ) }, ) - -H265_VIDEO = TestVideo( - filename="h265_video.mp4", - default_stream_index=0, - # This metadata is extracted manually. 
- # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/h265_video.mp4 > out.json - stream_infos={ - 0: TestVideoStreamInfo(width=128, height=128, num_color_channels=3), - }, - frames={ - 0: { - 6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1), - }, - }, -) - -AV1_VIDEO = TestVideo( - filename="av1_video.mkv", - default_stream_index=0, - # This metadata is extracted manually. - # $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/av1_video.mkv > out.json - stream_infos={ - 0: TestVideoStreamInfo(width=640, height=360, num_color_channels=3), - }, - frames={ - 0: { - 10: TestFrameInfo(pts_seconds=0.400000, duration_seconds=0.040000), - }, - }, -) - - -# This is a BT.709 full range video, generated with: -# ffmpeg -f lavfi -i testsrc2=duration=1:size=1920x720:rate=30 \ -# -c:v libx264 -pix_fmt yuv420p -color_primaries bt709 -color_trc bt709 \ -# -colorspace bt709 -color_range pc bt709_full_range.mp4 -# -# We can confirm the color space and color range with: -# ffprobe -v quiet -select_streams v:0 -show_entries stream=color_space,color_transfer,color_primaries,color_range -of default=noprint_wrappers=1 test/resources/bt709_full_range.mp4 -# color_range=pc -# color_space=bt709 -# color_transfer=bt709 -# color_primaries=bt709 -BT709_FULL_RANGE = TestVideo( - filename="bt709_full_range.mp4", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=duration=2:size=1280x720:rate=30 -c:v libx264 -profile:v baseline -level 3.1 -pix_fmt yuv420p -b:v 2500k -r 30 -movflags +faststart output_720p_2s.mp4 -TEST_SRC_2_720P = TestVideo( - filename="testsrc2.mp4", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) -# ffmpeg -f lavfi -i 
testsrc2=duration=10:size=1280x720:rate=30 -c:v libx265 -crf 23 -preset medium output.mp4 -TEST_SRC_2_720P_H265 = TestVideo( - filename="testsrc2_h265.mp4", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm -TEST_SRC_2_720P_VP9 = TestVideo( - filename="testsrc2_vp9.webm", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm -TEST_SRC_2_720P_VP8 = TestVideo( - filename="testsrc2_vp8.webm", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - -# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi -TEST_SRC_2_720P_MPEG4 = TestVideo( - filename="testsrc2_mpeg4.avi", - default_stream_index=0, - stream_infos={ - 0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3), - }, - frames={0: {}}, # Not needed for now -) - - -def supports_approximate_mode(asset: TestVideo) -> bool: - # Those are missing the `duration` field so they fail in approximate mode (on all devices). 
- # TODO: we should address this, see - # https://github.com/meta-pytorch/torchcodec/issues/945 - return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8) From 9bb3a25ed3eeac19a5b28e1a123a325a1fa81bc2 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:31:33 -0700 Subject: [PATCH 2/4] No return value --- test/generate_reference_resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/generate_reference_resources.py b/test/generate_reference_resources.py index ccc94d614..5c5a71e00 100644 --- a/test/generate_reference_resources.py +++ b/test/generate_reference_resources.py @@ -106,7 +106,7 @@ def generate_nasa_13013_references(): frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] for stream in streams: for frame in frames: - output_bmp = generate_frame_by_index( + generate_frame_by_index( NASA_VIDEO, frame_index=frame, stream_index=stream ) From f4d5f448b44b40c3fc22acee3622ef99170f3e4d Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:36:07 -0700 Subject: [PATCH 3/4] Lint --- test/generate_reference_resources.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/generate_reference_resources.py b/test/generate_reference_resources.py index 5c5a71e00..953fb996e 100644 --- a/test/generate_reference_resources.py +++ b/test/generate_reference_resources.py @@ -106,9 +106,7 @@ def generate_nasa_13013_references(): frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389] for stream in streams: for frame in frames: - generate_frame_by_index( - NASA_VIDEO, frame_index=frame, stream_index=stream - ) + generate_frame_by_index(NASA_VIDEO, frame_index=frame, stream_index=stream) # Extract individual frames at specific timestamps, including the last frame of the video. 
seek_timestamp = [6.0, 6.1, 10.0, 12.979633] From ec22aa82d5c85b83b7af43298b2dd21d88093d4d Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Sat, 18 Oct 2025 11:43:18 -0700 Subject: [PATCH 4/4] Breakup string --- test/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/utils.py b/test/utils.py index 2d3ab82a8..cbd6a5bf4 100644 --- a/test/utils.py +++ b/test/utils.py @@ -392,7 +392,10 @@ def get_frame_data_by_index( if stream_index is None: stream_index = self.default_stream_index - tensor_file_path = f"{self.get_base_path_by_index(idx, stream_index=stream_index, filters=filters)}.pt" + base_path = self.get_base_path_by_index( + idx, stream_index=stream_index, filters=filters + ) + tensor_file_path = f"{base_path}.pt" return torch.load(tensor_file_path, weights_only=True).permute(2, 0, 1) def get_frame_data_by_range(