From c8d546e3ec49bcb04dbe0b18126b2a7ef36b925f Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 13:35:54 -0700
Subject: [PATCH 1/4] Transforms bridge between Python and C++

---
 src/torchcodec/_core/custom_ops.cpp | 77 +++++++++++++------
 src/torchcodec/_core/ops.py         |  6 +-
 .../_samplers/video_clip_sampler.py |  3 +-
 test/test_ops.py                    | 14 ++--
 4 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 7010c436a..e9df65203 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -43,9 +43,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def(
-      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
+      "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
-      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
+      "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
   m.def(
       "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -183,6 +183,50 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   }
 }
 
+Transform* makeResizeTransform(
+    const std::vector<std::string>& resizeTransformSpec) {
+  TORCH_CHECK(
+      resizeTransformSpec.size() == 3,
+      "resizeTransformSpec must have 3 elements including its name");
+  int height = std::stoi(resizeTransformSpec[1]);
+  int width = std::stoi(resizeTransformSpec[2]);
+  return new ResizeTransform(FrameDims(height, width));
+}
+
+std::vector<std::string> split(const std::string& str, char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(str);
+  while (std::getline(tokenStream, token, delimiter)) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+// The transformSpecsRaw string is always in the format:
+//
+//   "name1, param1, param2, ...; name2, param1, param2, ...; ..."
+//
+// Where "nameX" is the name of the transform, and "paramX" are the parameters.
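+//
+// For example, "resize, 135, 240" specifies a single resize transform to a
+// height of 135 and a width of 240, and multiple transforms are separated by
+// semicolons. (The numeric values here are illustrative only.)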
+std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
+  std::vector<Transform*> transforms;
+  std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
+  for (const std::string& transformSpecRaw : transformSpecs) {
+    std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
+    TORCH_CHECK(
+        transformSpec.size() >= 1,
+        "Invalid transform spec: " + transformSpecRaw);
+
+    auto name = transformSpec[0];
+    if (name == "resize") {
+      transforms.push_back(makeResizeTransform(transformSpec));
+    } else {
+      TORCH_CHECK(false, "Invalid transform name: " + name);
+    }
+  }
+  return transforms;
+}
+
 } // namespace
 
 // ==============================
@@ -252,36 +296,18 @@ at::Tensor _create_from_file_like(
 
 void _add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
         custom_frame_mappings = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.ffmpegThreadCount = num_threads;
 
-  // TODO: Eliminate this temporary bridge code. This exists because we have
-  //       not yet exposed the transforms API on the Python side. We also want
-  //       to remove the `width` and `height` arguments from the Python API.
-  //
-  // TEMPORARY BRIDGE CODE START
-  TORCH_CHECK(
-      width.has_value() == height.has_value(),
-      "width and height must both be set or unset.");
-  std::vector<Transform*> transforms;
-  if (width.has_value()) {
-    transforms.push_back(
-        new ResizeTransform(FrameDims(height.value(), width.value())));
-    width.reset();
-    height.reset();
-  }
-  // TEMPORARY BRIDGE CODE END
-
   if (dimension_order.has_value()) {
     std::string stdDimensionOrder{dimension_order.value()};
     TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
@@ -309,6 +335,9 @@ void _add_video_stream(
   videoStreamOptions.device = torch::Device(std::string(device));
   videoStreamOptions.deviceVariant = device_variant;
 
+  std::vector<Transform*> transforms =
+      makeTransforms(std::string(transform_specs));
+
   std::optional<FrameMappings> converted_mappings = custom_frame_mappings.has_value()
       ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
       : std::nullopt;
@@ -324,24 +353,22 @@ void _add_video_stream(
 
 // Add a new video stream at `stream_index` using the provided options.
 void add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
         custom_frame_mappings = std::nullopt) {
   _add_video_stream(
       decoder,
-      width,
-      height,
       num_threads,
       dimension_order,
       stream_index,
       device,
       device_variant,
+      transform_specs,
       custom_frame_mappings);
 }
 
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index 64298eab5..f8a0df980 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -299,13 +299,12 @@ def create_from_tensor_abstract(
 def _add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -318,13 +317,12 @@ def add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py
index 06976ba78..343728393 100644
--- a/src/torchcodec/_samplers/video_clip_sampler.py
+++ b/src/torchcodec/_samplers/video_clip_sampler.py
@@ -147,8 +147,7 @@ def forward(self, video_data: Tensor) -> Union[List[Any]]:
         scan_all_streams_to_update_metadata(video_decoder)
         add_video_stream(
             video_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             num_threads=self.decoder_args.num_threads,
         )
 
diff --git a/test/test_ops.py b/test/test_ops.py
index f16dd63ad..548442ebf 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -631,8 +631,7 @@ def test_color_conversion_library_with_scaling(
         filtergraph_decoder = create_from_file(str(input_video.path))
         _add_video_stream(
             filtergraph_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="filtergraph",
         )
         filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
@@ -640,8 +639,7 @@ def test_color_conversion_library_with_scaling(
         swscale_decoder = create_from_file(str(input_video.path))
         _add_video_stream(
             swscale_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="swscale",
         )
         swscale_frame0, _, _ = get_next_frame(swscale_decoder)
@@ -655,7 +653,7 @@ def test_scaling_on_cuda_fails(self):
             RuntimeError,
             match="Transforms are only supported for CPU devices.",
         ):
-            add_video_stream(decoder, device="cuda", width=100, height=100)
+            add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")
 
     @pytest.mark.parametrize("dimension_order", ("NHWC", "NCHW"))
     @pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
@@ -763,8 +761,7 @@ def test_color_conversion_library_with_generated_videos(
         filtergraph_decoder = create_from_file(str(video_path))
         _add_video_stream(
             filtergraph_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="filtergraph",
         )
         filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
@@ -772,8 +769,7 @@ def test_color_conversion_library_with_generated_videos(
         auto_decoder = create_from_file(str(video_path))
         add_video_stream(
             auto_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
         )
         auto_frame0, _, _ = get_next_frame(auto_decoder)
         assert_frames_equal(filtergraph_frame0, auto_frame0)

From 7143f15097174737d583aa4a0a0dfdf13cb4e1ae Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 18:51:53 -0700
Subject: [PATCH 2/4] Better error checking

---
 benchmarks/decoders/gpu_benchmark.py |  3 +-
 src/torchcodec/_core/custom_ops.cpp  | 17 ++++++++--
 test/test_ops.py                     | 46 ++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/benchmarks/decoders/gpu_benchmark.py b/benchmarks/decoders/gpu_benchmark.py
index a22691883..94688b58b 100644
--- a/benchmarks/decoders/gpu_benchmark.py
+++ b/benchmarks/decoders/gpu_benchmark.py
@@ -39,8 +39,7 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
         stream_index=-1,
         device=decode_device_string,
         num_threads=num_threads,
-        width=width,
-        height=height,
+        transform_specs=f"resize, {height}, {width}",
     )
 
     start_time = time.time()
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index e9df65203..1ca679775 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -183,13 +183,26 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   }
 }
 
+int checkedToPositiveInt(const std::string& str) {
+  int ret = 0;
+  try {
+    ret = std::stoi(str);
+  } catch (const std::invalid_argument&) {
+    TORCH_CHECK(false, "String cannot be converted to an int:" + str);
+  } catch (const std::out_of_range&) {
+    TORCH_CHECK(false, "String would become integer out of range:" + str);
+  }
+  TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
+  return ret;
+}
+
 Transform* makeResizeTransform(
     const std::vector<std::string>& resizeTransformSpec) {
   TORCH_CHECK(
       resizeTransformSpec.size() == 3,
       "resizeTransformSpec must have 3 elements including its name");
-  int height = std::stoi(resizeTransformSpec[1]);
-  int width = std::stoi(resizeTransformSpec[2]);
+  int height = checkedToPositiveInt(resizeTransformSpec[1]);
+  int width = checkedToPositiveInt(resizeTransformSpec[2]);
   return new ResizeTransform(FrameDims(height, width));
 }
 
diff --git a/test/test_ops.py b/test/test_ops.py
index 548442ebf..c3233a4f9 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -655,6 +655,52 @@ def test_scaling_on_cuda_fails(self):
         ):
             add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")
 
+    def test_transform_fails(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            RuntimeError,
+            match="Invalid transform spec",
+        ):
+            add_video_stream(decoder, transform_specs=";")
+
+        with pytest.raises(
+            RuntimeError,
+            match="Invalid transform name",
+        ):
+            add_video_stream(decoder, transform_specs="invalid, 1, 2")
+
+    def test_resize_transform_fails(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            RuntimeError,
+            match="must have 3 elements",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 100, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="must be a positive integer",
+        ):
+            add_video_stream(decoder, transform_specs="resize, -10, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="must be a positive integer",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 0")
+
+        with pytest.raises(
+            RuntimeError,
+            match="cannot be converted to an int",
+        ):
+            add_video_stream(decoder, transform_specs="resize, blah, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="out of range",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 1000000000000")
+
     @pytest.mark.parametrize("dimension_order", ("NHWC", "NCHW"))
     @pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
     def test_color_conversion_library_with_dimension_order(

From a964447cc99ca3b42709641c4ffaeb4b7b57c292 Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 19:24:47 -0700
Subject: [PATCH 3/4] Fix GPU benchmark

---
 benchmarks/decoders/gpu_benchmark.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/decoders/gpu_benchmark.py b/benchmarks/decoders/gpu_benchmark.py
index 94688b58b..4300643dd 100644
--- a/benchmarks/decoders/gpu_benchmark.py
+++ b/benchmarks/decoders/gpu_benchmark.py
@@ -29,17 +29,17 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
     num_threads = None
     if "cuda" in decode_device_string:
         num_threads = 1
-    width = None
-    height = None
+
+    resize_spec = ""
     if "native" in resize_device_string:
-        width = RESIZED_WIDTH
-        height = RESIZED_HEIGHT
+        resize_spec = f"resize, {RESIZED_HEIGHT}, {RESIZED_WIDTH}"
+
     torchcodec._core._add_video_stream(
         decoder,
         stream_index=-1,
         device=decode_device_string,
         num_threads=num_threads,
-        transform_specs=f"resize, {height}, {width}",
+        transform_specs=resize_spec,
     )
 
     start_time = time.time()

From 3db7aaa35c170a8eeab5f855a4f8dfbbbbedc29b Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 19:33:40 -0700
Subject: [PATCH 4/4] Better comments

---
 src/torchcodec/_core/custom_ops.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 1ca679775..57753ad42 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -196,6 +196,12 @@ int checkedToPositiveInt(const std::string& str) {
   return ret;
 }
 
+// Resize transform specs take the form:
+//
+//   "resize, <height>, <width>"
+//
+// Where "resize" is the string literal and <height> and <width> are positive
+// integers.
 Transform* makeResizeTransform(
     const std::vector<std::string>& resizeTransformSpec) {
   TORCH_CHECK(