
Commit e5b2eef

Transforms bridge between Python and C++ (#948)
1 parent b9f77b5 commit e5b2eef


5 files changed: +130 -46 lines


benchmarks/decoders/gpu_benchmark.py

Lines changed: 5 additions & 6 deletions
@@ -29,18 +29,17 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
     num_threads = None
     if "cuda" in decode_device_string:
         num_threads = 1
-    width = None
-    height = None
+
+    resize_spec = ""
     if "native" in resize_device_string:
-        width = RESIZED_WIDTH
-        height = RESIZED_HEIGHT
+        resize_spec = f"resize, {RESIZED_HEIGHT}, {RESIZED_WIDTH}"
+
     torchcodec._core._add_video_stream(
         decoder,
         stream_index=-1,
         device=decode_device_string,
         num_threads=num_threads,
-        width=width,
-        height=height,
+        transform_specs=resize_spec,
     )
 
     start_time = time.time()
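
The benchmark now communicates the optional native resize as a single spec string instead of separate width/height arguments. A minimal sketch of the two strings this branch can produce (the RESIZED_HEIGHT and RESIZED_WIDTH values below are placeholders for illustration, not the benchmark's actual constants):

    RESIZED_HEIGHT, RESIZED_WIDTH = 270, 480  # placeholder values for illustration

    # No native resize requested: an empty spec string means "apply no transforms".
    resize_spec = ""

    # Native resize requested: note that height comes before width in the spec.
    resize_spec = f"resize, {RESIZED_HEIGHT}, {RESIZED_WIDTH}"
    print(resize_spec)  # -> "resize, 270, 480"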

src/torchcodec/_core/custom_ops.cpp

Lines changed: 71 additions & 25 deletions
@@ -43,9 +43,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def(
-      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
+      "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
-      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
+      "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
   m.def(
       "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -183,6 +183,69 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   }
 }
 
+int checkedToPositiveInt(const std::string& str) {
+  int ret = 0;
+  try {
+    ret = std::stoi(str);
+  } catch (const std::invalid_argument&) {
+    TORCH_CHECK(false, "String cannot be converted to an int:" + str);
+  } catch (const std::out_of_range&) {
+    TORCH_CHECK(false, "String would become integer out of range:" + str);
+  }
+  TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
+  return ret;
+}
+
+// Resize transform specs take the form:
+//
+//   "resize, <height>, <width>"
+//
+// Where "resize" is the string literal and <height> and <width> are positive
+// integers.
+Transform* makeResizeTransform(
+    const std::vector<std::string>& resizeTransformSpec) {
+  TORCH_CHECK(
+      resizeTransformSpec.size() == 3,
+      "resizeTransformSpec must have 3 elements including its name");
+  int height = checkedToPositiveInt(resizeTransformSpec[1]);
+  int width = checkedToPositiveInt(resizeTransformSpec[2]);
+  return new ResizeTransform(FrameDims(height, width));
+}
+
+std::vector<std::string> split(const std::string& str, char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(str);
+  while (std::getline(tokenStream, token, delimiter)) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+// The transformSpecsRaw string is always in the format:
+//
+//   "name1, param1, param2, ...; name2, param1, param2, ...; ..."
+//
+// Where "nameX" is the name of the transform, and "paramX" are the parameters.
+std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
+  std::vector<Transform*> transforms;
+  std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
+  for (const std::string& transformSpecRaw : transformSpecs) {
+    std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
+    TORCH_CHECK(
+        transformSpec.size() >= 1,
+        "Invalid transform spec: " + transformSpecRaw);
+
+    auto name = transformSpec[0];
+    if (name == "resize") {
+      transforms.push_back(makeResizeTransform(transformSpec));
+    } else {
+      TORCH_CHECK(false, "Invalid transform name: " + name);
+    }
+  }
+  return transforms;
+}
+
 } // namespace
 
 // ==============================
@@ -252,36 +315,18 @@ at::Tensor _create_from_file_like(
 
 void _add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
    std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
         custom_frame_mappings = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.ffmpegThreadCount = num_threads;
 
-  // TODO: Eliminate this temporary bridge code. This exists because we have
-  // not yet exposed the transforms API on the Python side. We also want
-  // to remove the `width` and `height` arguments from the Python API.
-  //
-  // TEMPORARY BRIDGE CODE START
-  TORCH_CHECK(
-      width.has_value() == height.has_value(),
-      "width and height must both be set or unset.");
-  std::vector<Transform*> transforms;
-  if (width.has_value()) {
-    transforms.push_back(
-        new ResizeTransform(FrameDims(height.value(), width.value())));
-    width.reset();
-    height.reset();
-  }
-  // TEMPORARY BRIDGE CODE END
-
   if (dimension_order.has_value()) {
     std::string stdDimensionOrder{dimension_order.value()};
     TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
@@ -309,6 +354,9 @@ void _add_video_stream(
   videoStreamOptions.device = torch::Device(std::string(device));
   videoStreamOptions.deviceVariant = device_variant;
 
+  std::vector<Transform*> transforms =
+      makeTransforms(std::string(transform_specs));
+
   std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
       custom_frame_mappings.has_value()
       ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
@@ -324,24 +372,22 @@
 // Add a new video stream at `stream_index` using the provided options.
 void add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
         custom_frame_mappings = std::nullopt) {
   _add_video_stream(
       decoder,
-      width,
-      height,
      num_threads,
       dimension_order,
       stream_index,
       device,
       device_variant,
+      transform_specs,
       custom_frame_mappings);
 }
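
The helpers added above define the grammar the Python side must speak: specs are separated by ';', tokens within a spec by ',', the first token is the transform name, and for "resize" the two remaining tokens are a positive height and width. A minimal Python sketch of that grammar, for illustration only (it is not part of torchcodec; the real parsing happens in the C++ above):

    def parse_transform_specs(transform_specs: str) -> list[tuple[str, int, int]]:
        # Illustrative mirror of makeTransforms/makeResizeTransform: split on ';'
        # into individual specs, then on ',' into tokens.
        parsed = []
        if not transform_specs:
            return parsed  # an empty string means "no transforms"
        for spec in transform_specs.split(";"):
            tokens = [token.strip() for token in spec.split(",")]
            if tokens == [""]:
                raise ValueError(f"Invalid transform spec: {spec!r}")
            name = tokens[0]
            if name == "resize":
                if len(tokens) != 3:
                    raise ValueError("resize spec must have 3 elements including its name")
                height, width = int(tokens[1]), int(tokens[2])
                if height <= 0 or width <= 0:
                    raise ValueError("height and width must be positive integers")
                parsed.append((name, height, width))
            else:
                raise ValueError(f"Invalid transform name: {name}")
        return parsed

    print(parse_transform_specs("resize, 270, 480"))  # [('resize', 270, 480)]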

src/torchcodec/_core/ops.py

Lines changed: 2 additions & 4 deletions
@@ -299,13 +299,12 @@ def create_from_tensor_abstract(
 def _add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -318,13 +317,12 @@ def _add_video_stream_abstract(
 def add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
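
With width and height gone from the op signatures, a resize is requested through the transform_specs string. A minimal usage sketch, assuming these ops are importable from torchcodec._core as the benchmark and tests in this commit use them, and with "video.mp4" as a placeholder path:

    from torchcodec._core import add_video_stream, create_from_file, get_next_frame

    decoder = create_from_file("video.mp4")  # placeholder path
    # Spec order is height then width: frames are decoded resized to 270x480.
    add_video_stream(decoder, transform_specs="resize, 270, 480")
    frame, pts, duration = get_next_frame(decoder)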

src/torchcodec/_samplers/video_clip_sampler.py

Lines changed: 1 addition & 2 deletions
@@ -147,8 +147,7 @@ def forward(self, video_data: Tensor) -> Union[List[Any]]:
         scan_all_streams_to_update_metadata(video_decoder)
         add_video_stream(
             video_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             num_threads=self.decoder_args.num_threads,
         )
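
The sampler builds the spec inline from its existing target_height and target_width values; the only subtlety is the argument order, height before width. A tiny illustrative helper (hypothetical, not part of torchcodec) makes that explicit:

    def resize_spec(height: int, width: int) -> str:
        # Hypothetical helper: the resize spec is "resize, <height>, <width>".
        return f"resize, {height}, {width}"

    assert resize_spec(270, 480) == "resize, 270, 480"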

test/test_ops.py

Lines changed: 51 additions & 9 deletions
@@ -631,17 +631,15 @@ def test_color_conversion_library_with_scaling(
         filtergraph_decoder = create_from_file(str(input_video.path))
         _add_video_stream(
             filtergraph_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="filtergraph",
         )
         filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
 
         swscale_decoder = create_from_file(str(input_video.path))
         _add_video_stream(
             swscale_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="swscale",
         )
         swscale_frame0, _, _ = get_next_frame(swscale_decoder)
@@ -655,7 +653,53 @@ def test_scaling_on_cuda_fails(self):
             RuntimeError,
             match="Transforms are only supported for CPU devices.",
         ):
-            add_video_stream(decoder, device="cuda", width=100, height=100)
+            add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")
+
+    def test_transform_fails(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            RuntimeError,
+            match="Invalid transform spec",
+        ):
+            add_video_stream(decoder, transform_specs=";")
+
+        with pytest.raises(
+            RuntimeError,
+            match="Invalid transform name",
+        ):
+            add_video_stream(decoder, transform_specs="invalid, 1, 2")
+
+    def test_resize_transform_fails(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            RuntimeError,
+            match="must have 3 elements",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 100, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="must be a positive integer",
+        ):
+            add_video_stream(decoder, transform_specs="resize, -10, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="must be a positive integer",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 0")
+
+        with pytest.raises(
+            RuntimeError,
+            match="cannot be converted to an int",
+        ):
+            add_video_stream(decoder, transform_specs="resize, blah, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="out of range",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 1000000000000")
 
     @pytest.mark.parametrize("dimension_order", ("NHWC", "NCHW"))
     @pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
@@ -763,17 +807,15 @@ def test_color_conversion_library_with_generated_videos(
         filtergraph_decoder = create_from_file(str(video_path))
         _add_video_stream(
             filtergraph_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="filtergraph",
         )
         filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
 
         auto_decoder = create_from_file(str(video_path))
         add_video_stream(
             auto_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
         )
         auto_frame0, _, _ = get_next_frame(auto_decoder)
         assert_frames_equal(filtergraph_frame0, auto_frame0)
