From c8d546e3ec49bcb04dbe0b18126b2a7ef36b925f Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 13:35:54 -0700
Subject: [PATCH 1/4] Transforms bridge between Python and C++

---
 src/torchcodec/_core/custom_ops.cpp | 77 +++++++++++++------
 src/torchcodec/_core/ops.py         |  6 +-
 .../_samplers/video_clip_sampler.py |  3 +-
 test/test_ops.py                    | 14 ++--
 4 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 7010c436a..e9df65203 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -43,9 +43,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def(
-      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
+      "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
-      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
+      "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
   m.def(
       "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -183,6 +183,50 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   }
 }
 
+Transform* makeResizeTransform(
+    const std::vector<std::string>& resizeTransformSpec) {
+  TORCH_CHECK(
+      resizeTransformSpec.size() == 3,
+      "resizeTransformSpec must have 3 elements including its name");
+  int height = std::stoi(resizeTransformSpec[1]);
+  int width = std::stoi(resizeTransformSpec[2]);
+  return new ResizeTransform(FrameDims(height, width));
+}
+
+std::vector<std::string> split(const std::string& str, char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(str);
+  while (std::getline(tokenStream, token, delimiter)) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+// The transformSpecsRaw string is always in the format:
+//
+//   "name1, param1, param2, ...; name2, param1, param2, ...; ..."
+//
+// Where "nameX" is the name of the transform, and "paramX" are the parameters.
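+//
+// For example, "resize, 135, 240" specifies a single resize transform to a
+// height of 135 and a width of 240, and multiple transforms are separated by
+// semicolons. (The numeric values here are illustrative only.)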
+std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
+  std::vector<Transform*> transforms;
+  std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
+  for (const std::string& transformSpecRaw : transformSpecs) {
+    std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
+    TORCH_CHECK(
+        transformSpec.size() >= 1,
+        "Invalid transform spec: " + transformSpecRaw);
+
+    auto name = transformSpec[0];
+    if (name == "resize") {
+      transforms.push_back(makeResizeTransform(transformSpec));
+    } else {
+      TORCH_CHECK(false, "Invalid transform name: " + name);
+    }
+  }
+  return transforms;
+}
+
 } // namespace
 
 // ==============================
@@ -252,36 +296,18 @@ at::Tensor _create_from_file_like(
 
 void _add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
         custom_frame_mappings = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.ffmpegThreadCount = num_threads;
 
-  // TODO: Eliminate this temporary bridge code. This exists because we have
-  //       not yet exposed the transforms API on the Python side. We also want
-  //       to remove the `width` and `height` arguments from the Python API.
-  //
-  // TEMPORARY BRIDGE CODE START
-  TORCH_CHECK(
-      width.has_value() == height.has_value(),
-      "width and height must both be set or unset.");
-  std::vector<Transform*> transforms;
-  if (width.has_value()) {
-    transforms.push_back(
-        new ResizeTransform(FrameDims(height.value(), width.value())));
-    width.reset();
-    height.reset();
-  }
-  // TEMPORARY BRIDGE CODE END
-
   if (dimension_order.has_value()) {
     std::string stdDimensionOrder{dimension_order.value()};
     TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
@@ -309,6 +335,9 @@ void _add_video_stream(
   videoStreamOptions.device = torch::Device(std::string(device));
   videoStreamOptions.deviceVariant = device_variant;
 
+  std::vector<Transform*> transforms =
+      makeTransforms(std::string(transform_specs));
+
   std::optional<FrameMappings> converted_mappings = custom_frame_mappings.has_value()
       ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
       : std::nullopt;
@@ -324,24 +353,22 @@ void _add_video_stream(
 
 // Add a new video stream at `stream_index` using the provided options.
 void add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
         custom_frame_mappings = std::nullopt) {
   _add_video_stream(
       decoder,
-      width,
-      height,
       num_threads,
       dimension_order,
       stream_index,
       device,
       device_variant,
+      transform_specs,
       custom_frame_mappings);
 }
 
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index 64298eab5..f8a0df980 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -299,13 +299,12 @@ def create_from_tensor_abstract(
 def _add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -318,13 +317,12 @@ def add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py
index 06976ba78..343728393 100644
--- a/src/torchcodec/_samplers/video_clip_sampler.py
+++ b/src/torchcodec/_samplers/video_clip_sampler.py
@@ -147,8 +147,7 @@ def forward(self, video_data: Tensor) -> Union[List[Any]]:
         scan_all_streams_to_update_metadata(video_decoder)
         add_video_stream(
             video_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             num_threads=self.decoder_args.num_threads,
         )
 
diff --git a/test/test_ops.py b/test/test_ops.py
index f16dd63ad..548442ebf 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -631,8 +631,7 @@ def test_color_conversion_library_with_scaling(
         filtergraph_decoder = create_from_file(str(input_video.path))
         _add_video_stream(
             filtergraph_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="filtergraph",
         )
         filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
@@ -640,8 +639,7 @@ def test_color_conversion_library_with_scaling(
         swscale_decoder = create_from_file(str(input_video.path))
         _add_video_stream(
             swscale_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="swscale",
         )
         swscale_frame0, _, _ = get_next_frame(swscale_decoder)
@@ -655,7 +653,7 @@ def test_scaling_on_cuda_fails(self):
             RuntimeError,
             match="Transforms are only supported for CPU devices.",
         ):
-            add_video_stream(decoder, device="cuda", width=100, height=100)
+            add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")
 
     @pytest.mark.parametrize("dimension_order", ("NHWC", "NCHW"))
     @pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
@@ -763,8 +761,7 @@ def test_color_conversion_library_with_generated_videos(
         filtergraph_decoder = create_from_file(str(video_path))
         _add_video_stream(
             filtergraph_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             color_conversion_library="filtergraph",
         )
         filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
@@ -772,8 +769,7 @@ def test_color_conversion_library_with_generated_videos(
         auto_decoder = create_from_file(str(video_path))
         add_video_stream(
             auto_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
         )
         auto_frame0, _, _ = get_next_frame(auto_decoder)
         assert_frames_equal(filtergraph_frame0, auto_frame0)

From 7143f15097174737d583aa4a0a0dfdf13cb4e1ae Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 18:51:53 -0700
Subject: [PATCH 2/4] Better error checking

---
 benchmarks/decoders/gpu_benchmark.py |  3 +-
 src/torchcodec/_core/custom_ops.cpp  | 17 ++++++++--
 test/test_ops.py                     | 46 ++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/benchmarks/decoders/gpu_benchmark.py b/benchmarks/decoders/gpu_benchmark.py
index a22691883..94688b58b 100644
--- a/benchmarks/decoders/gpu_benchmark.py
+++ b/benchmarks/decoders/gpu_benchmark.py
@@ -39,8 +39,7 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
         stream_index=-1,
         device=decode_device_string,
         num_threads=num_threads,
-        width=width,
-        height=height,
+        transform_specs=f"resize, {height}, {width}",
     )
 
     start_time = time.time()
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index e9df65203..1ca679775 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -183,13 +183,26 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   }
 }
 
+int checkedToPositiveInt(const std::string& str) {
+  int ret = 0;
+  try {
+    ret = std::stoi(str);
+  } catch (const std::invalid_argument&) {
+    TORCH_CHECK(false, "String cannot be converted to an int:" + str);
+  } catch (const std::out_of_range&) {
+    TORCH_CHECK(false, "String would become integer out of range:" + str);
+  }
+  TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
+  return ret;
+}
+
 Transform* makeResizeTransform(
     const std::vector<std::string>& resizeTransformSpec) {
   TORCH_CHECK(
       resizeTransformSpec.size() == 3,
       "resizeTransformSpec must have 3 elements including its name");
-  int height = std::stoi(resizeTransformSpec[1]);
-  int width = std::stoi(resizeTransformSpec[2]);
+  int height = checkedToPositiveInt(resizeTransformSpec[1]);
+  int width = checkedToPositiveInt(resizeTransformSpec[2]);
   return new ResizeTransform(FrameDims(height, width));
 }
 
diff --git a/test/test_ops.py b/test/test_ops.py
index 548442ebf..c3233a4f9 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -655,6 +655,52 @@ def test_scaling_on_cuda_fails(self):
         ):
             add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")
 
+    def test_transform_fails(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            RuntimeError,
+            match="Invalid transform spec",
+        ):
+            add_video_stream(decoder, transform_specs=";")
+
+        with pytest.raises(
+            RuntimeError,
+            match="Invalid transform name",
+        ):
+            add_video_stream(decoder, transform_specs="invalid, 1, 2")
+
+    def test_resize_transform_fails(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            RuntimeError,
+            match="must have 3 elements",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 100, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="must be a positive integer",
+        ):
+            add_video_stream(decoder, transform_specs="resize, -10, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="must be a positive integer",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 0")
+
+        with pytest.raises(
+            RuntimeError,
+            match="cannot be converted to an int",
+        ):
+            add_video_stream(decoder, transform_specs="resize, blah, 100")
+
+        with pytest.raises(
+            RuntimeError,
+            match="out of range",
+        ):
+            add_video_stream(decoder, transform_specs="resize, 100, 1000000000000")
+
     @pytest.mark.parametrize("dimension_order", ("NHWC", "NCHW"))
     @pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
     def test_color_conversion_library_with_dimension_order(

From a964447cc99ca3b42709641c4ffaeb4b7b57c292 Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 19:24:47 -0700
Subject: [PATCH 3/4] Fix GPU benchmark

---
 benchmarks/decoders/gpu_benchmark.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/decoders/gpu_benchmark.py b/benchmarks/decoders/gpu_benchmark.py
index 94688b58b..4300643dd 100644
--- a/benchmarks/decoders/gpu_benchmark.py
+++ b/benchmarks/decoders/gpu_benchmark.py
@@ -29,17 +29,17 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
     num_threads = None
     if "cuda" in decode_device_string:
         num_threads = 1
-    width = None
-    height = None
+
+    resize_spec = ""
     if "native" in resize_device_string:
-        width = RESIZED_WIDTH
-        height = RESIZED_HEIGHT
+        resize_spec = f"resize, {RESIZED_HEIGHT}, {RESIZED_WIDTH}"
+
     torchcodec._core._add_video_stream(
         decoder,
         stream_index=-1,
         device=decode_device_string,
         num_threads=num_threads,
-        transform_specs=f"resize, {height}, {width}",
+        transform_specs=resize_spec,
     )
 
     start_time = time.time()

From 3db7aaa35c170a8eeab5f855a4f8dfbbbbedc29b Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Wed, 8 Oct 2025 19:33:40 -0700
Subject: [PATCH 4/4] Better comments

---
 src/torchcodec/_core/custom_ops.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 1ca679775..57753ad42 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -196,6 +196,12 @@ int checkedToPositiveInt(const std::string& str) {
   return ret;
 }
 
+// Resize transform specs take the form:
+//
+//   "resize, <height>, <width>"
+//
+// Where "resize" is the string literal and <height> and <width> are positive
+// integers.
 Transform* makeResizeTransform(
     const std::vector<std::string>& resizeTransformSpec) {
   TORCH_CHECK(