
Commit c59de36
committed

Initial C++ implementation of transforms

1 parent ad1631d commit c59de36

17 files changed: +413 additions, −298 deletions

src/torchcodec/_core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ function(make_torchcodec_libraries
       SingleStreamDecoder.cpp
       Encoder.cpp
       ValidationUtils.cpp
+      Transform.cpp
   )

   if(ENABLE_CUDA)

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 93 additions & 57 deletions
@@ -46,6 +46,74 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
 }

+void CpuDeviceInterface::initialize(
+    [[maybe_unused]] AVCodecContext* codecContext,
+    const VideoStreamOptions& videoStreamOptions,
+    const std::vector<std::unique_ptr<Transform>>& transforms,
+    const AVRational& timeBase,
+    const FrameDims& outputDims) {
+  videoStreamOptions_ = videoStreamOptions;
+  timeBase_ = timeBase;
+  outputDims_ = outputDims;
+
+  // TODO: rationalize comment below with new stuff.
+  // By default, we want to use swscale for color conversion because it is
+  // faster. However, it has width requirements, so we may need to fall back
+  // to filtergraph. We also need to respect what was requested in the
+  // options; we respect the options unconditionally, so it's possible for
+  // swscale's width requirements to be violated. We don't expose the ability
+  // to choose the color conversion library publicly; we only use this
+  // ability internally.
+
+  // If any transform is not swscale-compatible, then we can't use swscale.
+  bool areTransformsSwScaleCompatible = true;
+  for (const auto& transform : transforms) {
+    areTransformsSwScaleCompatible =
+        areTransformsSwScaleCompatible && transform->isSwScaleCompatible();
+  }
+
+  // swscale requires widths to be multiples of 32:
+  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
+  bool isWidthSwScaleCompatible = (outputDims_.width % 32) == 0;
+
+  bool userRequestedSwScale =
+      videoStreamOptions_.colorConversionLibrary.has_value() &&
+      videoStreamOptions_.colorConversionLibrary.value() ==
+          ColorConversionLibrary::SWSCALE;
+
+  // Note that we treat the transform limitation differently from the width
+  // limitation. That is, we consider the transforms being compatible with
+  // sws_scale a hard requirement. If the transforms are not compatible,
+  // then we would end up not applying the transforms, and that is wrong.
+  //
+  // The width requirement, however, is a soft requirement. Even if we don't
+  // meet it, we let the user override it. We have tests that depend on this
+  // behavior. Since we don't expose the ability to choose swscale or
+  // filtergraph in our public API, this is probably okay. It's also the only
+  // way that we can be certain we are testing one versus the other.
+  if (areTransformsSwScaleCompatible &&
+      (userRequestedSwScale || isWidthSwScaleCompatible)) {
+    colorConversionLibrary_ = ColorConversionLibrary::SWSCALE;
+  } else {
+    colorConversionLibrary_ = ColorConversionLibrary::FILTERGRAPH;
+
+    // If we have any transforms, replace filters_ with the filter strings
+    // from the transforms.
+    std::stringstream filters;
+    bool first = true;
+    for (const auto& transform : transforms) {
+      if (!first) {
+        filters << ",";
+      }
+      filters << transform->getFilterGraphCpu();
+      first = false;
+    }
+    if (!transforms.empty()) {
+      filters_ = filters.str();
+    }
+  }
+}
+
 // Note [preAllocatedOutputTensor with swscale and filtergraph]:
 // Callers may pass a pre-allocated tensor, where the output.data tensor will
 // be stored. This parameter is honored in any case, but it only leads to a
@@ -56,25 +124,18 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
 // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
 // `dimension_order` parameter. It's up to callers to re-shape it if needed.
 void CpuDeviceInterface::convertAVFrameToFrameOutput(
-    const VideoStreamOptions& videoStreamOptions,
-    const AVRational& timeBase,
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  auto frameDims =
-      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
-  int expectedOutputHeight = frameDims.height;
-  int expectedOutputWidth = frameDims.width;
-
   if (preAllocatedOutputTensor.has_value()) {
     auto shape = preAllocatedOutputTensor.value().sizes();
     TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
-            (shape[1] == expectedOutputWidth) && (shape[2] == 3),
+        (shape.size() == 3) && (shape[0] == outputDims_.height) &&
+            (shape[1] == outputDims_.width) && (shape[2] == 3),
         "Expected pre-allocated tensor of shape ",
-        expectedOutputHeight,
+        outputDims_.height,
         "x",
-        expectedOutputWidth,
+        outputDims_.width,
         "x3, got ",
         shape);
   }
@@ -83,25 +144,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);

-  // By default, we want to use swscale for color conversion because it is
-  // faster. However, it has width requirements, so we may need to fall back
-  // to filtergraph. We also need to respect what was requested from the
-  // options; we respect the options unconditionally, so it's possible for
-  // swscale's width requirements to be violated. We don't expose the ability to
-  // choose color conversion library publicly; we only use this ability
-  // internally.
-
-  // swscale requires widths to be multiples of 32:
-  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
-  // so we fall back to filtergraph if the width is not a multiple of 32.
-  auto defaultLibrary = (expectedOutputWidth % 32 == 0)
-      ? ColorConversionLibrary::SWSCALE
-      : ColorConversionLibrary::FILTERGRAPH;
-
-  ColorConversionLibrary colorConversionLibrary =
-      videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
-
-  if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
+  if (colorConversionLibrary_ == ColorConversionLibrary::SWSCALE) {
     // We need to compare the current frame context with our previous frame
     // context. If they are different, then we need to re-create our colorspace
     // conversion objects. We create our colorspace conversion objects late so
@@ -113,11 +156,11 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
         avFrame->width,
         avFrame->height,
         frameFormat,
-        expectedOutputWidth,
-        expectedOutputHeight);
+        outputDims_.width,
+        outputDims_.height);

-    outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
-        expectedOutputHeight, expectedOutputWidth, torch::kCPU));
+    outputTensor = preAllocatedOutputTensor.value_or(
+        allocateEmptyHWCTensor(outputDims_, torch::kCPU));

     if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
       createSwsContext(swsFrameContext, avFrame->colorspace);
@@ -129,34 +172,28 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     // the expected height.
     // TODO: Can we do the same check for width?
     TORCH_CHECK(
-        resultHeight == expectedOutputHeight,
-        "resultHeight != expectedOutputHeight: ",
+        resultHeight == outputDims_.height,
+        "resultHeight != outputDims_.height: ",
         resultHeight,
         " != ",
-        expectedOutputHeight);
+        outputDims_.height);

     frameOutput.data = outputTensor;
-  } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
-    // See comment above in swscale branch about the filterGraphContext_
-    // creation. creation
-    std::stringstream filters;
-    filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
-    filters << ":sws_flags=bilinear";
-
+  } else if (colorConversionLibrary_ == ColorConversionLibrary::FILTERGRAPH) {
     FiltersContext filtersContext(
         avFrame->width,
         avFrame->height,
         frameFormat,
         avFrame->sample_aspect_ratio,
-        expectedOutputWidth,
-        expectedOutputHeight,
+        outputDims_.width,
+        outputDims_.height,
         AV_PIX_FMT_RGB24,
-        filters.str(),
-        timeBase);
+        filters_,
+        timeBase_);

     if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
       filterGraphContext_ =
-          std::make_unique<FilterGraph>(filtersContext, videoStreamOptions);
+          std::make_unique<FilterGraph>(filtersContext, videoStreamOptions_);
       prevFiltersContext_ = std::move(filtersContext);
     }
     outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);
@@ -165,12 +202,12 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     // reshaped to its expected dimensions by filtergraph.
     auto shape = outputTensor.sizes();
     TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
-            (shape[1] == expectedOutputWidth) && (shape[2] == 3),
+        (shape.size() == 3) && (shape[0] == outputDims_.height) &&
+            (shape[1] == outputDims_.width) && (shape[2] == 3),
         "Expected output tensor of shape ",
-        expectedOutputHeight,
+        outputDims_.height,
         "x",
-        expectedOutputWidth,
+        outputDims_.width,
         "x3, got ",
         shape);
@@ -186,7 +223,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     TORCH_CHECK(
         false,
         "Invalid color conversion library: ",
-        static_cast<int>(colorConversionLibrary));
+        static_cast<int>(colorConversionLibrary_));
   }
 }

@@ -214,9 +251,8 @@ torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(

   TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);

-  auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get());
-  int height = frameDims.height;
-  int width = frameDims.width;
+  int height = filteredAVFrame->height;
+  int width = filteredAVFrame->width;
   std::vector<int64_t> shape = {height, width, 3};
   std::vector<int64_t> strides = {filteredAVFrame->linesize[0], 3, 1};
   AVFrame* filteredAVFramePtr = filteredAVFrame.release();
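
Note: the initialize logic above depends on each Transform exposing isSwScaleCompatible() and getFilterGraphCpu(). Transform.h/Transform.cpp are among the 17 changed files but are not shown in this excerpt, so the following is only a minimal sketch of what a conforming transform could look like; the interface shape and the ResizeTransform class are assumptions for illustration, not the commit's actual declarations.

// Sketch only: the real interface lives in Transform.h, which this
// excerpt does not show.
#include <sstream>
#include <string>

class Transform {
 public:
  virtual ~Transform() = default;
  // True if swscale alone can express this transform; anything beyond
  // scaling/color conversion must return false to force the filtergraph path.
  virtual bool isSwScaleCompatible() const = 0;
  // FFmpeg filtergraph fragment implementing this transform on CPU.
  // CpuDeviceInterface::initialize joins these fragments with commas.
  virtual std::string getFilterGraphCpu() const = 0;
};

// Hypothetical example transform, assumed for illustration.
class ResizeTransform : public Transform {
 public:
  ResizeTransform(int outputWidth, int outputHeight)
      : outputWidth_(outputWidth), outputHeight_(outputHeight) {}

  bool isSwScaleCompatible() const override {
    return true;  // plain resizing is exactly what swscale can do
  }

  std::string getFilterGraphCpu() const override {
    std::ostringstream filter;
    filter << "scale=" << outputWidth_ << ":" << outputHeight_
           << ":sws_flags=bilinear";
    return filter.str();
  }

 private:
  int outputWidth_;
  int outputHeight_;
};

Two such transforms would yield a combined filter string like "scale=640:480:sws_flags=bilinear,<next filter>" once initialize joins the fragments with commas.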

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 16 additions & 4 deletions
@@ -23,12 +23,14 @@ class CpuDeviceInterface : public DeviceInterface {
     return std::nullopt;
   }

-  void initializeContext(
-      [[maybe_unused]] AVCodecContext* codecContext) override {}
-
-  void convertAVFrameToFrameOutput(
+  virtual void initialize(
+      [[maybe_unused]] AVCodecContext* codecContext,
       const VideoStreamOptions& videoStreamOptions,
+      const std::vector<std::unique_ptr<Transform>>& transforms,
       const AVRational& timeBase,
+      const FrameDims& outputDims) override;
+
+  void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
@@ -64,6 +66,16 @@ class CpuDeviceInterface : public DeviceInterface {
       const SwsFrameContext& swsFrameContext,
       const enum AVColorSpace colorspace);

+  VideoStreamOptions videoStreamOptions_;
+  ColorConversionLibrary colorConversionLibrary_;
+  AVRational timeBase_;
+  FrameDims outputDims_;
+
+  // The copy filter just copies the input to the output. Computationally, it
+  // should be a no-op. If we get no user-provided transforms, we will use the
+  // copy filter.
+  std::string filters_ = "copy";
+
   // color-conversion fields. Only one of FilterGraphContext and
   // UniqueSwsContext should be non-null.
   std::unique_ptr<FilterGraph> filterGraphContext_;
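
The fields above are set once by initialize. The choice between the two libraries reduces to: transform compatibility is a hard gate, while the width rule is a soft default that an explicit user request may override. Distilled into a standalone predicate (a hypothetical helper for illustration, not code from this commit):

// Mirrors the branch in CpuDeviceInterface::initialize; uses the codebase's
// existing ColorConversionLibrary enum.
ColorConversionLibrary chooseColorConversionLibrary(
    bool areTransformsSwScaleCompatible,  // hard requirement
    bool isWidthSwScaleCompatible,        // soft: width % 32 == 0
    bool userRequestedSwScale) {          // may override the width rule
  if (areTransformsSwScaleCompatible &&
      (userRequestedSwScale || isWidthSwScaleCompatible)) {
    return ColorConversionLibrary::SWSCALE;
  }
  return ColorConversionLibrary::FILTERGRAPH;
}

The filters_ default of "copy" means that when filtergraph is chosen with no user-provided transforms, the graph applies a computational no-op and the only real work is the conversion to RGB24.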

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 15 additions & 19 deletions
@@ -185,9 +185,16 @@ CudaDeviceInterface::~CudaDeviceInterface() {
   }
 }

-void CudaDeviceInterface::initializeContext(AVCodecContext* codecContext) {
+void CudaDeviceInterface::initialize(
+    AVCodecContext* codecContext,
+    [[maybe_unused]] const VideoStreamOptions& videoStreamOptions,
+    [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>& transforms,
+    [[maybe_unused]] const AVRational& timeBase,
+    const FrameDims& outputDims) {
   TORCH_CHECK(!ctx_, "FFmpeg HW device context already initialized");

+  outputDims_ = outputDims;
+
   // It is important for pytorch itself to create the cuda context. If ffmpeg
   // creates the context it may not be compatible with pytorch.
@@ -196,12 +203,9 @@ void CudaDeviceInterface::initialize(
   ctx_ = getCudaContext(device_);
   nppCtx_ = getNppStreamContext(device_);
   codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
-  return;
 }

 void CudaDeviceInterface::convertAVFrameToFrameOutput(
-    const VideoStreamOptions& videoStreamOptions,
-    [[maybe_unused]] const AVRational& timeBase,
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
@@ -219,11 +223,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(

   FrameOutput cpuFrameOutput;
   cpuInterface->convertAVFrameToFrameOutput(
-      videoStreamOptions,
-      timeBase,
-      avFrame,
-      cpuFrameOutput,
-      preAllocatedOutputTensor);
+      avFrame, cpuFrameOutput, preAllocatedOutputTensor);

   frameOutput.data = cpuFrameOutput.data.to(device_);
   return;
@@ -253,25 +253,21 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
       "If the video is 10bit, we are tracking 10bit support in "
       "https://github.com/pytorch/torchcodec/issues/776");

-  auto frameDims =
-      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
-  int height = frameDims.height;
-  int width = frameDims.width;
   torch::Tensor& dst = frameOutput.data;
   if (preAllocatedOutputTensor.has_value()) {
     dst = preAllocatedOutputTensor.value();
     auto shape = dst.sizes();
     TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
-            (shape[2] == 3),
+        (shape.size() == 3) && (shape[0] == outputDims_.height) &&
+            (shape[1] == outputDims_.width) && (shape[2] == 3),
         "Expected tensor of shape ",
-        height,
+        outputDims_.height,
         "x",
-        width,
+        outputDims_.width,
         "x3, got ",
         shape);
   } else {
-    dst = allocateEmptyHWCTensor(height, width, device_);
+    dst = allocateEmptyHWCTensor(outputDims_, device_);
   }

   torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);
@@ -308,7 +304,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
       "cudaStreamGetFlags failed: ",
       cudaGetErrorString(err));

-  NppiSize oSizeROI = {width, height};
+  NppiSize oSizeROI = {outputDims_.width, outputDims_.height};
   Npp8u* yuvData[2] = {avFrame->data[0], avFrame->data[1]};

   NppStatus status;

src/torchcodec/_core/CudaDeviceInterface.h

Lines changed: 9 additions & 1 deletion
@@ -19,17 +19,25 @@ class CudaDeviceInterface : public DeviceInterface {

   std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;

-  void initializeContext(AVCodecContext* codecContext) override;
+  void initialize(
+      AVCodecContext* codecContext,
+      [[maybe_unused]] const VideoStreamOptions& videoStreamOptions,
+      [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>&
+          transforms,
+      [[maybe_unused]] const AVRational& timeBase,
+      const FrameDims& outputDims) override;

   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
       UniqueAVFrame& avFrame,
+      const FrameDims& outputDims,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
           std::nullopt) override;

 private:
+  FrameDims outputDims_;
   UniqueAVBufferRef ctx_;
   std::unique_ptr<NppStreamContext> nppCtx_;
 };
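
Taken together, the interfaces now split one-time setup from per-frame work. A caller-side sketch of the new flow, assuming the decoder-side wiring (codecContext, videoStreamOptions, timeBase, and the FrameDims value live in files outside this excerpt) and reusing the hypothetical ResizeTransform from the earlier sketch:

// Hypothetical caller-side flow; the variable wiring is assumed, not shown
// in this commit excerpt.
std::vector<std::unique_ptr<Transform>> transforms;
transforms.push_back(std::make_unique<ResizeTransform>(640, 480));

// One-time setup: the device interface picks swscale vs filtergraph here
// and caches videoStreamOptions, timeBase, and outputDims for all frames.
deviceInterface->initialize(
    codecContext, videoStreamOptions, transforms, timeBase, outputDims);

// Per-frame conversion no longer takes the options or time base; on CPU it
// reads the cached colorConversionLibrary_, filters_, and outputDims_.
FrameOutput frameOutput;
deviceInterface->convertAVFrameToFrameOutput(avFrame, frameOutput);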
