Commit ee3b9b7
Apply reviewer suggestions
1 parent 1a07828

8 files changed: +39, -43 lines

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 1 addition & 1 deletion

@@ -161,7 +161,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   // FrameBatchOutputs based on the the stream metadata. But single-frame APIs
   // can still work in such situations, so they should.
   auto outputDims =
-      resizedOutputDims_.value_or(FrameDims(avFrame->width, avFrame->height));
+      resizedOutputDims_.value_or(FrameDims(avFrame->height, avFrame->width));

   if (preAllocatedOutputTensor.has_value()) {
     auto shape = preAllocatedOutputTensor.value().sizes();

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 8 additions & 4 deletions

@@ -96,10 +96,14 @@ class CpuDeviceInterface : public DeviceInterface {
   UniqueSwsContext swsContext_;
   SwsFrameContext prevSwsFrameContext_;

-  // The filter we supply to filterGraph_, if it is used. The copy filter just
-  // copies the input to the output. Computationally, it should be a no-op. If
-  // we get no user-provided transforms, we will use the copy filter. Otherwise,
-  // we will construct the string from the transforms.
+  // The filter we supply to filterGraph_, if it is used. The default is the
+  // copy filter, which just copies the input to the output. Computationally, it
+  // should be a no-op. If we get no user-provided transforms, we will use the
+  // copy filter. Otherwise, we will construct the string from the transforms.
+  //
+  // Note that even if we only use the copy filter, we still get the desired
+  // colorspace conversion. We construct the filtergraph with its output sink
+  // set to RGB24.
   std::string filters_ = "copy";

   // The flags we supply to swsContext_, if it used. The flags control the
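
Aside (not part of the commit): a minimal sketch of how the filter string described in the comment above might be assembled, assuming the Transform interface declared in Transform.h below; the helper name buildFilterString is illustrative, not TorchCodec's API.

#include <memory>
#include <string>
#include <vector>

// Hypothetical helper, for illustration only: fall back to the no-op "copy"
// filter when there are no user transforms; otherwise chain each transform's
// filtergraph description with commas, FFmpeg's filter separator. Even the
// "copy" case still yields RGB24 output, because the filtergraph's sink is
// configured for RGB24.
std::string buildFilterString(
    const std::vector<std::unique_ptr<Transform>>& transforms) {
  if (transforms.empty()) {
    return "copy";
  }
  std::string filters;
  for (size_t i = 0; i < transforms.size(); ++i) {
    if (i > 0) {
      filters += ",";
    }
    filters += transforms[i]->getFilterGraphCpu();
  }
  return filters;
}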

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 16 additions & 15 deletions

@@ -174,6 +174,14 @@ CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
   TORCH_CHECK(g_cuda, "CudaDeviceInterface was not registered!");
   TORCH_CHECK(
       device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
+
+  // It is important for pytorch itself to create the cuda context. If ffmpeg
+  // creates the context it may not be compatible with pytorch.
+  // This is a dummy tensor to initialize the cuda context.
+  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
+      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
+  ctx_ = getCudaContext(device_);
+  nppCtx_ = getNppStreamContext(device_);
 }

 CudaDeviceInterface::~CudaDeviceInterface() {
@@ -191,20 +199,12 @@ void CudaDeviceInterface::initialize(
     [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>& transforms,
     const AVRational& timeBase,
     [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {
-  TORCH_CHECK(!ctx_, "FFmpeg HW device context already initialized");
+  TORCH_CHECK(ctx_, "FFmpeg HW device has not been initialized");
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");

+  codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
   videoStreamOptions_ = videoStreamOptions;
   timeBase_ = timeBase;
-
-  // It is important for pytorch itself to create the cuda context. If ffmpeg
-  // creates the context it may not be compatible with pytorch.
-  // This is a dummy tensor to initialize the cuda context.
-  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
-      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
-  ctx_ = getCudaContext(device_);
-  nppCtx_ = getNppStreamContext(device_);
-  codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
 }

 UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12(
@@ -304,7 +304,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   // Note that CUDA does not yet support transforms, so the only possible
   // frame dimensions are the raw decoded frame's dimensions.
-  auto frameDims = FrameDims(avFrame->width, avFrame->height);
+  auto frameDims = FrameDims(avFrame->height, avFrame->width);

   if (preAllocatedOutputTensor.has_value()) {
     auto shape = preAllocatedOutputTensor.value().sizes();
@@ -379,14 +379,15 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(

   // Above we checked that the AVFrame was on GPU, but that's not enough, we
   // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
-  // because this is what the NPP color conversion routines expect.
+  // because this is what the NPP color conversion routines expect. This SHOULD
+  // be enforced by our call to maybeConvertAVFrameToNV12() above.
+  auto hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
   TORCH_CHECK(
-      avFrame->hw_frames_ctx != nullptr,
+      hwFramesCtx != nullptr,
       "The AVFrame does not have a hw_frames_ctx. "
       "That's unexpected, please report this to the TorchCodec repo.");

-  auto hwFramesCtx =
-      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
   AVPixelFormat actualFormat = hwFramesCtx->sw_format;

   TORCH_CHECK(
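
Aside (not part of the commit): the constructor change above makes PyTorch, rather than FFmpeg, create the CUDA context before any FFmpeg hardware-device setup. Below is a minimal standalone sketch of that ordering, using FFmpeg's av_hwdevice_ctx_create in place of the repository's getCudaContext()/getNppStreamContext() helpers; the function name makeHwDeviceCtx is illustrative only.

#include <string>

#include <torch/torch.h>

extern "C" {
#include <libavutil/hwcontext.h>
}

AVBufferRef* makeHwDeviceCtx(const torch::Device& device) {
  // Allocating any tensor on the target device forces PyTorch to create (or
  // attach to) the CUDA context first, so FFmpeg later reuses a context that
  // is compatible with PyTorch.
  torch::Tensor dummy = torch::empty(
      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device));

  // Only then create FFmpeg's CUDA hw device context on the same device index.
  AVBufferRef* hwCtx = nullptr;
  std::string deviceIndex = std::to_string(static_cast<int>(device.index()));
  int err = av_hwdevice_ctx_create(
      &hwCtx, AV_HWDEVICE_TYPE_CUDA, deviceIndex.c_str(), nullptr, 0);
  TORCH_CHECK(err >= 0, "Failed to create FFmpeg CUDA hw device context");
  return hwCtx;
}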

src/torchcodec/_core/Frame.h

Lines changed: 2 additions & 2 deletions

@@ -14,12 +14,12 @@
 namespace facebook::torchcodec {

 struct FrameDims {
-  int width = 0;
   int height = 0;
+  int width = 0;

   FrameDims() = default;

-  FrameDims(int w, int h) : width(w), height(h) {}
+  FrameDims(int h, int w) : height(h), width(w) {}
 };

 // All public video decoding entry points return either a FrameOutput or a
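
Aside (not part of the commit): both FrameDims fields are ints, so the compiler cannot flag call sites that still pass (width, height); this commit updates each caller by hand. A small illustrative sketch of the new (height, width) order, assuming the struct above, when deriving dimensions from an HWC frame tensor:

#include <torch/torch.h>

// Illustrative only: build FrameDims from a frame tensor laid out as
// [height, width, channels], passing height first to match the new
// constructor order.
FrameDims dimsFromHWCFrame(const torch::Tensor& frame) {
  return FrameDims(
      static_cast<int>(frame.size(0)),   // height
      static_cast<int>(frame.size(1)));  // width
}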

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 1 addition & 1 deletion

@@ -504,7 +504,7 @@ void SingleStreamDecoder::addVideoStream(
   }

   metadataDims_ =
-      FrameDims(streamMetadata.width.value(), streamMetadata.height.value());
+      FrameDims(streamMetadata.height.value(), streamMetadata.width.value());
   for (auto& transform : transforms) {
     TORCH_CHECK(transform != nullptr, "Transforms should never be nullptr!");
     if (transform->getOutputFrameDims().has_value()) {

src/torchcodec/_core/Transform.cpp

Lines changed: 3 additions & 10 deletions

@@ -17,10 +17,6 @@ std::string toFilterGraphInterpolation(
   switch (mode) {
     case ResizeTransform::InterpolationMode::BILINEAR:
       return "bilinear";
-    case ResizeTransform::InterpolationMode::BICUBIC:
-      return "bicubic";
-    case ResizeTransform::InterpolationMode::NEAREST:
-      return "nearest";
     default:
       TORCH_CHECK(
           false,
@@ -33,10 +29,6 @@ int toSwsInterpolation(ResizeTransform::InterpolationMode mode) {
   switch (mode) {
     case ResizeTransform::InterpolationMode::BILINEAR:
       return SWS_BILINEAR;
-    case ResizeTransform::InterpolationMode::BICUBIC:
-      return SWS_BICUBIC;
-    case ResizeTransform::InterpolationMode::NEAREST:
-      return SWS_POINT;
     default:
       TORCH_CHECK(
           false,
@@ -48,12 +40,13 @@ int toSwsInterpolation(ResizeTransform::InterpolationMode mode) {
 } // namespace

 std::string ResizeTransform::getFilterGraphCpu() const {
-  return "scale=" + std::to_string(width_) + ":" + std::to_string(height_) +
+  return "scale=" + std::to_string(outputDims_.width) + ":" +
+      std::to_string(outputDims_.height) +
       ":sws_flags=" + toFilterGraphInterpolation(interpolationMode_);
 }

 std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
-  return FrameDims(width_, height_);
+  return outputDims_;
 }

 bool ResizeTransform::isResize() const {
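
For concreteness, here is the filter description the rewritten getFilterGraphCpu() produces for an illustrative resize (the values are not from the commit):

// height=270, width=480, default bilinear interpolation.
ResizeTransform resize(FrameDims(270, 480));
// resize.getFilterGraphCpu() returns "scale=480:270:sws_flags=bilinear":
// the FFmpeg scale filter still takes width first, even though FrameDims
// now stores and accepts height first.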

src/torchcodec/_core/Transform.h

Lines changed: 6 additions & 9 deletions

@@ -37,15 +37,13 @@ class Transform {

 class ResizeTransform : public Transform {
  public:
-  enum class InterpolationMode { BILINEAR, BICUBIC, NEAREST };
+  enum class InterpolationMode { BILINEAR };

-  ResizeTransform(int width, int height)
-      : width_(width),
-        height_(height),
-        interpolationMode_(InterpolationMode::BILINEAR) {}
+  ResizeTransform(const FrameDims& dims)
+      : outputDims_(dims), interpolationMode_(InterpolationMode::BILINEAR) {}

-  ResizeTransform(int width, int height, InterpolationMode interpolationMode)
-      : width_(width), height_(height), interpolationMode_(interpolationMode) {}
+  ResizeTransform(const FrameDims& dims, InterpolationMode interpolationMode)
+      : outputDims_(dims), interpolationMode_(interpolationMode) {}

   std::string getFilterGraphCpu() const override;
   std::optional<FrameDims> getOutputFrameDims() const override;
@@ -54,8 +52,7 @@ class ResizeTransform : public Transform {
   int getSwsFlags() const;

  private:
-  int width_;
-  int height_;
+  FrameDims outputDims_;
   InterpolationMode interpolationMode_;
 };
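
Aside (not part of the commit): a short usage sketch of the two FrameDims-based constructors, with illustrative dimensions. Note that BILINEAR is now the only InterpolationMode enumerator, so callers that previously requested BICUBIC or NEAREST no longer compile.

// Default interpolation (bilinear); dims are given as (height, width).
ResizeTransform byDefault(FrameDims(1080, 1920));

// Explicit interpolation mode; equivalent to the constructor above since
// BILINEAR is the only remaining mode.
ResizeTransform explicitMode(
    FrameDims(1080, 1920), ResizeTransform::InterpolationMode::BILINEAR);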

src/torchcodec/_core/custom_ops.cpp

Lines changed: 2 additions & 1 deletion

@@ -274,7 +274,8 @@ void _add_video_stream(
       "width and height must both be set or unset.");
   std::vector<Transform*> transforms;
   if (width.has_value()) {
-    transforms.push_back(new ResizeTransform(width.value(), height.value()));
+    transforms.push_back(
+        new ResizeTransform(FrameDims(height.value(), width.value())));
     width.reset();
     height.reset();
   }
