Support rotation on beta cuda (#1235)

mollyxu · web-flow · commit cc150444fba8 · 2026-02-13T15:35:33.000-05:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -301,6 +301,8 @@ void BetaCudaDeviceInterface::initialize(
     const AVStream* avStream,
     const UniqueDecodingAVFormatContext& avFormatCtx,
     [[maybe_unused]] const SharedAVCodecContext& codecContext) {
+  STD_TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
+  rotation_ = rotationFromDegrees(getRotationFromStream(avStream));
   if (!nvcuvidAvailable_ || !nativeNVDECSupport(device_, codecContext)) {
     cpuFallback_ = createDeviceInterface(kStableCPU);
     STD_TORCH_CHECK(
@@ -314,7 +316,6 @@ void BetaCudaDeviceInterface::initialize(
     return;
   }
 
-  STD_TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
   timeBase_ = avStream->time_base;
   frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
 
@@ -867,12 +868,54 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       gpuFrame->format == AV_PIX_FMT_CUDA,
       "Expected CUDA format frame from BETA CUDA interface");
 
-  validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);
-
   cudaStream_t nvdecStream = getCurrentCudaStream(device_.index());
 
-  frameOutput.data = convertNV12FrameToRGB(
-      gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
+  if (rotation_ == Rotation::NONE) {
+    validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);
+    frameOutput.data = convertNV12FrameToRGB(
+        gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
+  } else {
+    // preAllocatedOutputTensor has post-rotation dimensions, but NV12->RGB
+    // conversion outputs pre-rotation dimensions, so we can't use it as the
+    // conversion destination or validate it against the frame shape.
+    // Once we support native transforms on the beta CUDA interface, rotation
+    // should be handled as part of the transform pipeline instead.
+    frameOutput.data = convertNV12FrameToRGB(
+        gpuFrame,
+        device_,
+        nppCtx_,
+        nvdecStream,
+        /*preAllocatedOutputTensor=*/std::nullopt);
+    applyRotation(frameOutput, preAllocatedOutputTensor);
+  }
+}
+
+// Rotates frameOutput.data (HWC) per rotation_; when preAllocatedOutputTensor
+// is set, the rotated pixels are written into it and it becomes the output.
+void BetaCudaDeviceInterface::applyRotation(
+    FrameOutput& frameOutput,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  int k = 0; // number of 90-degree turns handed to torch::rot90
+  switch (rotation_) {
+    case Rotation::CCW90:
+      k = 1;
+      break;
+    case Rotation::ROTATE180:
+      k = 2;
+      break;
+    case Rotation::CW90:
+      k = 3;
+      break;
+    default:
+      STD_TORCH_CHECK(false, "Unexpected rotation value");
+  }
+  // rot90 over H and W may yield a strided tensor; copy_ accepts that as-is.
+  torch::Tensor rotated = torch::rot90(frameOutput.data, k, {0, 1});
+  if (preAllocatedOutputTensor.has_value()) {
+    frameOutput.data = preAllocatedOutputTensor.value().copy_(rotated);
+  } else {
+    frameOutput.data = rotated.contiguous();
+  }
 }
 
 std::string BetaCudaDeviceInterface::getDetails() {
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -20,6 +20,7 @@
 #include "DeviceInterface.h"
 #include "FFMPEGCommon.h"
 #include "NVDECCache.h"
+#include "Transform.h"
 
 #include <map>
 #include <memory>
@@ -82,6 +83,10 @@ class BetaCudaDeviceInterface : public DeviceInterface {
 
   UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
 
+  void applyRotation(
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor);
+
   CUvideoparser videoParser_ = nullptr;
   UniqueCUvideodecoder decoder_;
   CUVIDEOFORMAT videoFormat_ = {};
@@ -102,6 +107,7 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   bool nvcuvidAvailable_ = false;
   UniqueSwsContext swsContext_;
   SwsFrameContext prevSwsFrameContext_;
+  Rotation rotation_ = Rotation::NONE;
 };
 
 } // namespace facebook::torchcodec
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -1966,20 +1966,28 @@ def test_cpu_fallback_no_fallback_on_cpu_device(self):
         assert "No fallback required" in str(decoder.cpu_fallback)
 
     @pytest.mark.parametrize("dimension_order", ["NCHW", "NHWC"])
-    def test_rotation_applied_to_frames(self, dimension_order):
+    @pytest.mark.parametrize(
+        # The plain "cuda" device is omitted on purpose: the FFmpeg CUDA
+        # interface does not support rotation metadata.
+        "device",
+        ("cpu", pytest.param("cuda:beta", marks=pytest.mark.needs_cuda)),
+    )
+    def test_rotation_applied_to_frames(self, dimension_order, device):
         """Test that rotation is correctly applied to decoded frames.
 
         Compares frames from NASA_VIDEO_ROTATED (which has 90-degree rotation
         metadata) with manually rotated frames from NASA_VIDEO.
         Tests all decoding methods to ensure rotation is applied consistently.
         """
-        decoder = VideoDecoder(
+        decoder, _ = make_video_decoder(
             NASA_VIDEO.path,
+            device=device,
             stream_index=NASA_VIDEO.default_stream_index,
             dimension_order=dimension_order,
         )
-        decoder_rotated = VideoDecoder(
+        decoder_rotated, _ = make_video_decoder(
             NASA_VIDEO_ROTATED.path,
+            device=device,
             stream_index=NASA_VIDEO_ROTATED.default_stream_index,
             dimension_order=dimension_order,
         )