@@ -34,6 +34,31 @@ double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
   return ptsToSeconds(pts, timeBase.den);
 }
 
+// Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require it.
+// The [N] leading batch dimension is optional, i.e. the input tensor can be 3D
+// or 4D.
+// Calling permute() is guaranteed to return a view, as per the docs:
+// https://pytorch.org/docs/stable/generated/torch.permute.html
+torch::Tensor MaybePermuteHWC2CHW(
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    torch::Tensor& hwcTensor) {
+  if (options.dimensionOrder == "NHWC") {
+    return hwcTensor;
+  }
+  auto numDimensions = hwcTensor.dim();
+  auto shape = hwcTensor.sizes();
+  if (numDimensions == 3) {
+    TORCH_CHECK(shape[2] == 3, "Not a HWC tensor: ", shape);
+    return hwcTensor.permute({2, 0, 1});
+  } else if (numDimensions == 4) {
+    TORCH_CHECK(shape[3] == 3, "Not a NHWC tensor: ", shape);
+    return hwcTensor.permute({0, 3, 1, 2});
+  } else {
+    TORCH_CHECK(
+        false, "Expected tensor with 3 or 4 dimensions, got ", numDimensions);
+  }
+}
+
 struct AVInput {
   UniqueAVFormatContext formatContext;
   std::unique_ptr<AVIOBytesContext> ioBytesContext;
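The helper above is shape-polymorphic and zero-copy. A minimal usage sketch, not part of this diff; it assumes `dimensionOrder` is a plain string field set to "NCHW":

```cpp
// Sketch: permuting a single decoded HWC frame to CHW returns a view,
// so no pixel data is copied.
VideoDecoder::VideoStreamDecoderOptions options;
options.dimensionOrder = "NCHW"; // assumed assignable, as compared above
torch::Tensor frame = torch::empty({480, 640, 3}, torch::kUInt8);
torch::Tensor chw = MaybePermuteHWC2CHW(options, frame);
TORCH_CHECK(chw.dim() == 3 && chw.size(0) == 3); // now {3, 480, 640}
TORCH_CHECK(chw.data_ptr() == frame.data_ptr()); // same storage: a view
```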
@@ -167,28 +192,13 @@ VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
     const VideoStreamDecoderOptions& options,
     const StreamMetadata& metadata)
     : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
-      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
-  if (options.dimensionOrder == "NHWC") {
-    frames = torch::empty(
-        {numFrames,
-         options.height.value_or(*metadata.height),
-         options.width.value_or(*metadata.width),
-         3},
-        {torch::kUInt8});
-  } else if (options.dimensionOrder == "NCHW") {
-    frames = torch::empty(
-        {numFrames,
-         3,
-         options.height.value_or(*metadata.height),
-         options.width.value_or(*metadata.width)},
-        torch::TensorOptions()
-            .memory_format(torch::MemoryFormat::ChannelsLast)
-            .dtype({torch::kUInt8}));
-  } else {
-    TORCH_CHECK(
-        false, "Unsupported frame dimensionOrder=" + options.dimensionOrder)
-  }
-}
+      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})),
+      frames(torch::empty(
+          {numFrames,
+           options.height.value_or(*metadata.height),
+           options.width.value_or(*metadata.width),
+           3},
+          {torch::kUInt8})) {}
 
 VideoDecoder::VideoDecoder() {}
 
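Allocating the batch unconditionally as NHWC loses nothing for NCHW users: permuting the NHWC tensor afterwards yields exactly the channels-last layout the removed branch allocated explicitly. A minimal sketch of that equivalence, not part of this diff:

```cpp
// Sketch: an NHWC-allocated tensor viewed as NCHW via permute has
// channels-last strides -- the same layout the removed branch produced
// with MemoryFormat::ChannelsLast, with no copy involved.
torch::Tensor nhwc = torch::empty({5, 270, 480, 3}, torch::kUInt8);
torch::Tensor nchw = nhwc.permute({0, 3, 1, 2}); // sizes {5, 3, 270, 480}
TORCH_CHECK(nchw.is_contiguous(torch::MemoryFormat::ChannelsLast));
TORCH_CHECK(nchw.data_ptr() == nhwc.data_ptr()); // same storage
```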
@@ -652,8 +662,9 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   }
   for (int streamIndex : activeStreamIndices_) {
     StreamInfo& streamInfo = streams_[streamIndex];
-    streamInfo.discardFramesBeforePts =
-        *maybeDesiredPts_ * streamInfo.timeBase.den;
+    // clang-format off: clang format clashes
+    streamInfo.discardFramesBeforePts = *maybeDesiredPts_ * streamInfo.timeBase.den;
+    // clang-format on
   }
 
   decodeStats_.numSeeksAttempted++;
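The seconds-to-pts expression above is the inverse of the `ptsToSeconds(pts, timeBase.den)` overload at the top of the file. A hypothetical helper spelling that out, not part of this diff; like the overload it mirrors, it assumes `timeBase.num == 1`:

```cpp
// Hypothetical sketch: convert a cursor position in seconds to pts units.
// Inverse of ptsToSeconds(pts, den); assumes timeBase.num == 1.
int64_t secondsToPts(double seconds, const AVRational& timeBase) {
  return static_cast<int64_t>(seconds * timeBase.den);
}
```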
@@ -846,7 +857,8 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getDecodedOutputWithFilter(
 }
 
 VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
-    VideoDecoder::RawDecodedOutput& rawOutput) {
+    VideoDecoder::RawDecodedOutput& rawOutput,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   // Convert the frame to tensor.
   DecodedOutput output;
   int streamIndex = rawOutput.streamIndex;
@@ -861,8 +873,10 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
   output.durationSeconds = ptsToSeconds(
       getDuration(frame), formatContext_->streams[streamIndex]->time_base);
   if (streamInfo.options.device.type() == torch::kCPU) {
-    convertAVFrameToDecodedOutputOnCPU(rawOutput, output);
+    convertAVFrameToDecodedOutputOnCPU(
+        rawOutput, output, preAllocatedOutputTensor);
   } else if (streamInfo.options.device.type() == torch::kCUDA) {
+    // TODO: handle pre-allocated output tensor
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
@@ -878,22 +892,35 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
 
 void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
     VideoDecoder::RawDecodedOutput& rawOutput,
-    DecodedOutput& output) {
+    DecodedOutput& output,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   int streamIndex = rawOutput.streamIndex;
   AVFrame* frame = rawOutput.frame.get();
   auto& streamInfo = streams_[streamIndex];
   if (output.streamType == AVMEDIA_TYPE_VIDEO) {
     if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
+      torch::Tensor tensor;
       int width = streamInfo.options.width.value_or(frame->width);
       int height = streamInfo.options.height.value_or(frame->height);
-      torch::Tensor tensor = torch::empty(
-          {height, width, 3}, torch::TensorOptions().dtype({torch::kUInt8}));
+      if (preAllocatedOutputTensor.has_value()) {
+        tensor = preAllocatedOutputTensor.value();
+        auto shape = tensor.sizes();
+        TORCH_CHECK(
+            (shape.size() == 3) && (shape[0] == height) &&
+                (shape[1] == width) && (shape[2] == 3),
+            "Expected tensor of shape ",
+            height,
+            "x",
+            width,
+            "x3, got ",
+            shape);
+      } else {
+        tensor = torch::empty(
+            {height, width, 3}, torch::TensorOptions().dtype({torch::kUInt8}));
+      }
       rawOutput.data = tensor.data_ptr<uint8_t>();
       convertFrameToBufferUsingSwsScale(rawOutput);
 
-      if (streamInfo.options.dimensionOrder == "NCHW") {
-        tensor = tensor.permute({2, 0, 1});
-      }
       output.frame = tensor;
     } else if (
         streamInfo.colorConversionLibrary ==
@@ -904,6 +931,14 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
           "Invalid color conversion library: " +
           std::to_string(static_cast<int>(streamInfo.colorConversionLibrary)));
     }
+    if (!preAllocatedOutputTensor.has_value()) {
+      // We only convert to CHW if a pre-allocated tensor wasn't passed. When a
+      // pre-allocated tensor is passed, it's up to the caller (typically a
+      // batch API) to do the conversion. This is more efficient as it allows
+      // batch NHWC tensors to be permuted only once, instead of permuting HWC
+      // tensors N times.
+      output.frame = MaybePermuteHWC2CHW(streamInfo.options, output.frame);
+    }
 
   } else if (output.streamType == AVMEDIA_TYPE_AUDIO) {
     // TODO: https://github.com/pytorch-labs/torchcodec/issues/85 implement
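The comment in this hunk is the core of the change. A sketch of the intended caller-side pattern, simplified from `getFramesAtIndices` below; `N` and `indices` are illustrative, and an SWSCALE stream is assumed so each frame is written in place:

```cpp
// Sketch: a batch API pre-allocates one NHWC tensor and hands out HWC
// slices; sws_scale writes into each slice in place, and the whole batch
// is permuted to NCHW once at the end, as a view.
BatchDecodedOutput output(N, options, streamMetadata); // frames: {N, H, W, 3}
for (auto f = 0; f < N; ++f) {
  // output.frames[f] is a {H, W, 3} view into the batch tensor.
  getFrameAtIndex(streamIndex, indices[f], output.frames[f]);
}
output.frames = MaybePermuteHWC2CHW(options, output.frames); // 1 permute, not N
```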
@@ -980,7 +1015,8 @@ void VideoDecoder::validateFrameIndex(
 
 VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
     int streamIndex,
-    int64_t frameIndex) {
+    int64_t frameIndex,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateUserProvidedStreamIndex(streamIndex);
   validateScannedAllStreams("getFrameAtIndex");
 
@@ -989,7 +1025,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
 
   int64_t pts = stream.allFrames[frameIndex].pts;
   setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase));
-  return getNextDecodedOutputNoDemux();
+  return getNextDecodedOutputNoDemux(preAllocatedOutputTensor);
 }
 
 VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
@@ -999,40 +1035,25 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
   validateScannedAllStreams("getFramesAtIndices");
 
   const auto& streamMetadata = containerMetadata_.streams[streamIndex];
-  const auto& options = streams_[streamIndex].options;
+  const auto& stream = streams_[streamIndex];
+  const auto& options = stream.options;
   BatchDecodedOutput output(frameIndices.size(), options, streamMetadata);
 
-  int i = 0;
-  const auto& stream = streams_[streamIndex];
-  for (int64_t frameIndex : frameIndices) {
+  for (auto f = 0; f < frameIndices.size(); ++f) {
+    auto frameIndex = frameIndices[f];
     if (frameIndex < 0 || frameIndex >= stream.allFrames.size()) {
       throw std::runtime_error(
           "Invalid frame index=" + std::to_string(frameIndex));
     }
-    int64_t pts = stream.allFrames[frameIndex].pts;
-    setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase));
-    auto rawSingleOutput = getNextRawDecodedOutputNoDemux();
-    if (stream.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
-      // We are using sws_scale to convert the frame to tensor. sws_scale can
-      // convert to a pre-allocated buffer so we can do the color-conversion
-      // in-place on the output tensor's data_ptr.
-      rawSingleOutput.data = output.frames[i].data_ptr<uint8_t>();
-      convertFrameToBufferUsingSwsScale(rawSingleOutput);
-    } else if (
-        stream.colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
-      // We are using a filter graph to convert the frame to tensor. The
-      // filter graph returns us an AVFrame allocated by FFMPEG. So we need to
-      // copy the AVFrame to the output tensor.
-      torch::Tensor frame = convertFrameToTensorUsingFilterGraph(
-          rawSingleOutput.streamIndex, rawSingleOutput.frame.get());
-      output.frames[i] = frame;
-    } else {
-      throw std::runtime_error(
-          "Invalid color conversion library: " +
-          std::to_string(static_cast<int>(stream.colorConversionLibrary)));
+    DecodedOutput singleOut =
+        getFrameAtIndex(streamIndex, frameIndex, output.frames[f]);
+    if (options.colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
+      output.frames[f] = singleOut.frame;
     }
-    i++;
+    // Note that for now we ignore the pts and duration parts of the output,
+    // because they're never used in any caller.
   }
+  output.frames = MaybePermuteHWC2CHW(options, output.frames);
   return output;
 }
 
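One subtlety in the loop above: for SWSCALE streams the per-frame call writes pixels directly into `output.frames[f]`, so `singleOut.frame` already aliases the batch slice; for FILTERGRAPH streams the frame wraps an FFmpeg-owned buffer and must be copied back, which is what the assignment to the slice does. A minimal sketch of that copy semantic, not part of this diff:

```cpp
// Sketch: assigning to an rvalue Tensor (such as the slice batch[0])
// copies data into the underlying storage via Tensor::copy_, rather than
// rebinding -- the behavior the FILTERGRAPH branch above relies on.
torch::Tensor batch = torch::zeros({2, 4, 4, 3}, torch::kUInt8);
torch::Tensor frame = torch::ones({4, 4, 3}, torch::kUInt8);
batch[0] = frame; // deep copy into batch's storage
TORCH_CHECK(batch[0].equal(frame));
TORCH_CHECK(batch.data_ptr() != frame.data_ptr()); // storages stay distinct
```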
@@ -1061,12 +1082,14 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange(
   BatchDecodedOutput output(numOutputFrames, options, streamMetadata);
 
   for (int64_t i = start, f = 0; i < stop; i += step, ++f) {
-    DecodedOutput singleOut = getFrameAtIndex(streamIndex, i);
-    output.frames[f] = singleOut.frame;
+    DecodedOutput singleOut = getFrameAtIndex(streamIndex, i, output.frames[f]);
+    if (options.colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
+      output.frames[f] = singleOut.frame;
+    }
     output.ptsSeconds[f] = singleOut.ptsSeconds;
     output.durationSeconds[f] = singleOut.durationSeconds;
   }
-
+  output.frames = MaybePermuteHWC2CHW(options, output.frames);
   return output;
 }
 
@@ -1119,6 +1142,7 @@ VideoDecoder::getFramesDisplayedByTimestampInRange(
   // need this special case below.
   if (startSeconds == stopSeconds) {
     BatchDecodedOutput output(0, options, streamMetadata);
+    output.frames = MaybePermuteHWC2CHW(options, output.frames);
     return output;
   }
 
@@ -1154,11 +1178,14 @@ VideoDecoder::getFramesDisplayedByTimestampInRange(
   int64_t numFrames = stopFrameIndex - startFrameIndex;
   BatchDecodedOutput output(numFrames, options, streamMetadata);
   for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) {
-    DecodedOutput singleOut = getFrameAtIndex(streamIndex, i);
-    output.frames[f] = singleOut.frame;
+    DecodedOutput singleOut = getFrameAtIndex(streamIndex, i, output.frames[f]);
+    if (options.colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
+      output.frames[f] = singleOut.frame;
+    }
     output.ptsSeconds[f] = singleOut.ptsSeconds;
     output.durationSeconds[f] = singleOut.durationSeconds;
   }
+  output.frames = MaybePermuteHWC2CHW(options, output.frames);
 
   return output;
 }
@@ -1167,15 +1194,15 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getNextRawDecodedOutputNoDemux() {
   auto rawOutput =
       getDecodedOutputWithFilter([this](int frameStreamIndex, AVFrame* frame) {
         StreamInfo& activeStream = streams_[frameStreamIndex];
-        return frame->pts >=
-            activeStream.discardFramesBeforePts;
+        return frame->pts >= activeStream.discardFramesBeforePts;
       });
   return rawOutput;
 }
 
-VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux() {
+VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux(
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   auto rawOutput = getNextRawDecodedOutputNoDemux();
-  return convertAVFrameToDecodedOutput(rawOutput);
+  return convertAVFrameToDecodedOutput(rawOutput, preAllocatedOutputTensor);
 }
 
 void VideoDecoder::setCursorPtsInSeconds(double seconds) {
@@ -1285,11 +1312,6 @@ torch::Tensor VideoDecoder::convertFrameToTensorUsingFilterGraph(
   torch::Tensor tensor = torch::from_blob(
       filteredFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
   StreamInfo& activeStream = streams_[streamIndex];
-  if (activeStream.options.dimensionOrder == "NCHW") {
-    // The docs guaranty this to return a view:
-    // https://pytorch.org/docs/stable/generated/torch.permute.html
-    tensor = tensor.permute({2, 0, 1});
-  }
   return tensor;
 }
 