
Commit af13ac5

Use AVFrame info for height and width in GPU APIs (#347)
1 parent c2bea4b · commit af13ac5

File tree

5 files changed: +10 −17 lines


src/torchcodec/decoders/_core/CPUOnlyDevice.cpp

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ namespace facebook::torchcodec {
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {

src/torchcodec/decoders/_core/CudaDevice.cpp

Lines changed: 4 additions & 8 deletions
@@ -187,7 +187,6 @@ void initializeContextOnCuda(
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
@@ -197,11 +196,9 @@ void convertAVFrameToDecodedOutputOnCuda(
       src->format == AV_PIX_FMT_CUDA,
       "Expected format to be AV_PIX_FMT_CUDA, got " +
           std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
-  auto frameDims = getHeightAndWidthFromOptionsOrMetadata(options, metadata);
+  auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(options, *src);
   int height = frameDims.height;
   int width = frameDims.width;
-  NppiSize oSizeROI = {width, height};
-  Npp8u* input[2] = {src->data[0], src->data[1]};
   torch::Tensor& dst = output.frame;
   if (preAllocatedOutputTensor.has_value()) {
     dst = preAllocatedOutputTensor.value();
@@ -222,11 +219,10 @@ void convertAVFrameToDecodedOutputOnCuda(
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device);

-  auto start = std::chrono::high_resolution_clock::now();
+  NppiSize oSizeROI = {width, height};
+  Npp8u* input[2] = {src->data[0], src->data[1]};

-  // TODO height and width info of output tensor comes from the metadata, which
-  // may not be accurate. How do we make sure we won't corrupt memory if the
-  // allocated tensor is too short/large?
+  auto start = std::chrono::high_resolution_clock::now();
   NppStatus status = nppiNV12ToRGB_8u_P2C3R(
       input,
       src->linesize[0],
src/torchcodec/decoders/_core/DeviceInterface.h

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ void initializeContextOnCuda(
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 0 additions & 1 deletion
@@ -863,7 +863,6 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
-        containerMetadata_.streams[streamIndex],
         rawOutput,
         output,
         preAllocatedOutputTensor);

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 6 additions & 6 deletions
@@ -440,16 +440,16 @@ class VideoDecoder {
 // AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
 // our code or within FFmpeg code, this should be exactly the same as
 // getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
-// decoding APIs, on CPU, with swscale.
+// decoding APIs, on CPU with swscale, and on GPU.
 // - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
 //   the user-specified options if they exist, or the height and width from the
 //   stream metadata, which itself got its value from the CodecContext, when the
-//   stream was added. This is used by batch decoding APIs, or by GPU-APIs (both
-//   batch and single-frames).
+//   stream was added. This is used by batch decoding APIs, for both GPU and
+//   CPU.
 //
-// The source of truth for height and width really is the (resized) AVFrame:
-// it's the decoded output from FFmpeg. The info from the metadata (i.e. from
-// the CodecContext) may not be as accurate. However, the AVFrame is only
+// The source of truth for height and width really is the (resized) AVFrame: it
+// comes from the decoded output of FFmpeg. The info from the metadata (i.e.
+// from the CodecContext) may not be as accurate. However, the AVFrame is only
 // available late in the call stack, when the frame is decoded, while the
 // CodecContext is available early when a stream is added. This is why we use
 // the CodecContext for pre-allocating batched output tensors (we could
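To make the trade-off described in this comment concrete, here is a hedged sketch of the metadata-based counterpart, reusing the illustrative Options and FrameDims types from the sketch above; the StreamMetadata fields are assumptions for illustration.

// Sketch only: the metadata-based variant described above. StreamMetadata's
// fields are illustrative assumptions; reuses Options/FrameDims from the
// earlier sketch.
struct StreamMetadata {
  int height; // populated from the CodecContext when the stream is added
  int width;
};

// Usable early, before any frame is decoded, which is why it can drive the
// pre-allocation of batched output tensors, but it may disagree with the
// frames FFmpeg eventually produces.
FrameDims getHeightAndWidthFromOptionsOrMetadata(
    const Options& options,
    const StreamMetadata& metadata) {
  return FrameDims{
      options.height.value_or(metadata.height),
      options.width.value_or(metadata.width)};
}

After this commit, the GPU single-frame path no longer receives StreamMetadata at all and relies on the AVFrame-based variant instead, as the diffs above show.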
