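Remove AVFrameStream struct

AVFrameStream only paired a decoded UniqueAVFrame with a stream index, and
decodeAVFrame() always filled that index with activeStreamIndex_. Return the
UniqueAVFrame directly and have the conversion functions read
activeStreamIndex_ themselves.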

NicolasHug · NicolasHug · commit 3141c188b4ef · 2025-03-17T19:58:20.000Z
diff --git a/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp b/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp
@@ -17,7 +17,7 @@ namespace facebook::torchcodec {
 void convertAVFrameToFrameOutputOnCuda(
     const torch::Device& device,
     [[maybe_unused]] const VideoDecoder::VideoStreamOptions& videoStreamOptions,
-    [[maybe_unused]] VideoDecoder::AVFrameStream& avFrameStream,
+    [[maybe_unused]] UniqueAVFrame& avFrame,
     [[maybe_unused]] VideoDecoder::FrameOutput& frameOutput,
     [[maybe_unused]] std::optional<torch::Tensor> preAllocatedOutputTensor) {
   throwUnsupportedDeviceError(device);
diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp
@@ -190,17 +190,15 @@ void initializeContextOnCuda(
 void convertAVFrameToFrameOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamOptions& videoStreamOptions,
-    VideoDecoder::AVFrameStream& avFrameStream,
+    UniqueAVFrame& avFrame,
     VideoDecoder::FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  AVFrame* avFrame = avFrameStream.avFrame.get();
-
   TORCH_CHECK(
       avFrame->format == AV_PIX_FMT_CUDA,
       "Expected format to be AV_PIX_FMT_CUDA, got " +
           std::string(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)));
-  auto frameDims =
-      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, *avFrame);
+  auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(
+      videoStreamOptions, *(avFrame.get()));
   int height = frameDims.height;
   int width = frameDims.width;
   torch::Tensor& dst = frameOutput.data;
diff --git a/src/torchcodec/decoders/_core/DeviceInterface.h b/src/torchcodec/decoders/_core/DeviceInterface.h
@@ -32,7 +32,7 @@ void initializeContextOnCuda(
 void convertAVFrameToFrameOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamOptions& videoStreamOptions,
-    VideoDecoder::AVFrameStream& avFrameStream,
+    UniqueAVFrame& avFrame,
     VideoDecoder::FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -583,9 +583,9 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream();
-  AVFrameStream avFrameStream = decodeAVFrame(
+  UniqueAVFrame avFrame = decodeAVFrame(
       [this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
-  return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
+  return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
 }
 
 VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex(int64_t frameIndex) {
@@ -715,27 +715,26 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
   }
 
   setCursorPtsInSeconds(seconds);
-  AVFrameStream avFrameStream =
-      decodeAVFrame([seconds, this](AVFrame* avFrame) {
-        StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
-        double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
-        double frameEndTime = ptsToSeconds(
-            avFrame->pts + getDuration(avFrame), streamInfo.timeBase);
-        if (frameStartTime > seconds) {
-          // FFMPEG seeked past the frame we are looking for even though we
-          // set max_ts to be our needed timestamp in avformat_seek_file()
-          // in maybeSeekToBeforeDesiredPts().
-          // This could be a bug in FFMPEG: https://trac.ffmpeg.org/ticket/11137
-          // In this case we return the very next frame instead of throwing an
-          // exception.
-          // TODO: Maybe log to stderr for Debug builds?
-          return true;
-        }
-        return seconds >= frameStartTime && seconds < frameEndTime;
-      });
+  UniqueAVFrame avFrame = decodeAVFrame([seconds, this](AVFrame* avFrame) {
+    StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
+    double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
+    double frameEndTime =
+        ptsToSeconds(avFrame->pts + getDuration(avFrame), streamInfo.timeBase);
+    if (frameStartTime > seconds) {
+      // FFMPEG seeked past the frame we are looking for even though we
+      // set max_ts to be our needed timestamp in avformat_seek_file()
+      // in maybeSeekToBeforeDesiredPts().
+      // This could be a bug in FFMPEG: https://trac.ffmpeg.org/ticket/11137
+      // In this case we return the very next frame instead of throwing an
+      // exception.
+      // TODO: Maybe log to stderr for Debug builds?
+      return true;
+    }
+    return seconds >= frameStartTime && seconds < frameEndTime;
+  });
 
   // Convert the frame to tensor.
-  FrameOutput frameOutput = convertAVFrameToFrameOutput(avFrameStream);
+  FrameOutput frameOutput = convertAVFrameToFrameOutput(avFrame);
   frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
   return frameOutput;
 }
@@ -891,14 +890,14 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
   auto finished = false;
   while (!finished) {
     try {
-      AVFrameStream avFrameStream = decodeAVFrame([startPts](AVFrame* avFrame) {
+      UniqueAVFrame avFrame = decodeAVFrame([startPts](AVFrame* avFrame) {
         return startPts < avFrame->pts + getDuration(avFrame);
       });
       // TODO: it's not great that we are getting a FrameOutput, which is
       // intended for videos. We should consider bypassing
       // convertAVFrameToFrameOutput and directly call
       // convertAudioAVFrameToFrameOutputOnCPU.
-      auto frameOutput = convertAVFrameToFrameOutput(avFrameStream);
+      auto frameOutput = convertAVFrameToFrameOutput(avFrame);
       firstFramePtsSeconds =
           std::min(firstFramePtsSeconds, frameOutput.ptsSeconds);
       frames.push_back(frameOutput.data);
@@ -1035,7 +1034,7 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
 // LOW-LEVEL DECODING
 // --------------------------------------------------------------------------
 
-VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
+UniqueAVFrame VideoDecoder::decodeAVFrame(
     std::function<bool(AVFrame*)> filterFunction) {
   validateActiveStream();
 
@@ -1150,37 +1149,36 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
   streamInfo.lastDecodedAvFramePts = avFrame->pts;
   streamInfo.lastDecodedAvFrameDuration = getDuration(avFrame);
 
-  return AVFrameStream(std::move(avFrame), activeStreamIndex_);
+  return avFrame;
 }
 
 // --------------------------------------------------------------------------
 // AVFRAME <-> FRAME OUTPUT CONVERSION
 // --------------------------------------------------------------------------
 
 VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
-    VideoDecoder::AVFrameStream& avFrameStream,
+    UniqueAVFrame& avFrame,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   // Convert the frame to tensor.
   FrameOutput frameOutput;
-  int streamIndex = avFrameStream.streamIndex;
-  AVFrame* avFrame = avFrameStream.avFrame.get();
-  frameOutput.streamIndex = streamIndex;
-  auto& streamInfo = streamInfos_[streamIndex];
+  frameOutput.streamIndex = activeStreamIndex_;
+  auto& streamInfo = streamInfos_[activeStreamIndex_];
   frameOutput.ptsSeconds = ptsToSeconds(
-      avFrame->pts, formatContext_->streams[streamIndex]->time_base);
+      avFrame->pts, formatContext_->streams[activeStreamIndex_]->time_base);
   frameOutput.durationSeconds = ptsToSeconds(
-      getDuration(avFrame), formatContext_->streams[streamIndex]->time_base);
+      getDuration(avFrame),
+      formatContext_->streams[activeStreamIndex_]->time_base);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     convertAudioAVFrameToFrameOutputOnCPU(
-        avFrameStream, frameOutput, preAllocatedOutputTensor);
+        avFrame, frameOutput, preAllocatedOutputTensor);
   } else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
     convertAVFrameToFrameOutputOnCPU(
-        avFrameStream, frameOutput, preAllocatedOutputTensor);
+        avFrame, frameOutput, preAllocatedOutputTensor);
   } else if (streamInfo.videoStreamOptions.device.type() == torch::kCUDA) {
     convertAVFrameToFrameOutputOnCuda(
         streamInfo.videoStreamOptions.device,
         streamInfo.videoStreamOptions,
-        avFrameStream,
+        avFrame,
         frameOutput,
         preAllocatedOutputTensor);
   } else {
@@ -1201,14 +1199,13 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
 // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
 // `dimension_order` parameter. It's up to callers to re-shape it if needed.
 void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
-    VideoDecoder::AVFrameStream& avFrameStream,
+    UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  AVFrame* avFrame = avFrameStream.avFrame.get();
   auto& streamInfo = streamInfos_[activeStreamIndex_];
 
   auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(
-      streamInfo.videoStreamOptions, *avFrame);
+      streamInfo.videoStreamOptions, *(avFrame.get()));
   int expectedOutputHeight = frameDims.height;
   int expectedOutputWidth = frameDims.width;
 
@@ -1251,7 +1248,7 @@ void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
       streamInfo.prevFrameContext = frameContext;
     }
     int resultHeight =
-        convertAVFrameToTensorUsingSwsScale(avFrame, outputTensor);
+        convertAVFrameToTensorUsingSwsScale(avFrame.get(), outputTensor);
     // If this check failed, it would mean that the frame wasn't reshaped to
     // the expected height.
     // TODO: Can we do the same check for width?
@@ -1271,7 +1268,7 @@ void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
       createFilterGraph(streamInfo, expectedOutputHeight, expectedOutputWidth);
       streamInfo.prevFrameContext = frameContext;
     }
-    outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);
+    outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame.get());
 
     // Similarly to above, if this check fails it means the frame wasn't
     // reshaped to its expected dimensions by filtergraph.
@@ -1350,25 +1347,25 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
 }
 
 void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
-    VideoDecoder::AVFrameStream& avFrameStream,
+    UniqueAVFrame& srcAVFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   TORCH_CHECK(
       !preAllocatedOutputTensor.has_value(),
       "pre-allocated audio tensor not supported yet.");
 
   AVSampleFormat sourceSampleFormat =
-      static_cast<AVSampleFormat>(avFrameStream.avFrame->format);
+      static_cast<AVSampleFormat>(srcAVFrame->format);
   AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
 
   UniqueAVFrame convertedAVFrame;
   if (sourceSampleFormat != desiredSampleFormat) {
     convertedAVFrame = convertAudioAVFrameSampleFormat(
-        avFrameStream.avFrame, sourceSampleFormat, desiredSampleFormat);
+        srcAVFrame, sourceSampleFormat, desiredSampleFormat);
   }
   const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat)
       ? convertedAVFrame
-      : avFrameStream.avFrame;
+      : srcAVFrame;
 
   AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
   TORCH_CHECK(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -244,23 +244,6 @@ class VideoDecoder {
   // These are APIs that should be private, but that are effectively exposed for
   // practical reasons, typically for testing purposes.
 
-  // This struct is needed because AVFrame doesn't retain the streamIndex. Only
-  // the AVPacket knows its stream. This is what the low-level private decoding
-  // entry points return. The AVFrameStream is then converted to a FrameOutput
-  // with convertAVFrameToFrameOutput. It should be private, but is currently
-  // used by DeviceInterface.
-  struct AVFrameStream {
-    // The actual decoded output as a unique pointer to an AVFrame.
-    // Usually, this is a YUV frame. It'll be converted to RGB in
-    // convertAVFrameToFrameOutput.
-    UniqueAVFrame avFrame;
-    // The stream index of the decoded frame.
-    int streamIndex;
-
-    explicit AVFrameStream(UniqueAVFrame&& a, int s)
-        : avFrame(std::move(a)), streamIndex(s) {}
-  };
-
   // Once getFrameAtIndex supports the preAllocatedOutputTensor parameter, we
   // can move it back to private.
   FrameOutput getFrameAtIndexInternal(
@@ -376,28 +359,29 @@ class VideoDecoder {
 
   void maybeSeekToBeforeDesiredPts();
 
-  AVFrameStream decodeAVFrame(std::function<bool(AVFrame*)> filterFunction);
+  UniqueAVFrame decodeAVFrame(std::function<bool(AVFrame*)> filterFunction);
 
   FrameOutput getNextFrameInternal(
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
   torch::Tensor maybePermuteHWC2CHW(torch::Tensor& hwcTensor);
 
   FrameOutput convertAVFrameToFrameOutput(
-      AVFrameStream& avFrameStream,
+      UniqueAVFrame& avFrame,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
   void convertAVFrameToFrameOutputOnCPU(
-      AVFrameStream& avFrameStream,
+      UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
   void convertAudioAVFrameToFrameOutputOnCPU(
-      AVFrameStream& avFrameStream,
+      UniqueAVFrame& srcAVFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
-  torch::Tensor convertAVFrameToTensorUsingFilterGraph(const AVFrame* avFrame);
+  torch::Tensor convertAVFrameToTensorUsingFilterGraph(
+      const AVFrame* srcAVFrame);
 
   int convertAVFrameToTensorUsingSwsScale(
       const AVFrame* avFrame,