meta-pytorch
diff --git a/‎src/torchcodec/decoders/_core/VideoDecoder.cpp‎
Lines changed: 67 additions & 35 deletions b/‎src/torchcodec/decoders/_core/VideoDecoder.cpp‎
Lines changed: 67 additions & 35 deletions
@@ -337,12 +337,12 @@ void VideoDecoder::createFilterGraph(
     StreamInfo& streamInfo,
     int expectedOutputHeight,
     int expectedOutputWidth) {
-  FilterState& filterState = streamInfo.filterState;
-  filterState.filterGraph.reset(avfilter_graph_alloc());
-  TORCH_CHECK(filterState.filterGraph.get() != nullptr);
+  FilterGraphContext& filterGraphContext = streamInfo.filterGraphContext;
+  filterGraphContext.filterGraph.reset(avfilter_graph_alloc());
+  TORCH_CHECK(filterGraphContext.filterGraph.get() != nullptr);
 
   if (streamInfo.videoStreamOptions.ffmpegThreadCount.has_value()) {
-    filterState.filterGraph->nb_threads =
+    filterGraphContext.filterGraph->nb_threads =
         streamInfo.videoStreamOptions.ffmpegThreadCount.value();
   }
 
@@ -360,25 +360,25 @@ void VideoDecoder::createFilterGraph(
              << codecContext->sample_aspect_ratio.den;
 
   int ffmpegStatus = avfilter_graph_create_filter(
-      &filterState.sourceContext,
+      &filterGraphContext.sourceContext,
       buffersrc,
       "in",
       filterArgs.str().c_str(),
       nullptr,
-      filterState.filterGraph.get());
+      filterGraphContext.filterGraph.get());
   if (ffmpegStatus < 0) {
     throw std::runtime_error(
         std::string("Failed to create filter graph: ") + filterArgs.str() +
         ": " + getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
   }
 
   ffmpegStatus = avfilter_graph_create_filter(
-      &filterState.sinkContext,
+      &filterGraphContext.sinkContext,
       buffersink,
       "out",
       nullptr,
       nullptr,
-      filterState.filterGraph.get());
+      filterGraphContext.filterGraph.get());
   if (ffmpegStatus < 0) {
     throw std::runtime_error(
         "Failed to create filter graph: " +
@@ -388,7 +388,7 @@ void VideoDecoder::createFilterGraph(
   enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_NONE};
 
   ffmpegStatus = av_opt_set_int_list(
-      filterState.sinkContext,
+      filterGraphContext.sinkContext,
       "pix_fmts",
       pix_fmts,
       AV_PIX_FMT_NONE,
@@ -403,11 +403,11 @@ void VideoDecoder::createFilterGraph(
   UniqueAVFilterInOut inputs(avfilter_inout_alloc());
 
   outputs->name = av_strdup("in");
-  outputs->filter_ctx = filterState.sourceContext;
+  outputs->filter_ctx = filterGraphContext.sourceContext;
   outputs->pad_idx = 0;
   outputs->next = nullptr;
   inputs->name = av_strdup("out");
-  inputs->filter_ctx = filterState.sinkContext;
+  inputs->filter_ctx = filterGraphContext.sinkContext;
   inputs->pad_idx = 0;
   inputs->next = nullptr;
 
@@ -418,7 +418,7 @@ void VideoDecoder::createFilterGraph(
   AVFilterInOut* outputsTmp = outputs.release();
   AVFilterInOut* inputsTmp = inputs.release();
   ffmpegStatus = avfilter_graph_parse_ptr(
-      filterState.filterGraph.get(),
+      filterGraphContext.filterGraph.get(),
       description.str().c_str(),
       &inputsTmp,
       &outputsTmp,
@@ -431,7 +431,8 @@ void VideoDecoder::createFilterGraph(
         getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
   }
 
-  ffmpegStatus = avfilter_graph_config(filterState.filterGraph.get(), nullptr);
+  ffmpegStatus =
+      avfilter_graph_config(filterGraphContext.filterGraph.get(), nullptr);
   if (ffmpegStatus < 0) {
     throw std::runtime_error(
         "Failed to configure filter graph: " +
@@ -803,16 +804,20 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   }
 }
 
-VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
+VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
     std::function<bool(int, AVFrame*)> filterFunction) {
   if (activeStreamIndices_.size() == 0) {
     throw std::runtime_error("No active streams configured.");
   }
+
   resetDecodeStats();
+
+  // Seek if needed.
   if (desiredPtsSeconds_.has_value()) {
     maybeSeekToBeforeDesiredPts();
     desiredPtsSeconds_ = std::nullopt;
   }
+
   // Need to get the next frame or error from PopFrame.
   UniqueAVFrame avFrame(av_frame_alloc());
   AutoAVPacket autoAVPacket;
@@ -822,42 +827,58 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
   while (true) {
     frameStreamIndex = -1;
     bool gotPermanentErrorOnAnyActiveStream = false;
+
+    // Get a frame on an active stream. Note that we don't know ahead of time
+    // which streams have frames to receive, so we linearly try the active
+    // streams.
     for (int streamIndex : activeStreamIndices_) {
       StreamInfo& streamInfo = streamInfos_[streamIndex];
       ffmpegStatus =
           avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
-      bool gotNonRetriableError =
-          ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN);
-      if (gotNonRetriableError) {
+
+      if (ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN)) {
         gotPermanentErrorOnAnyActiveStream = true;
         break;
       }
+
       if (ffmpegStatus == AVSUCCESS) {
         frameStreamIndex = streamIndex;
         break;
       }
     }
+
     if (gotPermanentErrorOnAnyActiveStream) {
       break;
     }
+
     decodeStats_.numFramesReceivedByDecoder++;
-    bool gotNeededFrame = ffmpegStatus == AVSUCCESS &&
-        filterFunction(frameStreamIndex, avFrame.get());
-    if (gotNeededFrame) {
+
+    // Is this the kind of frame we're looking for?
+    if (ffmpegStatus == AVSUCCESS &&
+        filterFunction(frameStreamIndex, avFrame.get())) {
+      // Yes, this is the frame we'll return; break out of the decoding loop.
       break;
     } else if (ffmpegStatus == AVSUCCESS) {
-      // No need to send more packets here as the decoder may have frames in
-      // its buffer.
+      // No, but we received a valid frame - just not the kind we're looking
+      // for. The logic below will read packets and send them to the decoder.
+      // But since we did just receive a frame, we should skip reading more
+      // packets and sending them to the decoder and just try to receive more
+      // frames from the decoder.
       continue;
     }
+
     if (reachedEOF) {
       // We don't have any more packets to send to the decoder. So keep on
       // pulling frames from its internal buffers.
       continue;
     }
+
+    // We still haven't found the frame we're looking for. So let's read more
+    // packets and send them to the decoder.
     ReferenceAVPacket packet(autoAVPacket);
     ffmpegStatus = av_read_frame(formatContext_.get(), packet.get());
     decodeStats_.numPacketsRead++;
+
     if (ffmpegStatus == AVERROR_EOF) {
       // End of file reached. We must drain all codecs by sending a nullptr
       // packet.
@@ -872,27 +893,38 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
               getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
         }
       }
+
+      // We've reached the end of file so we can't read any more packets from
+      // it, but the decoder may still have frames to read in its buffer.
+      // Continue iterating to try reading frames.
       reachedEOF = true;
       continue;
     }
+
     if (ffmpegStatus < AVSUCCESS) {
       throw std::runtime_error(
           "Could not read frame from input file: " +
           getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
     }
+
     if (activeStreamIndices_.count(packet->stream_index) == 0) {
       // This packet is not for any of the active streams.
       continue;
     }
+
+    // We got a valid packet. Send it to the decoder, and we'll receive it in
+    // the next iteration.
     ffmpegStatus = avcodec_send_packet(
         streamInfos_[packet->stream_index].codecContext.get(), packet.get());
     if (ffmpegStatus < AVSUCCESS) {
       throw std::runtime_error(
           "Could not push packet to decoder: " +
           getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
     }
+
     decodeStats_.numPacketsSentToDecoder++;
   }
+
   if (ffmpegStatus < AVSUCCESS) {
     if (reachedEOF || ffmpegStatus == AVERROR_EOF) {
       throw VideoDecoder::EndOfFileException(
@@ -903,6 +935,7 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
         "Could not receive frame from decoder: " +
         getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
   }
+
   // Note that we don't flush the decoder when we reach EOF (even though that's
   // mentioned in https://ffmpeg.org/doxygen/trunk/group__lavc__encdec.html).
   // This is because we may have packets internally in the decoder that we
@@ -912,10 +945,8 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
   StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
   activeStreamInfo.currentPts = avFrame->pts;
   activeStreamInfo.currentDuration = getDuration(avFrame);
-  AVFrameStream avFrameStream;
-  avFrameStream.streamIndex = frameStreamIndex;
-  avFrameStream.avFrame = std::move(avFrame);
-  return avFrameStream;
+
+  return AVFrameStream(std::move(avFrame), frameStreamIndex);
 }
 
 VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
@@ -1027,7 +1058,7 @@ void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
   } else if (
       streamInfo.colorConversionLibrary ==
       ColorConversionLibrary::FILTERGRAPH) {
-    if (!streamInfo.filterState.filterGraph ||
+    if (!streamInfo.filterGraphContext.filterGraph ||
         streamInfo.prevFrameContext != frameContext) {
       createFilterGraph(streamInfo, expectedOutputHeight, expectedOutputWidth);
       streamInfo.prevFrameContext = frameContext;
@@ -1079,8 +1110,8 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAtNoDemux(
   }
 
   setCursorPtsInSeconds(seconds);
-  AVFrameStream avFrameStream = getAVFrameUsingFilterFunction(
-      [seconds, this](int frameStreamIndex, AVFrame* avFrame) {
+  AVFrameStream avFrameStream =
+      decodeAVFrame([seconds, this](int frameStreamIndex, AVFrame* avFrame) {
         StreamInfo& streamInfo = streamInfos_[frameStreamIndex];
         double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
         double frameEndTime = ptsToSeconds(
@@ -1480,8 +1511,8 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemux() {
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemuxInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  AVFrameStream avFrameStream = getAVFrameUsingFilterFunction(
-      [this](int frameStreamIndex, AVFrame* avFrame) {
+  AVFrameStream avFrameStream =
+      decodeAVFrame([this](int frameStreamIndex, AVFrame* avFrame) {
         StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
         return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
       });
@@ -1585,16 +1616,17 @@ int VideoDecoder::convertAVFrameToTensorUsingSwsScale(
 torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
     int streamIndex,
     const AVFrame* avFrame) {
-  FilterState& filterState = streamInfos_[streamIndex].filterState;
+  FilterGraphContext& filterGraphContext =
+      streamInfos_[streamIndex].filterGraphContext;
   int ffmpegStatus =
-      av_buffersrc_write_frame(filterState.sourceContext, avFrame);
+      av_buffersrc_write_frame(filterGraphContext.sourceContext, avFrame);
   if (ffmpegStatus < AVSUCCESS) {
     throw std::runtime_error("Failed to add frame to buffer source context");
   }
 
   UniqueAVFrame filteredAVFrame(av_frame_alloc());
-  ffmpegStatus =
-      av_buffersink_get_frame(filterState.sinkContext, filteredAVFrame.get());
+  ffmpegStatus = av_buffersink_get_frame(
+      filterGraphContext.sinkContext, filteredAVFrame.get());
   TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);
 
   auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get());