Merge branch 'main' of github.com:pytorch/torchcodec into header_again

NicolasHug · NicolasHug · commit 7bfefad96b65 · 2025-01-27T17:29:11.000Z
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
@@ -67,7 +67,8 @@ jobs:
           # include more python versions.
         python-version: ['3.9']
         cuda-version: ['11.8', '12.4', '12.6']
-        ffmpeg-version-for-tests: ['5', '6', '7']
+        # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
+        ffmpeg-version-for-tests: ['6', '7']
     container:
       image: "pytorch/manylinux2_28-builder:cuda${{ matrix.cuda-version }}"
       options: "--gpus all -e NVIDIA_DRIVER_CAPABILITIES=video,compute,utility"
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -92,6 +92,8 @@ class AutoAVPacket {
 
  public:
   AutoAVPacket();
+  AutoAVPacket(const AutoAVPacket& other) = delete;
+  AutoAVPacket& operator=(const AutoAVPacket& other) = delete;
   ~AutoAVPacket();
 };
 
@@ -100,7 +102,9 @@ class ReferenceAVPacket {
   AVPacket* avPacket_;
 
  public:
-  ReferenceAVPacket(AutoAVPacket& shared);
+  explicit ReferenceAVPacket(AutoAVPacket& shared);
+  ReferenceAVPacket(const ReferenceAVPacket& other) = delete;
+  ReferenceAVPacket& operator=(const ReferenceAVPacket& other) = delete;
   ~ReferenceAVPacket();
   AVPacket* get();
   AVPacket* operator->();
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -804,16 +804,20 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   }
 }
 
-VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
+VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
     std::function<bool(int, AVFrame*)> filterFunction) {
   if (activeStreamIndices_.size() == 0) {
     throw std::runtime_error("No active streams configured.");
   }
+
   resetDecodeStats();
+
+  // Seek if needed.
   if (desiredPtsSeconds_.has_value()) {
     maybeSeekToBeforeDesiredPts();
     desiredPtsSeconds_ = std::nullopt;
   }
+
   // Need to get the next frame or error from PopFrame.
   UniqueAVFrame avFrame(av_frame_alloc());
   AutoAVPacket autoAVPacket;
@@ -823,42 +827,58 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
   while (true) {
     frameStreamIndex = -1;
     bool gotPermanentErrorOnAnyActiveStream = false;
+
+    // Get a frame on an active stream. Note that we don't know ahead of time
+    // which streams have frames to receive, so we linearly try the active
+    // streams.
     for (int streamIndex : activeStreamIndices_) {
       StreamInfo& streamInfo = streamInfos_[streamIndex];
       ffmpegStatus =
           avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
-      bool gotNonRetriableError =
-          ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN);
-      if (gotNonRetriableError) {
+
+      if (ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN)) {
         gotPermanentErrorOnAnyActiveStream = true;
         break;
       }
+
       if (ffmpegStatus == AVSUCCESS) {
         frameStreamIndex = streamIndex;
         break;
       }
     }
+
     if (gotPermanentErrorOnAnyActiveStream) {
       break;
     }
+
     decodeStats_.numFramesReceivedByDecoder++;
-    bool gotNeededFrame = ffmpegStatus == AVSUCCESS &&
-        filterFunction(frameStreamIndex, avFrame.get());
-    if (gotNeededFrame) {
+
+    // Is this the kind of frame we're looking for?
+    if (ffmpegStatus == AVSUCCESS &&
+        filterFunction(frameStreamIndex, avFrame.get())) {
+      // Yes, this is the frame we'll return; break out of the decoding loop.
       break;
     } else if (ffmpegStatus == AVSUCCESS) {
-      // No need to send more packets here as the decoder may have frames in
-      // its buffer.
+      // No, but we received a valid frame - just not the kind we're looking
+      // for. The logic below reads packets and sends them to the decoder,
+      // but since we did just receive a frame, the decoder may have more
+      // frames in its buffer. Skip reading and sending packets for now and
+      // just try to receive another frame from the decoder.
       continue;
     }
+
     if (reachedEOF) {
       // We don't have any more packets to send to the decoder. So keep on
       // pulling frames from its internal buffers.
       continue;
     }
+
+    // We still haven't found the frame we're looking for. So let's read more
+    // packets and send them to the decoder.
     ReferenceAVPacket packet(autoAVPacket);
     ffmpegStatus = av_read_frame(formatContext_.get(), packet.get());
     decodeStats_.numPacketsRead++;
+
     if (ffmpegStatus == AVERROR_EOF) {
       // End of file reached. We must drain all codecs by sending a nullptr
       // packet.
@@ -873,27 +893,38 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
               getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
         }
       }
+
+      // We've reached the end of file so we can't read any more packets from
+      // it, but the decoder may still have frames to read in its buffer.
+      // Continue iterating to try reading frames.
       reachedEOF = true;
       continue;
     }
+
     if (ffmpegStatus < AVSUCCESS) {
       throw std::runtime_error(
           "Could not read frame from input file: " +
           getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
     }
+
     if (activeStreamIndices_.count(packet->stream_index) == 0) {
       // This packet is not for any of the active streams.
       continue;
     }
+
+    // We got a valid packet. Send it to the decoder, and we'll try to receive
+    // the resulting frame in the next iteration.
     ffmpegStatus = avcodec_send_packet(
         streamInfos_[packet->stream_index].codecContext.get(), packet.get());
     if (ffmpegStatus < AVSUCCESS) {
       throw std::runtime_error(
           "Could not push packet to decoder: " +
           getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
     }
+
     decodeStats_.numPacketsSentToDecoder++;
   }
+
   if (ffmpegStatus < AVSUCCESS) {
     if (reachedEOF || ffmpegStatus == AVERROR_EOF) {
       throw VideoDecoder::EndOfFileException(
@@ -904,6 +935,7 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
         "Could not receive frame from decoder: " +
         getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
   }
+
   // Note that we don't flush the decoder when we reach EOF (even though that's
   // mentioned in https://ffmpeg.org/doxygen/trunk/group__lavc__encdec.html).
   // This is because we may have packets internally in the decoder that we
@@ -913,10 +945,8 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
   StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
   activeStreamInfo.currentPts = avFrame->pts;
   activeStreamInfo.currentDuration = getDuration(avFrame);
-  AVFrameStream avFrameStream;
-  avFrameStream.streamIndex = frameStreamIndex;
-  avFrameStream.avFrame = std::move(avFrame);
-  return avFrameStream;
+
+  return AVFrameStream(std::move(avFrame), frameStreamIndex);
 }
 
 VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
@@ -1080,8 +1110,8 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAtNoDemux(
   }
 
   setCursorPtsInSeconds(seconds);
-  AVFrameStream avFrameStream = getAVFrameUsingFilterFunction(
-      [seconds, this](int frameStreamIndex, AVFrame* avFrame) {
+  AVFrameStream avFrameStream =
+      decodeAVFrame([seconds, this](int frameStreamIndex, AVFrame* avFrame) {
         StreamInfo& streamInfo = streamInfos_[frameStreamIndex];
         double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
         double frameEndTime = ptsToSeconds(
@@ -1481,8 +1511,8 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemux() {
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemuxInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  AVFrameStream avFrameStream = getAVFrameUsingFilterFunction(
-      [this](int frameStreamIndex, AVFrame* avFrame) {
+  AVFrameStream avFrameStream =
+      decodeAVFrame([this](int frameStreamIndex, AVFrame* avFrame) {
         StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
         return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
       });
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -29,6 +29,9 @@ class VideoDecoder {
 
   enum class SeekMode { exact, approximate };
 
+  explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
+  explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
+
   // Creates a VideoDecoder from the video at videoFilePath.
   static std::unique_ptr<VideoDecoder> createFromFilePath(
       const std::string& videoFilePath,
@@ -247,6 +250,9 @@ class VideoDecoder {
     UniqueAVFrame avFrame;
     // The stream index of the decoded frame.
     int streamIndex;
+
+    explicit AVFrameStream(UniqueAVFrame&& a, int s)
+        : avFrame(std::move(a)), streamIndex(s) {}
   };
 
   // Once getFrameAtIndex supports the preAllocatedOutputTensor parameter, we
@@ -278,6 +284,7 @@ class VideoDecoder {
   // --------------------------------------------------------------------------
   // STREAMINFO AND ASSOCIATED STRUCTS
   // --------------------------------------------------------------------------
+
   struct FrameInfo {
     int64_t pts = 0;
     // The value of this default is important: the last frame's nextPts will be
@@ -326,10 +333,10 @@ class VideoDecoder {
     int64_t discardFramesBeforePts = INT64_MIN;
     VideoStreamOptions videoStreamOptions;
 
-    // color-conversion fields. Only one of FilterGraphContextr and
+    // color-conversion fields. Only one of FilterGraphContext and
     // UniqueSwsContext should be non-null.
-    ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
     FilterGraphContext filterGraphContext;
+    ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
     UniqueSwsContext swsContext;
 
     // Used to know whether a new FilterGraphContext or UniqueSwsContext should
@@ -338,12 +345,9 @@ class VideoDecoder {
   };
 
   // --------------------------------------------------------------------------
-  // CONSTRUCTORS AND INITIALIZERS
+  // INITIALIZERS
   // --------------------------------------------------------------------------
-  // Don't use those, use the static methods to create a decoder object.
 
-  explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
-  explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
   void initializeDecoder();
   void updateMetadataWithCodecContext(
       int streamIndex,
@@ -360,8 +364,8 @@ class VideoDecoder {
 
   void maybeSeekToBeforeDesiredPts();
 
-  AVFrameStream getAVFrameUsingFilterFunction(
-      std::function<bool(int, AVFrame*)>);
+  AVFrameStream decodeAVFrame(
+      std::function<bool(int, AVFrame*)> filterFunction);
 
   FrameOutput getNextFrameNoDemuxInternal(
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
@@ -435,6 +439,8 @@ class VideoDecoder {
   // STREAM AND METADATA APIS
   // --------------------------------------------------------------------------
 
+  void populateVideoMetadataFromStreamIndex(int streamIndex);
+
   // Returns the "best" stream index for a given media type. The "best" is
   // determined by various heuristics in FFMPEG.
   // See