meta-pytorch · NicolasHug · Oct 4, 2025 · Sep 25, 2025 · Sep 26, 2025 · Sep 30, 2025
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -37,7 +37,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   explicit BetaCudaDeviceInterface(const torch::Device& device);
   virtual ~BetaCudaDeviceInterface();
 
-  void initializeInterface(AVStream* stream) override;
+  void initializeInterface(
+      const AVStream* stream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
@@ -52,50 +54,22 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   }
 
   int sendPacket(ReferenceAVPacket& packet) override;
-  int receiveFrame(UniqueAVFrame& avFrame, int64_t desiredPts) override;
+  int sendEOFPacket() override;
+  int receiveFrame(UniqueAVFrame& avFrame) override;
   void flush() override;
 
   // NVDEC callback functions (must be public for C callbacks)
   int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
-  int frameReadyForDecoding(CUVIDPICPARAMS* pPicParams);
+  int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
+  int frameReadyInDisplayOrder(CUVIDPARSERDISPINFO* dispInfo);
 
  private:
+  int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);
   // Apply bitstream filter, modifies packet in-place
   void applyBSF(ReferenceAVPacket& packet);
-
-  class FrameBuffer {
-   public:
-    struct Slot {
-      CUVIDPARSERDISPINFO dispInfo;
-      int64_t guessedPts;
-      bool occupied = false;
-
-      Slot() : guessedPts(-1), occupied(false) {
-        std::memset(&dispInfo, 0, sizeof(dispInfo));
-      }
-    };
-
-    // TODONVDEC P1: init size should probably be min_num_decode_surfaces from
-    // video format
-    FrameBuffer() : frameBuffer_(4) {}
-
-    ~FrameBuffer() = default;
-
-    Slot* findEmptySlot();
-    Slot* findFrameWithExactPts(int64_t desiredPts);
-
-    // Iterator support for range-based for loops
-    auto begin() {
-      return frameBuffer_.begin();
-    }
-
-    auto end() {
-      return frameBuffer_.end();
-    }
-
-   private:
-    std::vector<Slot> frameBuffer_;
-  };
+  void initializeBSF(
+      const AVCodecParameters* codecPar,
+      const UniqueDecodingAVFormatContext& avFormatCtx);
 
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,
@@ -106,17 +80,16 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   UniqueCUvideodecoder decoder_;
   CUVIDEOFORMAT videoFormat_ = {};
 
-  FrameBuffer frameBuffer_;
-
-  std::queue<int64_t> packetsPtsQueue;
+  std::queue<CUVIDPARSERDISPINFO> readyFrames_;
 
   bool eofSent_ = false;
 
   // Flush flag to prevent decode operations during flush (like DALI's
   // isFlushing_)
   bool isFlushing_ = false;
 
-  AVRational timeBase_ = {0, 0};
+  AVRational timeBase_ = {0, 1};
+  AVRational frameRateAvgFromFFmpeg_ = {0, 1};
 
   UniqueAVBSFContext bitstreamFilter_;
 
@@ -127,3 +100,92 @@ class BetaCudaDeviceInterface : public DeviceInterface {
 };
 
 } // namespace facebook::torchcodec
+
+/* clang-format off */
+// Note: [General design, sendPacket, receiveFrame, frame ordering and NVCUVID callbacks]
+//
+// At a high level, this decoding interface mimics the FFmpeg send/receive
+// architecture:
+// - sendPacket(AVPacket) sends an AVPacket from the FFmpeg demuxer to the
+//   NVCUVID parser.
+// - receiveFrame(AVFrame) is a non-blocking call:
+//   - if a frame is ready **in display order**, it must return it. By display
+//   order, we mean that receiveFrame() must return frames with increasing pts
+//   values when called successively.
+//   - if no frame is ready, it must return AVERROR(EAGAIN) to indicate the
+//   caller should send more packets.
+//
+// The rest of this note assumes you have a reasonable level of familiarity with
+// the sendPacket/receiveFrame calling pattern. If you don't, look up the core
+// decoding loop in SingleStreamDecoder::decodeAVFrame().
+//
+// The frame re-ordering problem:
+// Depending on the codec and on the encoding parameters, a packet from a video
+// stream may contain exactly one frame, more than one frame, or a fraction of a
+// frame. And, there may be non-linear frame dependencies because of B-frames,
+// which need both past *and* future frames to be decoded. Consider the
+// following stream, with frames presented in display order: I0 B1 P2 B3 P4 ...
+// - I0 is an I-frame (also key frame, can be decoded independently)
+// - B1 is a B-frame (bi-directional) which needs both I0 and P2 to be decoded
+// - P2 is a P-frame (predicted frame) which only needs I0 to be decoded.
+//
+// Because B1 needs both I0 and P2 to be properly decoded, the decode order
+// (packet order), defined by the encoder, must be: I0 P2 B1 P4 B3 ... which is
+// different from the display order.
+//
+// sendPacket(AVPacket)'s job is just to pass down the packet to the NVCUVID
+// parser by calling cuvidParseVideoData(packet). When
+// cuvidParseVideoData(packet) is called, it may trigger callbacks,
+// particularly:
+// - streamPropertyChange(videoFormat): triggered once at the start of the
+//   stream, and possibly later if the stream properties change (e.g.
+//   resolution).
+// - frameReadyForDecoding(picParams): triggered **in decode order** when the
+//   parser has accumulated enough data to decode a frame. We send that frame to
+//   the NVDEC hardware for **async** decoding.
+// - frameReadyInDisplayOrder(dispInfo): triggered **in display order** when a
+//   frame is ready to be "displayed" (returned). At that point, the parser also
+//   gives us the pts of that frame. We store (a reference to) that frame in a
+//   FIFO queue: readyFrames_.
+//
+// When receiveFrame(AVFrame) is called, if readyFrames_ is not empty, we pop
+// the front of the queue, which is the next frame in display order, and map it
+// to an AVFrame by calling cuvidMapVideoFrame(). If readyFrames_ is empty we
+// return EAGAIN to indicate the caller should send more packets.
+//
+// There is potentially a small inefficiency due to the callback design: in
+// order for us to know that a frame is ready in display order, we need the
+// frameReadyInDisplayOrder callback to be triggered. This can only happen
+// within cuvidParseVideoData(packet) in sendPacket(). This means there may be
+// the following sequence of calls:
+//
+// sendPacket(relevantAVPacket)
+//   cuvidParseVideoData(relevantAVPacket)
+//     frameReadyForDecoding()
+//       cuvidDecodePicture()            Send frame to NVDEC for async decoding
+//
+// receiveFrame() -> EAGAIN              Frame is potentially already decoded
+//                                       and could be returned, but we don't
+//                                       know because frameReadyInDisplayOrder
+//                                       hasn't been triggered yet. We'll only
+//                                       know after sending another,
+//                                       potentially irrelevant packet.
+//
+// sendPacket(irrelevantAVPacket)
+//   cuvidParseVideoData(irrelevantAVPacket)
+//     frameReadyInDisplayOrder()       Only now do we know that our target
+//                                      frame is ready.
+//
+// receiveFrame()                       return target frame
+//
+// How much this matters in practice is unclear, but probably negligible in
+// general. Particularly when frames are decoded consecutively anyway, the
+// "irrelevantAVPacket" is actually relevant for a future target frame.
+//
+// Note that the alternative is to *not* rely on the frameReadyInDisplayOrder
+// callback. It's technically possible, but it would mean we now have to solve
+// two hard, *codec-dependent* problems that the callback was solving for us:
+// - we have to guess the frame's pts ourselves
+// - we have to re-order the frames ourselves to preserve display order.
+//
+/* clang-format on */
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -55,7 +55,9 @@ class DeviceInterface {
   virtual void initializeContext(
       [[maybe_unused]] AVCodecContext* codecContext) {}
 
-  virtual void initializeInterface([[maybe_unused]] AVStream* stream) {}
+  virtual void initializeInterface(
+      [[maybe_unused]] const AVStream* stream,
+      [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) {}
 
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
@@ -84,12 +86,18 @@ class DeviceInterface {
     return AVERROR(ENOSYS);
   }
 
+  // Send an EOF packet to flush the decoder
+  // Returns AVSUCCESS on success, or other AVERROR on failure
+  virtual int sendEOFPacket() {
+    TORCH_CHECK(
+        false, "Send EOF packet not implemented for this device interface");
+    return AVERROR(ENOSYS);
+  }
+
   // Moral equivalent of avcodec_receive_frame()
   // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready,
   // AVERROR_EOF if end of stream, or other AVERROR on failure
-  virtual int receiveFrame(
-      [[maybe_unused]] UniqueAVFrame& avFrame,
-      [[maybe_unused]] int64_t desiredPts) {
+  virtual int receiveFrame([[maybe_unused]] UniqueAVFrame& avFrame) {
     TORCH_CHECK(
         false,
         "Send/receive packet decoding not implemented for this device interface");

diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -501,4 +501,26 @@ AVIOContext* avioAllocContext(
       seek);
 }
 
+double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
+  // To perform the multiplication before the division, av_q2d is not used
+  return static_cast<double>(pts) * timeBase.num / timeBase.den;
+}
+
+int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
+  return static_cast<int64_t>(
+      std::round(seconds * timeBase.den / timeBase.num));
+}
+
+int64_t computeSafeDuration(
+    const AVRational& frameRate,
+    const AVRational& timeBase) {
+  if (frameRate.num <= 0 || frameRate.den <= 0 || timeBase.num <= 0 ||
+      timeBase.den <= 0) {
+    return 0;
+  } else {
+    return (static_cast<int64_t>(frameRate.den) * timeBase.den) /
+        (static_cast<int64_t>(timeBase.num) * frameRate.num);
+  }
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -232,4 +232,10 @@ AVIOContext* avioAllocContext(
     AVIOWriteFunction write_packet,
     AVIOSeekFunction seek);
 
+double ptsToSeconds(int64_t pts, const AVRational& timeBase);
+int64_t secondsToClosestPts(double seconds, const AVRational& timeBase);
+int64_t computeSafeDuration(
+    const AVRational& frameRate,
+    const AVRational& timeBase);
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -17,16 +17,6 @@
 namespace facebook::torchcodec {
 namespace {
 
-double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
-  // To perform the multiplication before the division, av_q2d is not used
-  return static_cast<double>(pts) * timeBase.num / timeBase.den;
-}
-
-int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
-  return static_cast<int64_t>(
-      std::round(seconds * timeBase.den / timeBase.num));
-}
-
 // Some videos aren't properly encoded and do not specify pts values for
 // packets, and thus for frames. Unset values correspond to INT64_MIN. When that
 // happens, we fallback to the dts value which hopefully exists and is correct.
@@ -462,7 +452,7 @@ void SingleStreamDecoder::addStream(
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
     if (deviceInterface_) {
       deviceInterface_->initializeContext(codecContext);
-      deviceInterface_->initializeInterface(streamInfo.stream);
+      deviceInterface_->initializeInterface(streamInfo.stream, formatContext_);
     }
   }
 
@@ -1171,7 +1161,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
 
   while (true) {
     if (useCustomInterface) {
-      status = deviceInterface_->receiveFrame(avFrame, cursor_);
+      status = deviceInterface_->receiveFrame(avFrame);
     } else {
       status =
           avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
@@ -1212,12 +1202,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
       if (status == AVERROR_EOF) {
         // End of file reached. We must drain the decoder
         if (useCustomInterface) {
-          // TODONVDEC P0: Re-think this. This should be simpler.
-          AutoAVPacket eofAutoPacket;
-          ReferenceAVPacket eofPacket(eofAutoPacket);
-          eofPacket->data = nullptr;
-          eofPacket->size = 0;
-          status = deviceInterface_->sendPacket(eofPacket);
+          status = deviceInterface_->sendEOFPacket();
         } else {
           status = avcodec_send_packet(
               streamInfo.codecContext.get(),

diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -155,12 +155,6 @@ def __init__(
                 device_variant = device_split[2]
                 device = ":".join(device_split[0:2])
 
-        # TODONVDEC P0 Support approximate mode. Not ideal to validate that here
-        # either, but validating this at a lower level forces to add yet another
-        # (temprorary) validation API to the device inteface
-        if device_variant == "beta" and seek_mode != "exact":
-            raise ValueError("Seek mode must be exact for BETA CUDA interface.")
-
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,

diff --git a/test/resources/testsrc2_h265.mp4 b/test/resources/testsrc2_h265.mp4