Reorganize public APIs in VideoDecoder.h (#477)

NicolasHug · web-flow · commit baa9798ae7cc · 2025-01-27T17:05:53.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -16,30 +16,7 @@
 
 namespace facebook::torchcodec {
 
-/*
-The VideoDecoder class can be used to decode video frames to Tensors.
-
-Example usage of this class:
-std::string video_file_path = "/path/to/video.mp4";
-VideoDecoder video_decoder = VideoDecoder::createFromFilePath(video_file_path);
-
-// After creating the decoder, we can query the metadata:
-auto metadata = video_decoder.getContainerMetadata();
-
-// We can also add streams to the decoder:
-// -1 sets the default stream.
-video_decoder.addVideoStreamDecoder(-1);
-
-// API for seeking and frame extraction:
-// Let's extract the first frame at or after pts=5.0 seconds.
-video_decoder.setCursorPtsInSeconds(5.0);
-auto output = video_decoder->getNextFrameOutput();
-torch::Tensor frame = output.frame;
-double presentation_timestamp = output.ptsSeconds;
-// Note that presentation_timestamp can be any timestamp at 5.0 or above
-// because the frame time may not align exactly with the seek time.
-CHECK_GE(presentation_timestamp, 5.0);
-*/
+// The VideoDecoder class can be used to decode video frames to Tensors.
 // Note that VideoDecoder is not thread-safe.
 // Do not call non-const APIs concurrently on the same object.
 class VideoDecoder {
@@ -52,17 +29,16 @@ class VideoDecoder {
 
   enum class SeekMode { exact, approximate };
 
-  // Creates a VideoDecoder from the video at videoFilePath.
   explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
-
-  // Creates a VideoDecoder from a given buffer. Note that the buffer is not
-  // owned by the VideoDecoder.
   explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
 
+  // Creates a VideoDecoder from the video at videoFilePath.
   static std::unique_ptr<VideoDecoder> createFromFilePath(
       const std::string& videoFilePath,
       SeekMode seekMode = SeekMode::exact);
 
+  // Creates a VideoDecoder from a given buffer. Note that the buffer is not
+  // owned by the VideoDecoder.
   static std::unique_ptr<VideoDecoder> createFromBuffer(
       const void* buffer,
       size_t length,
@@ -71,8 +47,10 @@ class VideoDecoder {
   // --------------------------------------------------------------------------
   // VIDEO METADATA QUERY API
   // --------------------------------------------------------------------------
+
   // Updates the metadata of the video to accurate values obtained by scanning
-  // the contents of the video file.
+  // the contents of the video file. Also updates each StreamInfo's index, i.e.
+  // the allFrames and keyFrames vectors.
   void scanFileAndUpdateMetadataAndIndex();
 
   struct StreamMetadata {
@@ -88,7 +66,6 @@ class VideoDecoder {
     std::optional<int64_t> numKeyFrames;
     std::optional<double> averageFps;
     std::optional<double> bitRate;
-    std::optional<std::vector<int64_t>> keyFrames;
 
     // More accurate duration, obtained by scanning the file.
     // These presentation timestamps are in time base.
@@ -126,6 +103,7 @@ class VideoDecoder {
   // --------------------------------------------------------------------------
   // ADDING STREAMS API
   // --------------------------------------------------------------------------
+
   enum ColorConversionLibrary {
     // TODO: Add an AUTO option later.
     // Use the libavfilter library for color conversion.
@@ -164,96 +142,71 @@ class VideoDecoder {
       int streamIndex,
       const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
 
-  torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
-
-  // ---- SINGLE FRAME SEEK AND DECODING API ----
-  // Places the cursor at the first frame on or after the position in seconds.
-  // Calling getNextFrameNoDemuxInternal() will return the first frame at
-  // or after this position.
-  void setCursorPtsInSeconds(double seconds);
-
-  // This structure ensures we always keep the streamIndex and AVFrame together
-  // Note that AVFrame itself doesn't retain the streamIndex.
-  struct AVFrameStream {
-    // The actual decoded output as a unique pointer to an AVFrame.
-    UniqueAVFrame avFrame;
-    // The stream index of the decoded frame.
-    int streamIndex;
-
-    explicit AVFrameStream(UniqueAVFrame&& a, int s)
-        : avFrame(std::move(a)), streamIndex(s) {}
-  };
+  // --------------------------------------------------------------------------
+  // DECODING AND SEEKING APIs
+  // --------------------------------------------------------------------------
 
+  // All public decoding entry points return either a FrameOutput or a
+  // FrameBatchOutput.
+  // They are the equivalent of the user-facing Frame and FrameBatch classes in
+  // Python. They contain RGB decoded frames along with some associated data
+  // like PTS and duration.
   struct FrameOutput {
-    // The actual decoded output as a Tensor.
-    torch::Tensor data;
-    // The stream index of the decoded frame. Used to distinguish
-    // between streams that are of the same type.
+    torch::Tensor data; // 3D: of shape CHW or HWC.
     int streamIndex;
-    // The presentation timestamp of the decoded frame in seconds.
     double ptsSeconds;
-    // The duration of the decoded frame in seconds.
     double durationSeconds;
   };
 
   struct FrameBatchOutput {
-    torch::Tensor data;
-    torch::Tensor ptsSeconds;
-    torch::Tensor durationSeconds;
+    torch::Tensor data; // 4D: of shape NCHW or NHWC.
+    torch::Tensor ptsSeconds; // 1D of shape (N,)
+    torch::Tensor durationSeconds; // 1D of shape (N,)
 
     explicit FrameBatchOutput(
         int64_t numFrames,
         const VideoStreamOptions& videoStreamOptions,
         const StreamMetadata& streamMetadata);
   };
 
-  class EndOfFileException : public std::runtime_error {
-   public:
-    explicit EndOfFileException(const std::string& msg)
-        : std::runtime_error(msg) {}
-  };
+  // Places the cursor at the first frame on or after the position in seconds.
+  // Calling getNextFrameNoDemux() will return the first frame at
+  // or after this position.
+  void setCursorPtsInSeconds(double seconds);
 
   // Decodes the frame where the current cursor position is. It also advances
   // the cursor to the next frame.
   FrameOutput getNextFrameNoDemux();
-  // Decodes the first frame in any added stream that is visible at a given
-  // timestamp. Frames in the video have a presentation timestamp and a
-  // duration. For example, if a frame has presentation timestamp of 5.0s and a
-  // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0).
-  // i.e. it will be returned when this function is called with seconds=5.0 or
-  // seconds=5.999, etc.
-  FrameOutput getFramePlayedAtNoDemux(double seconds);
 
   FrameOutput getFrameAtIndex(int streamIndex, int64_t frameIndex);
-  // This is morally private but needs to be exposed for C++ tests. Once
-  // getFrameAtIndex supports the preAllocatedOutputTensor parameter, we can
-  // move it back to private.
-  FrameOutput getFrameAtIndexInternal(
-      int streamIndex,
-      int64_t frameIndex,
-      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
   // Returns frames at the given indices for a given stream as a single stacked
   // Tensor.
   FrameBatchOutput getFramesAtIndices(
       int streamIndex,
       const std::vector<int64_t>& frameIndices);
 
+  // Returns frames within a given range. The range is defined by [start, stop).
+  // The values retrieved from the range are: start, start+step, start+(2*step),
+  // and so on, up to but excluding stop. No default for step is declared here.
+  FrameBatchOutput
+  getFramesInRange(int streamIndex, int64_t start, int64_t stop, int64_t step);
+
+  // Decodes the first frame in any added stream that is visible at a given
+  // timestamp. Frames in the video have a presentation timestamp and a
+  // duration. For example, if a frame has presentation timestamp of 5.0s and a
+  // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0).
+  // i.e. it will be returned when this function is called with seconds=5.0 or
+  // seconds=5.999, etc.
+  FrameOutput getFramePlayedAtNoDemux(double seconds);
+
   FrameBatchOutput getFramesPlayedAt(
       int streamIndex,
       const std::vector<double>& timestamps);
 
-  // Returns frames within a given range for a given stream as a single stacked
-  // Tensor. The range is defined by [start, stop). The values retrieved from
-  // the range are:
-  //    [start, start+step, start+(2*step), start+(3*step), ..., stop)
-  // The default for step is 1.
-  FrameBatchOutput
-  getFramesInRange(int streamIndex, int64_t start, int64_t stop, int64_t step);
-
-  // Returns frames within a given pts range for a given stream as a single
-  // stacked tensor. The range is defined by [startSeconds, stopSeconds) with
-  // respect to the pts values for frames. The returned frames are in pts order.
+  // Returns frames within a given pts range. The range is defined by
+  // [startSeconds, stopSeconds) with respect to the pts values for frames. The
+  // returned frames are in pts order.
   //
   // Note that while stopSeconds is excluded in the half open range, this really
   // only makes a difference when stopSeconds is exactly the pts value for a
@@ -273,11 +226,47 @@ class VideoDecoder {
       double startSeconds,
       double stopSeconds);
 
+  class EndOfFileException : public std::runtime_error {
+   public:
+    explicit EndOfFileException(const std::string& msg)
+        : std::runtime_error(msg) {}
+  };
+
   // --------------------------------------------------------------------------
-  // DECODER PERFORMANCE STATISTICS API
+  // MORALLY PRIVATE APIS
   // --------------------------------------------------------------------------
+  // These are APIs that should be private, but that are effectively exposed for
+  // practical reasons, typically for testing purposes.
+
+  // This struct is needed because AVFrame doesn't retain the streamIndex. Only
+  // the AVPacket knows its stream. This is what the low-level private decoding
+  // entry points return. The AVFrameStream is then converted to a FrameOutput
+  // with convertAVFrameToFrameOutput. It should be private, but is currently
+  // used by DeviceInterface.
+  struct AVFrameStream {
+    // The actual decoded output as a unique pointer to an AVFrame.
+    // Usually, this is a YUV frame. It'll be converted to RGB in
+    // convertAVFrameToFrameOutput.
+    UniqueAVFrame avFrame;
+    // The stream index of the decoded frame.
+    int streamIndex;
 
-  // Only exposed for performance testing.
+    explicit AVFrameStream(UniqueAVFrame&& a, int s)
+        : avFrame(std::move(a)), streamIndex(s) {}
+  };
+
+  // Once getFrameAtIndex supports the preAllocatedOutputTensor parameter,
+  // getFrameAtIndexInternal can be moved back to private.
+  FrameOutput getFrameAtIndexInternal(
+      int streamIndex,
+      int64_t frameIndex,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  // Exposed for _test_frame_pts_equality, which is used to test non-regression
+  // of pts resolution (64-bit to 32-bit floats).
+  double getPtsSecondsForFrame(int streamIndex, int64_t frameIndex);
+
+  // Exposed for performance testing.
   struct DecodeStats {
     int64_t numSeeksAttempted = 0;
     int64_t numSeeksDone = 0;
@@ -291,9 +280,9 @@ class VideoDecoder {
   DecodeStats getDecodeStats() const;
   void resetDecodeStats();
 
-  double getPtsSecondsForFrame(int streamIndex, int64_t frameIndex);
-
  private:
+  torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
+
   struct FrameInfo {
     int64_t pts = 0;
     // The value of this default is important: the last frame's nextPts will be
@@ -404,8 +393,10 @@ class VideoDecoder {
       const enum AVColorSpace colorspace);
 
   void maybeSeekToBeforeDesiredPts();
+
   AVFrameStream decodeAVFrame(
       std::function<bool(int, AVFrame*)> filterFunction);
+
   // Once we create a decoder can update the metadata with the codec context.
   // For example, for video streams, we can add the height and width of the
   // decoded stream.