Merge branch 'main' of github.com:pytorch/torchcodec into audioooooooo

NicolasHug · NicolasHug · commit 893c3583790e · 2025-03-11T09:54:54.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -569,10 +569,8 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
-  AVFrameStream avFrameStream = decodeAVFrame([this](AVFrame* avFrame) {
-    StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
-    return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
-  });
+  AVFrameStream avFrameStream = decodeAVFrame(
+      [this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
   return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
 }
 
@@ -909,7 +907,9 @@ torch::Tensor VideoDecoder::getFramesPlayedInRangeAudio(
 // --------------------------------------------------------------------------
 
 void VideoDecoder::setCursorPtsInSeconds(double seconds) {
-  desiredPtsSeconds_ = seconds;
+  cursorWasJustSet_ = true;
+  cursor_ =
+      secondsToClosestPts(seconds, streamInfos_[activeStreamIndex_].timeBase);
 }
 
 /*
@@ -937,29 +937,29 @@ I    P     P    P    I    P    P    P    I    P    P    I    P    P    I    P
 
 (2) is more efficient than (1) if there is an I frame between x and y.
 */
-bool VideoDecoder::canWeAvoidSeeking(int64_t targetPts) const {
+bool VideoDecoder::canWeAvoidSeeking() const {
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     return true;
   }
-
-  int64_t lastDecodedAvFramePts = streamInfo.lastDecodedAvFramePts;
-  if (targetPts < lastDecodedAvFramePts) {
+  int64_t lastDecodedAvFramePts =
+      streamInfos_.at(activeStreamIndex_).lastDecodedAvFramePts;
+  if (cursor_ < lastDecodedAvFramePts) {
     // We can never skip a seek if we are seeking backwards.
     return false;
   }
-  if (lastDecodedAvFramePts == targetPts) {
+  if (lastDecodedAvFramePts == cursor_) {
     // We are seeking to the exact same frame as we are currently at. Without
     // caching we have to rewind back and decode the frame again.
     // TODO: https://github.com/pytorch-labs/torchcodec/issues/84 we could
     // implement caching.
     return false;
   }
   // We are seeking forwards.
-  // We can only skip a seek if both lastDecodedAvFramePts and targetPts share
-  // the same keyframe.
+  // We can only skip a seek if both lastDecodedAvFramePts and
+  // cursor_ share the same keyframe.
   int lastDecodedAvFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts);
-  int targetKeyFrameIndex = getKeyFrameIndexForPts(targetPts);
+  int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_);
   return lastDecodedAvFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
       lastDecodedAvFrameIndex == targetKeyFrameIndex;
 }
@@ -971,16 +971,14 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   validateActiveStream();
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
 
-  int64_t desiredPts =
-      secondsToClosestPts(*desiredPtsSeconds_, streamInfo.timeBase);
-  streamInfo.discardFramesBeforePts = desiredPts;
-
   decodeStats_.numSeeksAttempted++;
-  if (canWeAvoidSeeking(desiredPts)) {
+  if (canWeAvoidSeeking()) {
     decodeStats_.numSeeksSkipped++;
     return;
   }
 
+  int64_t desiredPts = cursor_;
+
   // For some encodings like H265, FFMPEG sometimes seeks past the point we
   // set as the max_ts. So we use our own index to give it the exact pts of
   // the key frame that we want to seek to.
@@ -1019,10 +1017,9 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
 
   resetDecodeStats();
 
-  // Seek if needed.
-  if (desiredPtsSeconds_.has_value()) {
+  if (cursorWasJustSet_) {
     maybeSeekToBeforeDesiredPts();
-    desiredPtsSeconds_ = std::nullopt;
+    cursorWasJustSet_ = false;
   }
 
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -337,15 +337,11 @@ class VideoDecoder {
     std::vector<FrameInfo> keyFrames;
     std::vector<FrameInfo> allFrames;
 
-    // The current position of the cursor in the stream, and associated frame
-    // duration.
+    // TODO: since the decoder is single-stream, these should be decoder
+    // fields rather than StreamInfo fields, defined right next to `cursor_`
+    // with joint documentation.
     int64_t lastDecodedAvFramePts = 0;
     int64_t lastDecodedAvFrameDuration = 0;
-    // The desired position of the cursor in the stream. We send frames >=
-    // this pts to the user when they request a frame.
-    // We update this field if the user requested a seek. This typically
-    // corresponds to the decoder's desiredPts_ attribute.
-    int64_t discardFramesBeforePts = INT64_MIN;
     VideoStreamOptions videoStreamOptions;
 
     // color-conversion fields. Only one of FilterGraphContext and
@@ -368,7 +364,7 @@ class VideoDecoder {
   // DECODING APIS AND RELATED UTILS
   // --------------------------------------------------------------------------
 
-  bool canWeAvoidSeeking(int64_t targetPts) const;
+  bool canWeAvoidSeeking() const;
 
   void maybeSeekToBeforeDesiredPts();
 
@@ -477,9 +473,11 @@ class VideoDecoder {
   std::map<int, StreamInfo> streamInfos_;
   const int NO_ACTIVE_STREAM = -2;
   int activeStreamIndex_ = NO_ACTIVE_STREAM;
-  // Set when the user wants to seek and stores the desired pts that the user
-  // wants to seek to.
-  std::optional<double> desiredPtsSeconds_;
+
+  bool cursorWasJustSet_ = false;
+  // The desired position of the cursor in the stream. We send frames >= this
+  // pts to the user when they request a frame.
+  int64_t cursor_ = INT64_MIN;
   // Stores various internal decoding stats.
   DecodeStats decodeStats_;
   // Stores the AVIOContext for the input buffer.
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -143,20 +143,6 @@ at::Tensor create_from_tensor(
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 
-at::Tensor create_from_buffer(
-    const void* buffer,
-    size_t length,
-    std::optional<std::string_view> seek_mode) {
-  VideoDecoder::SeekMode realSeek = VideoDecoder::SeekMode::exact;
-  if (seek_mode.has_value()) {
-    realSeek = seekModeFromString(seek_mode.value());
-  }
-
-  std::unique_ptr<VideoDecoder> uniqueDecoder =
-      std::make_unique<VideoDecoder>(buffer, length, realSeek);
-  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
-}
-
 void add_video_stream(
     at::Tensor& decoder,
     std::optional<int64_t> width,
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -28,13 +28,6 @@ at::Tensor create_from_tensor(
     at::Tensor video_tensor,
     std::optional<std::string_view> seek_mode = std::nullopt);
 
-// This API is C++ only and will not be exposed via custom ops, use
-// videodecoder_create_from_bytes in Python
-at::Tensor create_from_buffer(
-    const void* buffer,
-    size_t length,
-    std::optional<std::string_view> seek_mode = std::nullopt);
-
 // Add a new video stream at `stream_index` using the provided options.
 void add_video_stream(
     at::Tensor& decoder,
diff --git a/test/decoders/CMakeLists.txt b/test/decoders/CMakeLists.txt
@@ -21,28 +21,14 @@ add_executable(
   VideoDecoderTest.cpp
 )
 
-add_executable(
-  VideoDecoderOpsTest
-  VideoDecoderOpsTest.cpp
-)
-
 target_include_directories(VideoDecoderTest SYSTEM PRIVATE ${TORCH_INCLUDE_DIRS})
 target_include_directories(VideoDecoderTest PRIVATE ../../)
-target_include_directories(VideoDecoderOpsTest SYSTEM PRIVATE ${TORCH_INCLUDE_DIRS})
-target_include_directories(VideoDecoderOpsTest PRIVATE ../../)
 
 target_link_libraries(
   VideoDecoderTest
   ${libtorchcodec_target_name}
   GTest::gtest_main
 )
 
-target_link_libraries(
-  VideoDecoderOpsTest
-  ${libtorchcodec_target_name}
-  GTest::gtest_main
-)
-
 include(GoogleTest)
 gtest_discover_tests(VideoDecoderTest)
-gtest_discover_tests(VideoDecoderOpsTest)
diff --git a/test/decoders/VideoDecoderOpsTest.cpp b/test/decoders/VideoDecoderOpsTest.cpp