meta-pytorch
diff --git a/‎benchmarks/decoders/benchmark_decoders_library.py‎
Lines changed: 35 additions & 19 deletions b/‎benchmarks/decoders/benchmark_decoders_library.py‎
Lines changed: 35 additions & 19 deletions
diff --git a/‎src/torchcodec/decoders/_core/CPUOnlyDevice.cpp‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/decoders/_core/CPUOnlyDevice.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/decoders/_core/CudaDevice.cpp‎
Lines changed: 11 additions & 10 deletions b/‎src/torchcodec/decoders/_core/CudaDevice.cpp‎
Lines changed: 11 additions & 10 deletions
diff --git a/‎src/torchcodec/decoders/_core/DeviceInterface.h‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/decoders/_core/DeviceInterface.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/decoders/_core/FFMPEGCommon.cpp‎
Lines changed: 19 additions & 0 deletions b/‎src/torchcodec/decoders/_core/FFMPEGCommon.cpp‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/torchcodec/decoders/_core/FFMPEGCommon.h‎
Lines changed: 38 additions & 2 deletions b/‎src/torchcodec/decoders/_core/FFMPEGCommon.h‎
Lines changed: 38 additions & 2 deletions
@@ -22,7 +22,6 @@
     get_frames_by_pts,
     get_json_metadata,
     get_next_frame,
-    scan_all_streams_to_update_metadata,
     seek_to_pts,
 )
 
@@ -154,8 +153,7 @@ def __init__(self, num_threads=None, color_conversion_library=None, device="cpu"
         self._device = device
 
     def decode_frames(self, video_file, pts_list):
-        decoder = create_from_file(video_file)
-        scan_all_streams_to_update_metadata(decoder)
+        decoder = create_from_file(video_file, seek_mode="exact")
         _add_video_stream(
             decoder,
             num_threads=self._num_threads,
@@ -170,7 +168,7 @@ def decode_frames(self, video_file, pts_list):
         return frames
 
     def decode_first_n_frames(self, video_file, n):
-        decoder = create_from_file(video_file)
+        decoder = create_from_file(video_file, seek_mode="approximate")
         _add_video_stream(
             decoder,
             num_threads=self._num_threads,
@@ -197,7 +195,7 @@ def __init__(self, num_threads=None, color_conversion_library=None, device="cpu"
         self.transforms_v2 = transforms_v2
 
     def decode_frames(self, video_file, pts_list):
-        decoder = create_from_file(video_file)
+        decoder = create_from_file(video_file, seek_mode="approximate")
         num_threads = int(self._num_threads) if self._num_threads else 0
         _add_video_stream(
             decoder,
@@ -216,7 +214,7 @@ def decode_frames(self, video_file, pts_list):
 
     def decode_first_n_frames(self, video_file, n):
         num_threads = int(self._num_threads) if self._num_threads else 0
-        decoder = create_from_file(video_file)
+        decoder = create_from_file(video_file, seek_mode="approximate")
         _add_video_stream(
             decoder,
             num_threads=num_threads,
@@ -233,7 +231,7 @@ def decode_first_n_frames(self, video_file, n):
 
     def decode_and_resize(self, video_file, pts_list, height, width, device):
         num_threads = int(self._num_threads) if self._num_threads else 1
-        decoder = create_from_file(video_file)
+        decoder = create_from_file(video_file, seek_mode="approximate")
         _add_video_stream(
             decoder,
             num_threads=num_threads,
@@ -263,8 +261,7 @@ def __init__(self, num_threads=None, color_conversion_library=None, device="cpu"
         self._device = device
 
     def decode_frames(self, video_file, pts_list):
-        decoder = create_from_file(video_file)
-        scan_all_streams_to_update_metadata(decoder)
+        decoder = create_from_file(video_file, seek_mode="exact")
         _add_video_stream(
             decoder,
             num_threads=self._num_threads,
@@ -279,8 +276,7 @@ def decode_frames(self, video_file, pts_list):
         return frames
 
     def decode_first_n_frames(self, video_file, n):
-        decoder = create_from_file(video_file)
-        scan_all_streams_to_update_metadata(decoder)
+        decoder = create_from_file(video_file, seek_mode="exact")
         _add_video_stream(
             decoder,
             num_threads=self._num_threads,
@@ -297,9 +293,10 @@ def decode_first_n_frames(self, video_file, n):
 
 
 class TorchCodecPublic(AbstractDecoder):
-    def __init__(self, num_ffmpeg_threads=None, device="cpu"):
+    def __init__(self, num_ffmpeg_threads=None, device="cpu", seek_mode="exact"):
         self._num_ffmpeg_threads = num_ffmpeg_threads
         self._device = device
+        self._seek_mode = seek_mode
 
         from torchvision.transforms import v2 as transforms_v2
 
@@ -310,7 +307,10 @@ def decode_frames(self, video_file, pts_list):
             int(self._num_ffmpeg_threads) if self._num_ffmpeg_threads else 0
         )
         decoder = VideoDecoder(
-            video_file, num_ffmpeg_threads=num_ffmpeg_threads, device=self._device
+            video_file,
+            num_ffmpeg_threads=num_ffmpeg_threads,
+            device=self._device,
+            seek_mode=self._seek_mode,
         )
         return decoder.get_frames_played_at(pts_list)
 
@@ -319,7 +319,10 @@ def decode_first_n_frames(self, video_file, n):
             int(self._num_ffmpeg_threads) if self._num_ffmpeg_threads else 0
         )
         decoder = VideoDecoder(
-            video_file, num_ffmpeg_threads=num_ffmpeg_threads, device=self._device
+            video_file,
+            num_ffmpeg_threads=num_ffmpeg_threads,
+            device=self._device,
+            seek_mode=self._seek_mode,
         )
         frames = []
         count = 0
@@ -335,17 +338,21 @@ def decode_and_resize(self, video_file, pts_list, height, width, device):
             int(self._num_ffmpeg_threads) if self._num_ffmpeg_threads else 1
         )
         decoder = VideoDecoder(
-            video_file, num_ffmpeg_threads=num_ffmpeg_threads, device=self._device
+            video_file,
+            num_ffmpeg_threads=num_ffmpeg_threads,
+            device=self._device,
+            seek_mode=self._seek_mode,
         )
         frames = decoder.get_frames_played_at(pts_list)
         frames = self.transforms_v2.functional.resize(frames.data, (height, width))
         return frames
 
 
 class TorchCodecPublicNonBatch(AbstractDecoder):
-    def __init__(self, num_ffmpeg_threads=None, device="cpu"):
+    def __init__(self, num_ffmpeg_threads=None, device="cpu", seek_mode="approximate"):
         self._num_ffmpeg_threads = num_ffmpeg_threads
         self._device = device
+        self._seek_mode = seek_mode
 
         from torchvision.transforms import v2 as transforms_v2
 
@@ -356,7 +363,10 @@ def decode_frames(self, video_file, pts_list):
             int(self._num_ffmpeg_threads) if self._num_ffmpeg_threads else 0
         )
         decoder = VideoDecoder(
-            video_file, num_ffmpeg_threads=num_ffmpeg_threads, device=self._device
+            video_file,
+            num_ffmpeg_threads=num_ffmpeg_threads,
+            device=self._device,
+            seek_mode=self._seek_mode,
         )
 
         frames = []
@@ -370,7 +380,10 @@ def decode_first_n_frames(self, video_file, n):
             int(self._num_ffmpeg_threads) if self._num_ffmpeg_threads else 0
         )
         decoder = VideoDecoder(
-            video_file, num_ffmpeg_threads=num_ffmpeg_threads, device=self._device
+            video_file,
+            num_ffmpeg_threads=num_ffmpeg_threads,
+            device=self._device,
+            seek_mode=self._seek_mode,
         )
         frames = []
         count = 0
@@ -386,7 +399,10 @@ def decode_and_resize(self, video_file, pts_list, height, width, device):
             int(self._num_ffmpeg_threads) if self._num_ffmpeg_threads else 1
         )
         decoder = VideoDecoder(
-            video_file, num_ffmpeg_threads=num_ffmpeg_threads, device=self._device
+            video_file,
+            num_ffmpeg_threads=num_ffmpeg_threads,
+            device=self._device,
+            seek_mode=self._seek_mode,
         )
 
         frames = []
 
@@ -16,7 +16,7 @@ namespace facebook::torchcodec {
 
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
-    [[maybe_unused]] const VideoDecoder::VideoStreamDecoderOptions& options,
+    [[maybe_unused]] const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     [[maybe_unused]] VideoDecoder::RawDecodedOutput& rawOutput,
     [[maybe_unused]] VideoDecoder::DecodedOutput& output,
     [[maybe_unused]] std::optional<torch::Tensor> preAllocatedOutputTensor) {
 
@@ -185,17 +185,18 @@ void initializeContextOnCuda(
 
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  AVFrame* src = rawOutput.frame.get();
+  AVFrame* avFrame = rawOutput.avFrame.get();
 
   TORCH_CHECK(
-      src->format == AV_PIX_FMT_CUDA,
+      avFrame->format == AV_PIX_FMT_CUDA,
       "Expected format to be AV_PIX_FMT_CUDA, got " +
-          std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
-  auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(options, *src);
+          std::string(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)));
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, *avFrame);
   int height = frameDims.height;
   int width = frameDims.width;
   torch::Tensor& dst = output.frame;
@@ -212,28 +213,28 @@ void convertAVFrameToDecodedOutputOnCuda(
         "x3, got ",
         shape);
   } else {
-    dst = allocateEmptyHWCTensor(height, width, options.device);
+    dst = allocateEmptyHWCTensor(height, width, videoStreamOptions.device);
   }
 
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device);
 
   NppiSize oSizeROI = {width, height};
-  Npp8u* input[2] = {src->data[0], src->data[1]};
+  Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
 
   auto start = std::chrono::high_resolution_clock::now();
   NppStatus status;
-  if (src->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
+  if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
     status = nppiNV12ToRGB_709CSC_8u_P2C3R(
         input,
-        src->linesize[0],
+        avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI);
   } else {
     status = nppiNV12ToRGB_8u_P2C3R(
         input,
-        src->linesize[0],
+        avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI);
 
@@ -31,7 +31,7 @@ void initializeContextOnCuda(
 
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
@@ -10,6 +10,25 @@
 
 namespace facebook::torchcodec {
 
+AutoAVPacket::AutoAVPacket() : avPacket_(av_packet_alloc()) {
+  TORCH_CHECK(avPacket_ != nullptr, "Couldn't allocate avPacket.");
+}
+AutoAVPacket::~AutoAVPacket() {
+  av_packet_free(&avPacket_);
+}
+
+ReferenceAVPacket::ReferenceAVPacket(AutoAVPacket& shared)
+    : avPacket_(shared.avPacket_) {}
+ReferenceAVPacket::~ReferenceAVPacket() {
+  av_packet_unref(avPacket_);
+}
+AVPacket* ReferenceAVPacket::get() {
+  return avPacket_;
+}
+AVPacket* ReferenceAVPacket::operator->() {
+  return avPacket_;
+}
+
 AVCodecOnlyUseForCallingAVFindBestStream
 makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
 #if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
 
@@ -57,8 +57,6 @@ using UniqueAVCodecContext = std::unique_ptr<
     Deleterp<AVCodecContext, void, avcodec_free_context>>;
 using UniqueAVFrame =
     std::unique_ptr<AVFrame, Deleterp<AVFrame, void, av_frame_free>>;
-using UniqueAVPacket =
-    std::unique_ptr<AVPacket, Deleterp<AVPacket, void, av_packet_free>>;
 using UniqueAVFilterGraph = std::unique_ptr<
     AVFilterGraph,
     Deleterp<AVFilterGraph, void, avfilter_graph_free>>;
@@ -70,6 +68,44 @@ using UniqueAVIOContext = std::
 using UniqueSwsContext =
     std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
 
+// These 2 classes share the same underlying AVPacket object. They are meant to
+// be used in tandem, like so:
+//
+// AutoAVPacket autoAVPacket; // <-- malloc for AVPacket happens here
+// while(...){
+//   ReferenceAVPacket packet(autoAVPacket);
+//   av_read_frame(..., packet.get());  <-- av_packet_ref() called by FFmpeg
+// } <-- av_packet_unref() called here
+//
+// This achieves a few desirable things:
+// - Memory allocation of the underlying AVPacket happens only once, when
+//   autoAVPacket is created.
+// - av_packet_free() is called when autoAVPacket goes out of scope.
+// - av_packet_unref() is automatically called when needed, i.e. at the end of
+//   each loop iteration (or when hitting break / continue). This removes the
+//   risk of us forgetting to call it.
+class AutoAVPacket {
+  friend class ReferenceAVPacket;
+
+ private:
+  AVPacket* avPacket_;
+
+ public:
+  AutoAVPacket();
+  ~AutoAVPacket();
+};
+
+class ReferenceAVPacket {
+ private:
+  AVPacket* avPacket_;
+
+ public:
+  ReferenceAVPacket(AutoAVPacket& shared);
+  ~ReferenceAVPacket();
+  AVPacket* get();
+  AVPacket* operator->();
+};
+
 // av_find_best_stream is not const-correct before commit:
 // https://github.com/FFmpeg/FFmpeg/commit/46dac8cf3d250184ab4247809bc03f60e14f4c0c
 // which was released in FFMPEG version=5.0.3