Merge branch 'pts_sort_and_dedup' into samplers_hack

NicolasHug · NicolasHug · commit 5e114f28a1e3 · 2024-10-23T12:09:25.000+01:00
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -1030,22 +1030,19 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
 
 VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
     int streamIndex,
-    const std::vector<int64_t>& frameIndices,
-    const bool sortIndices) {
+    const std::vector<int64_t>& frameIndices) {
   validateUserProvidedStreamIndex(streamIndex);
   validateScannedAllStreams("getFramesAtIndices");
 
-  const auto& streamMetadata = containerMetadata_.streams[streamIndex];
-  const auto& stream = streams_[streamIndex];
-  const auto& options = stream.options;
-  BatchDecodedOutput output(frameIndices.size(), options, streamMetadata);
+  auto indicesAreSorted =
+      std::is_sorted(frameIndices.begin(), frameIndices.end());
 
-  // if frameIndices is [13, 10, 12, 11]
-  // when sorted, it's  [10, 11, 12, 13] <-- this is the sorted order we want
-  //                                         to use to decode the frames
-  // and argsort is     [ 1,  3,  2,  0]
   std::vector<size_t> argsort;
-  if (sortIndices) {
+  if (!indicesAreSorted) {
+    // if frameIndices is [13, 10, 12, 11]
+    // when sorted, it's  [10, 11, 12, 13] <-- this is the sorted order we want
+    //                                         to use to decode the frames
+    // and argsort is     [ 1,  3,  2,  0]
     argsort.resize(frameIndices.size());
     for (size_t i = 0; i < argsort.size(); ++i) {
       argsort[i] = i;
@@ -1056,17 +1053,22 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
         });
   }
 
+  const auto& streamMetadata = containerMetadata_.streams[streamIndex];
+  const auto& stream = streams_[streamIndex];
+  const auto& options = stream.options;
+  BatchDecodedOutput output(frameIndices.size(), options, streamMetadata);
+
   auto previousIndexInVideo = -1;
   for (auto f = 0; f < frameIndices.size(); ++f) {
-    auto indexInOutput = sortIndices ? argsort[f] : f;
+    auto indexInOutput = indicesAreSorted ? f : argsort[f];
     auto indexInVideo = frameIndices[indexInOutput];
     if (indexInVideo < 0 || indexInVideo >= stream.allFrames.size()) {
       throw std::runtime_error(
           "Invalid frame index=" + std::to_string(indexInVideo));
     }
     if ((f > 0) && (indexInVideo == previousIndexInVideo)) {
       // Avoid decoding the same frame twice
-      auto previousIndexInOutput = sortIndices ? argsort[f - 1] : f - 1;
+      auto previousIndexInOutput = indicesAreSorted ? f - 1 : argsort[f - 1];
       output.frames[indexInOutput].copy_(output.frames[previousIndexInOutput]);
       output.ptsSeconds[indexInOutput] =
           output.ptsSeconds[previousIndexInOutput];
@@ -1088,12 +1090,11 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
   return output;
 }
 
-VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtPtss(
+VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesDisplayedByTimestamps(
     int streamIndex,
-    const std::vector<double>& framePtss,
-    const bool sortPtss) {
+    const std::vector<double>& framePtss) {
   validateUserProvidedStreamIndex(streamIndex);
-  validateScannedAllStreams("getFramesAtPtss");
+  validateScannedAllStreams("getFramesDisplayedByTimestamps");
 
   // The frame displayed at timestamp t and the one displayed at timestamp `t +
   // eps` are probably the same frame, with the same index. The easiest way to
@@ -1116,7 +1117,7 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtPtss(
 
     auto it = std::lower_bound(
         stream.allFrames.begin(),
-        stream.allFrames.end(),
+        stream.allFrames.end() - 1,
         framePts,
         [&stream](const FrameInfo& info, double start) {
           return ptsToSeconds(info.nextPts, stream.timeBase) <= start;
@@ -1125,7 +1126,7 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtPtss(
     frameIndices[i] = frameIndex;
   }
 
-  return getFramesAtIndices(streamIndex, frameIndices, sortPtss);
+  return getFramesAtIndices(streamIndex, frameIndices);
 }
 
 VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -242,13 +242,11 @@ class VideoDecoder {
   // Tensor.
   BatchDecodedOutput getFramesAtIndices(
       int streamIndex,
-      const std::vector<int64_t>& frameIndices,
-      const bool sortIndices = false);
+      const std::vector<int64_t>& frameIndices);
 
-  BatchDecodedOutput getFramesAtPtss(
+  BatchDecodedOutput getFramesDisplayedByTimestamps(
       int streamIndex,
-      const std::vector<double>& framePtss,
-      const bool sortPtss = false);
+      const std::vector<double>& framePtss);
 
   // Returns frames within a given range for a given stream as a single stacked
   // Tensor. The range is defined by [start, stop). The values retrieved from
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -40,13 +40,13 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "get_frame_at_index(Tensor(a!) decoder, *, int stream_index, int frame_index) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_at_indices(Tensor(a!) decoder, *, int stream_index, int[] frame_indices, bool sort_indices=False) -> (Tensor, Tensor, Tensor)");
-  m.def(
-      "get_frames_at_ptss(Tensor(a!) decoder, *, int stream_index, float[] frame_ptss, bool sort_ptss=False) -> (Tensor, Tensor, Tensor)");
+      "get_frames_at_indices(Tensor(a!) decoder, *, int stream_index, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_in_range(Tensor(a!) decoder, *, int stream_index, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_by_pts_in_range(Tensor(a!) decoder, *, int stream_index, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
+  m.def(
+      "get_frames_by_pts(Tensor(a!) decoder, *, int stream_index, float[] frame_ptss) -> (Tensor, Tensor, Tensor)");
   m.def("get_json_metadata(Tensor(a!) decoder) -> str");
   m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
   m.def(
@@ -211,18 +211,6 @@ OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds) {
   return makeOpsDecodedOutput(result);
 }
 
-OpsBatchDecodedOutput get_frames_at_ptss(
-    at::Tensor& decoder,
-    int64_t stream_index,
-    at::ArrayRef<double> frame_ptss,
-    bool sort_ptss) {
-  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  std::vector<double> framePtssVec(frame_ptss.begin(), frame_ptss.end());
-  auto result =
-      videoDecoder->getFramesAtPtss(stream_index, framePtssVec, sort_ptss);
-  return makeOpsBatchDecodedOutput(result);
-}
-
 OpsDecodedOutput get_frame_at_index(
     at::Tensor& decoder,
     int64_t stream_index,
@@ -235,13 +223,11 @@ OpsDecodedOutput get_frame_at_index(
 OpsBatchDecodedOutput get_frames_at_indices(
     at::Tensor& decoder,
     int64_t stream_index,
-    at::IntArrayRef frame_indices,
-    bool sort_indices) {
+    at::IntArrayRef frame_indices) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   std::vector<int64_t> frameIndicesVec(
       frame_indices.begin(), frame_indices.end());
-  auto result = videoDecoder->getFramesAtIndices(
-      stream_index, frameIndicesVec, sort_indices);
+  auto result = videoDecoder->getFramesAtIndices(stream_index, frameIndicesVec);
   return makeOpsBatchDecodedOutput(result);
 }
 
@@ -256,6 +242,16 @@ OpsBatchDecodedOutput get_frames_in_range(
       stream_index, start, stop, step.value_or(1));
   return makeOpsBatchDecodedOutput(result);
 }
+OpsBatchDecodedOutput get_frames_by_pts(
+    at::Tensor& decoder,
+    int64_t stream_index,
+    at::ArrayRef<double> frame_ptss) {
+  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
+  std::vector<double> framePtssVec(frame_ptss.begin(), frame_ptss.end());
+  auto result =
+      videoDecoder->getFramesDisplayedByTimestamps(stream_index, framePtssVec);
+  return makeOpsBatchDecodedOutput(result);
+}
 
 OpsBatchDecodedOutput get_frames_by_pts_in_range(
     at::Tensor& decoder,
@@ -499,9 +495,9 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("get_frame_at_pts", &get_frame_at_pts);
   m.impl("get_frame_at_index", &get_frame_at_index);
   m.impl("get_frames_at_indices", &get_frames_at_indices);
-  m.impl("get_frames_at_ptss", &get_frames_at_ptss);
   m.impl("get_frames_in_range", &get_frames_in_range);
   m.impl("get_frames_by_pts_in_range", &get_frames_by_pts_in_range);
+  m.impl("get_frames_by_pts", &get_frames_by_pts);
   m.impl("_test_frame_pts_equality", &_test_frame_pts_equality);
   m.impl(
       "scan_all_streams_to_update_metadata",
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -76,11 +76,10 @@ using OpsBatchDecodedOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds);
 
 // Return the frames at given ptss for a given stream
-OpsBatchDecodedOutput get_frames_at_ptss(
+OpsBatchDecodedOutput get_frames_by_pts(
     at::Tensor& decoder,
     int64_t stream_index,
-    at::ArrayRef<double> frame_ptss,
-    bool sort_ptss = false);
+    at::ArrayRef<double> frame_ptss);
 
 // Return the frame that is visible at a given index in the video.
 OpsDecodedOutput get_frame_at_index(
@@ -96,8 +95,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder);
 OpsBatchDecodedOutput get_frames_at_indices(
     at::Tensor& decoder,
     int64_t stream_index,
-    at::IntArrayRef frame_indices,
-    bool sort_indices = false);
+    at::IntArrayRef frame_indices);
 
 // Return the frames inside a range as a single stacked Tensor. The range is
 // defined as [start, stop).
diff --git a/src/torchcodec/decoders/_core/__init__.py b/src/torchcodec/decoders/_core/__init__.py
@@ -22,7 +22,7 @@
     get_frame_at_index,
     get_frame_at_pts,
     get_frames_at_indices,
-    get_frames_at_ptss,
+    get_frames_by_pts,
     get_frames_by_pts_in_range,
     get_frames_in_range,
     get_json_metadata,
diff --git a/src/torchcodec/decoders/_core/video_decoder_ops.py b/src/torchcodec/decoders/_core/video_decoder_ops.py
@@ -71,7 +71,7 @@ def load_torchcodec_extension():
 get_frame_at_pts = torch.ops.torchcodec_ns.get_frame_at_pts.default
 get_frame_at_index = torch.ops.torchcodec_ns.get_frame_at_index.default
 get_frames_at_indices = torch.ops.torchcodec_ns.get_frames_at_indices.default
-get_frames_at_ptss = torch.ops.torchcodec_ns.get_frames_at_ptss.default
+get_frames_by_pts = torch.ops.torchcodec_ns.get_frames_by_pts.default
 get_frames_in_range = torch.ops.torchcodec_ns.get_frames_in_range.default
 get_frames_by_pts_in_range = torch.ops.torchcodec_ns.get_frames_by_pts_in_range.default
 get_json_metadata = torch.ops.torchcodec_ns.get_json_metadata.default
@@ -173,13 +173,12 @@ def get_frame_at_pts_abstract(
     )
 
 
-@register_fake("torchcodec_ns::get_frames_at_ptss")
-def get_frames_at_pts_abstract(
+@register_fake("torchcodec_ns::get_frames_by_pts")
+def get_frames_by_pts_abstract(
     decoder: torch.Tensor,
     *,
     stream_index: int,
     frame_ptss: List[float],
-    sort_ptss: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
@@ -207,7 +206,6 @@ def get_frames_at_indices_abstract(
     *,
     stream_index: int,
     frame_indices: List[int],
-    sort_indices: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -27,7 +27,7 @@
     get_frame_at_index,
     get_frame_at_pts,
     get_frames_at_indices,
-    get_frames_at_ptss,
+    get_frames_by_pts,
     get_frames_by_pts_in_range,
     get_frames_in_range,
     get_json_metadata,
@@ -125,8 +125,7 @@ def test_get_frames_at_indices(self):
         assert_tensor_equal(frames0and180[0], reference_frame0)
         assert_tensor_equal(frames0and180[1], reference_frame180)
 
-    @pytest.mark.parametrize("sort_indices", (False, True))
-    def test_get_frames_at_indices_with_sort(self, sort_indices):
+    def test_get_frames_at_indices_unsorted_indices(self):
         decoder = create_from_file(str(NASA_VIDEO.path))
         _add_video_stream(decoder)
         scan_all_streams_to_update_metadata(decoder)
@@ -145,7 +144,6 @@ def test_get_frames_at_indices_with_sort(self, sort_indices):
             decoder,
             stream_index=stream_index,
             frame_indices=frame_indices,
-            sort_indices=sort_indices,
         )
         for frame, expected_frame in zip(frames, expected_frames):
             assert_tensor_equal(frame, expected_frame)
@@ -158,24 +156,23 @@ def test_get_frames_at_indices_with_sort(self, sort_indices):
         with pytest.raises(AssertionError):
             assert_tensor_equal(frames[0], frames[-1])
 
-    @pytest.mark.parametrize("sort_ptss", (False, True))
-    def test_get_frames_at_ptss_with_sort(self, sort_ptss):
+    def test_get_frames_by_pts(self):
         decoder = create_from_file(str(NASA_VIDEO.path))
         _add_video_stream(decoder)
         scan_all_streams_to_update_metadata(decoder)
         stream_index = 3
 
-        frame_ptss = [2, 0, 1, 0 + 1e-3, 2 + 1e-3]
+        # Note: 13.01 should give the last video frame for the NASA video
+        frame_ptss = [2, 0, 1, 0 + 1e-3, 13.01, 2 + 1e-3]
 
         expected_frames = [
             get_frame_at_pts(decoder, seconds=pts)[0] for pts in frame_ptss
         ]
 
-        frames, *_ = get_frames_at_ptss(
+        frames, *_ = get_frames_by_pts(
             decoder,
             stream_index=stream_index,
             frame_ptss=frame_ptss,
-            sort_ptss=sort_ptss,
         )
         for frame, expected_frame in zip(frames, expected_frames):
             assert_tensor_equal(frame, expected_frame)