Added sorting logic

NicolasHug · NicolasHug · commit f7a70ba53a3d · 2024-10-22T11:45:36.000+01:00
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -1040,32 +1040,46 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
   const auto& options = stream.options;
   BatchDecodedOutput output(frameIndices.size(), options, streamMetadata);
 
-  auto previousFrameIndex = -1;
+  std::vector<size_t> argsort(frameIndices.size());
+  for (size_t i = 0; i < argsort.size(); ++i) {
+    argsort[i] = i;
+  }
+  if (sortIndices) {
+    std::sort(
+        argsort.begin(), argsort.end(), [&frameIndices](size_t a, size_t b) {
+          return frameIndices[a] < frameIndices[b];
+        });
+  }
+
+  auto previousIndexInVideo = -1;
   for (auto f = 0; f < frameIndices.size(); ++f) {
-    auto frameIndex = frameIndices[f];
-    if (frameIndex < 0 || frameIndex >= stream.allFrames.size()) {
+    auto indexInOutput = argsort[f];
+    auto indexInVideo = frameIndices[argsort[f]];
+    if (indexInVideo < 0 || indexInVideo >= stream.allFrames.size()) {
       throw std::runtime_error(
-          "Invalid frame index=" + std::to_string(frameIndex));
+          "Invalid frame index=" + std::to_string(indexInVideo));
     }
-    if ((f > 0) && (frameIndex == previousFrameIndex)) {
+    if ((f > 0) && (indexInVideo == previousIndexInVideo)) {
       // Avoid decoding the same frame twice
-      output.frames[f].copy_(output.frames[f - 1]);
-      output.ptsSeconds[f] = output.ptsSeconds[f - 1];
-      output.durationSeconds[f] = output.durationSeconds[f - 1];
+      auto previousIndexInOutput = argsort[f - 1];
+      output.frames[indexInOutput].copy_(output.frames[previousIndexInOutput]);
+      output.ptsSeconds[indexInOutput] =
+          output.ptsSeconds[previousIndexInOutput];
+      output.durationSeconds[indexInOutput] =
+          output.durationSeconds[previousIndexInOutput];
     } else {
-      DecodedOutput singleOut =
-          getFrameAtIndex(streamIndex, frameIndex, output.frames[f]);
+      DecodedOutput singleOut = getFrameAtIndex(
+          streamIndex, indexInVideo, output.frames[indexInOutput]);
       if (options.colorConversionLibrary ==
           ColorConversionLibrary::FILTERGRAPH) {
-        output.frames[f] = singleOut.frame;
+        output.frames[indexInOutput] = singleOut.frame;
       }
-      output.ptsSeconds[f] = singleOut.ptsSeconds;
-      output.durationSeconds[f] = singleOut.durationSeconds;
+      output.ptsSeconds[indexInOutput] = singleOut.ptsSeconds;
+      output.durationSeconds[indexInOutput] = singleOut.durationSeconds;
     }
-    previousFrameIndex = frameIndex;
+    previousIndexInVideo = indexInVideo;
   }
   output.frames = MaybePermuteHWC2CHW(options, output.frames);
-
   return output;
 }
 
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -40,7 +40,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "get_frame_at_index(Tensor(a!) decoder, *, int stream_index, int frame_index) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_at_indices(Tensor(a!) decoder, *, int stream_index, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
+      "get_frames_at_indices(Tensor(a!) decoder, *, int stream_index, int[] frame_indices, bool sort_indices=False) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_in_range(Tensor(a!) decoder, *, int stream_index, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
   m.def(
@@ -221,11 +221,13 @@ OpsDecodedOutput get_frame_at_index(
 OpsBatchDecodedOutput get_frames_at_indices(
     at::Tensor& decoder,
     int64_t stream_index,
-    at::IntArrayRef frame_indices) {
+    at::IntArrayRef frame_indices,
+    bool sort_indices) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   std::vector<int64_t> frameIndicesVec(
       frame_indices.begin(), frame_indices.end());
-  auto result = videoDecoder->getFramesAtIndices(stream_index, frameIndicesVec);
+  auto result = videoDecoder->getFramesAtIndices(
+      stream_index, frameIndicesVec, sort_indices);
   return makeOpsBatchDecodedOutput(result);
 }
 
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -90,7 +90,8 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder);
 OpsBatchDecodedOutput get_frames_at_indices(
     at::Tensor& decoder,
     int64_t stream_index,
-    at::IntArrayRef frame_indices);
+    at::IntArrayRef frame_indices,
+    bool sort_indices = false);
 
 // Return the frames inside a range as a single stacked Tensor. The range is
 // defined as [start, stop).
diff --git a/src/torchcodec/decoders/_core/video_decoder_ops.py b/src/torchcodec/decoders/_core/video_decoder_ops.py
@@ -190,6 +190,7 @@ def get_frames_at_indices_abstract(
     *,
     stream_index: int,
     frame_indices: List[int],
+    sort_indices: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -124,6 +124,39 @@ def test_get_frames_at_indices(self):
         assert_tensor_equal(frames0and180[0], reference_frame0)
         assert_tensor_equal(frames0and180[1], reference_frame180)
 
+    @pytest.mark.parametrize("sort_indices", (False, True))
+    def test_get_frames_at_indices_with_sort(self, sort_indices):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        _add_video_stream(decoder)
+        scan_all_streams_to_update_metadata(decoder)
+        stream_index = 3
+
+        frame_indices = [2, 0, 1, 0, 2]
+
+        expected_frames = [
+            get_frame_at_index(
+                decoder, stream_index=stream_index, frame_index=frame_index
+            )[0]
+            for frame_index in frame_indices
+        ]
+
+        frames, *_ = get_frames_at_indices(
+            decoder,
+            stream_index=stream_index,
+            frame_indices=frame_indices,
+            sort_indices=sort_indices,
+        )
+        for frame, expected_frame in zip(frames, expected_frames):
+            assert_tensor_equal(frame, expected_frame)
+
+        # first and last frame should be equal, at index 2. We then modify the
+        # first frame and assert that it's now different from the last frame.
+        # This ensures a copy was properly made during the de-duplication logic.
+        assert_tensor_equal(frames[0], frames[-1])
+        frames[0] += 20
+        with pytest.raises(AssertionError):
+            assert_tensor_equal(frames[0], frames[-1])
+
     def test_get_frames_in_range(self):
         decoder = create_from_file(str(NASA_VIDEO.path))
         scan_all_streams_to_update_metadata(decoder)