2025-02-12 nightly release (590fe1c)

pytorchbot · pytorchbot · commit a4c268bcf365 · 2025-02-12T11:35:26.000Z
diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py
@@ -242,7 +242,6 @@ def _get_clips_for_index_based_sampling(
             ]
             frames, *_ = get_frames_at_indices(
                 video_decoder,
-                stream_index=metadata_json["bestVideoStreamIndex"],
                 frame_indices=batch_indexes,
             )
             clips.append(frames)
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -418,7 +418,7 @@ VideoDecoder::VideoStreamOptions::VideoStreamOptions(
   }
 }
 
-void VideoDecoder::addVideoStreamDecoder(
+void VideoDecoder::addVideoStream(
     int streamIndex,
     const VideoStreamOptions& videoStreamOptions) {
   TORCH_CHECK(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -136,7 +136,7 @@ class VideoDecoder {
 
   struct AudioStreamOptions {};
 
-  void addVideoStreamDecoder(
+  void addVideoStream(
       int streamIndex,
       const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
   void addAudioStreamDecoder(
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -39,24 +39,23 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "get_frame_at_pts(Tensor(a!) decoder, float seconds) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frame_at_index(Tensor(a!) decoder, *, int stream_index, int frame_index) -> (Tensor, Tensor, Tensor)");
+      "get_frame_at_index(Tensor(a!) decoder, *, int frame_index) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_at_indices(Tensor(a!) decoder, *, int stream_index, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
+      "get_frames_at_indices(Tensor(a!) decoder, *, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_in_range(Tensor(a!) decoder, *, int stream_index, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
+      "get_frames_in_range(Tensor(a!) decoder, *, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_by_pts_in_range(Tensor(a!) decoder, *, int stream_index, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
+      "get_frames_by_pts_in_range(Tensor(a!) decoder, *, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_by_pts(Tensor(a!) decoder, *, int stream_index, float[] timestamps) -> (Tensor, Tensor, Tensor)");
-  m.def(
-      "_get_key_frame_indices(Tensor(a!) decoder, int stream_index) -> Tensor");
+      "get_frames_by_pts(Tensor(a!) decoder, *, float[] timestamps) -> (Tensor, Tensor, Tensor)");
+  m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
   m.def("get_json_metadata(Tensor(a!) decoder) -> str");
   m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
   m.def(
       "get_stream_json_metadata(Tensor(a!) decoder, int stream_index) -> str");
   m.def("_get_json_ffmpeg_library_versions() -> str");
   m.def(
-      "_test_frame_pts_equality(Tensor(a!) decoder, *, int stream_index, int frame_index, float pts_seconds_to_test) -> bool");
+      "_test_frame_pts_equality(Tensor(a!) decoder, *, int frame_index, float pts_seconds_to_test) -> bool");
   m.def("scan_all_streams_to_update_metadata(Tensor(a!) decoder) -> ()");
 }
 
@@ -220,8 +219,7 @@ void _add_video_stream(
   }
 
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  videoDecoder->addVideoStreamDecoder(
-      stream_index.value_or(-1), videoStreamOptions);
+  videoDecoder->addVideoStream(stream_index.value_or(-1), videoStreamOptions);
 }
 
 void seek_to_pts(at::Tensor& decoder, double seconds) {
@@ -237,11 +235,6 @@ OpsFrameOutput get_next_frame(at::Tensor& decoder) {
   } catch (const VideoDecoder::EndOfFileException& e) {
     C10_THROW_ERROR(IndexError, e.what());
   }
-  if (result.data.sizes().size() != 3) {
-    throw std::runtime_error(
-        "image_size is unexpected. Expected 3, got: " +
-        std::to_string(result.data.sizes().size()));
-  }
   return makeOpsFrameOutput(result);
 }
 
@@ -251,18 +244,14 @@ OpsFrameOutput get_frame_at_pts(at::Tensor& decoder, double seconds) {
   return makeOpsFrameOutput(result);
 }
 
-OpsFrameOutput get_frame_at_index(
-    at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index,
-    int64_t frame_index) {
+OpsFrameOutput get_frame_at_index(at::Tensor& decoder, int64_t frame_index) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   auto result = videoDecoder->getFrameAtIndex(frame_index);
   return makeOpsFrameOutput(result);
 }
 
 OpsFrameBatchOutput get_frames_at_indices(
     at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index,
     at::IntArrayRef frame_indices) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   std::vector<int64_t> frameIndicesVec(
@@ -273,7 +262,6 @@ OpsFrameBatchOutput get_frames_at_indices(
 
 OpsFrameBatchOutput get_frames_in_range(
     at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index,
     int64_t start,
     int64_t stop,
     std::optional<int64_t> step) {
@@ -284,7 +272,6 @@ OpsFrameBatchOutput get_frames_in_range(
 
 OpsFrameBatchOutput get_frames_by_pts(
     at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index,
     at::ArrayRef<double> timestamps) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   std::vector<double> timestampsVec(timestamps.begin(), timestamps.end());
@@ -294,7 +281,6 @@ OpsFrameBatchOutput get_frames_by_pts(
 
 OpsFrameBatchOutput get_frames_by_pts_in_range(
     at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index,
     double start_seconds,
     double stop_seconds) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
@@ -327,17 +313,14 @@ std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
 
 bool _test_frame_pts_equality(
     at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index,
     int64_t frame_index,
     double pts_seconds_to_test) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   return pts_seconds_to_test ==
       videoDecoder->getPtsSecondsForFrame(frame_index);
 }
 
-torch::Tensor _get_key_frame_indices(
-    at::Tensor& decoder,
-    [[maybe_unused]] int64_t stream_index) {
+torch::Tensor _get_key_frame_indices(at::Tensor& decoder) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   return videoDecoder->getKeyFrameIndices();
 }
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -85,14 +85,10 @@ OpsFrameOutput get_frame_at_pts(at::Tensor& decoder, double seconds);
 // Return the frames at given ptss for a given stream
 OpsFrameBatchOutput get_frames_by_pts(
     at::Tensor& decoder,
-    int64_t stream_index,
     at::ArrayRef<double> timestamps);
 
 // Return the frame that is visible at a given index in the video.
-OpsFrameOutput get_frame_at_index(
-    at::Tensor& decoder,
-    int64_t stream_index,
-    int64_t frame_index);
+OpsFrameOutput get_frame_at_index(at::Tensor& decoder, int64_t frame_index);
 
 // Get the next frame from the video as a tuple that has the frame data, pts and
 // duration as tensors.
@@ -101,14 +97,12 @@ OpsFrameOutput get_next_frame(at::Tensor& decoder);
 // Return the frames at given indices for a given stream
 OpsFrameBatchOutput get_frames_at_indices(
     at::Tensor& decoder,
-    int64_t stream_index,
     at::IntArrayRef frame_indices);
 
 // Return the frames inside a range as a single stacked Tensor. The range is
 // defined as [start, stop).
 OpsFrameBatchOutput get_frames_in_range(
     at::Tensor& decoder,
-    int64_t stream_index,
     int64_t start,
     int64_t stop,
     std::optional<int64_t> step = std::nullopt);
@@ -118,7 +112,6 @@ OpsFrameBatchOutput get_frames_in_range(
 // order.
 OpsFrameBatchOutput get_frames_by_pts_in_range(
     at::Tensor& decoder,
-    int64_t stream_index,
     double start_seconds,
     double stop_seconds);
 
@@ -128,16 +121,15 @@ OpsFrameBatchOutput get_frames_by_pts_in_range(
 // We want to make sure that the value is preserved exactly, bit-for-bit, during
 // this process.
 //
-// Returns true if for the given decoder, in the stream stream_index, the pts
+// Returns true if for the given decoder, the pts
 // value when converted to seconds as a double is exactly pts_seconds_to_test.
 // Returns false otherwise.
 bool _test_frame_pts_equality(
     at::Tensor& decoder,
-    int64_t stream_index,
     int64_t frame_index,
     double pts_seconds_to_test);
 
-torch::Tensor _get_key_frame_indices(at::Tensor& decoder, int64_t stream_index);
+torch::Tensor _get_key_frame_indices(at::Tensor& decoder);
 
 // Get the metadata from the video as a string.
 std::string get_json_metadata(at::Tensor& decoder);
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -152,9 +152,7 @@ def _getitem_int(self, key: int) -> Tensor:
                 f"Index {key} is out of bounds; length is {self._num_frames}"
             )
 
-        frame_data, *_ = core.get_frame_at_index(
-            self._decoder, frame_index=key, stream_index=self.stream_index
-        )
+        frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key)
         return frame_data
 
     def _getitem_slice(self, key: slice) -> Tensor:
@@ -163,7 +161,6 @@ def _getitem_slice(self, key: slice) -> Tensor:
         start, stop, step = key.indices(len(self))
         frame_data, *_ = core.get_frames_in_range(
             self._decoder,
-            stream_index=self.stream_index,
             start=start,
             stop=stop,
             step=step,
@@ -189,9 +186,7 @@ def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor:
         )
 
     def _get_key_frame_indices(self) -> list[int]:
-        return core._get_key_frame_indices(
-            self._decoder, stream_index=self.stream_index
-        )
+        return core._get_key_frame_indices(self._decoder)
 
     def get_frame_at(self, index: int) -> Frame:
         """Return a single frame at the given index.
@@ -208,7 +203,7 @@ def get_frame_at(self, index: int) -> Frame:
                 f"Index {index} is out of bounds; must be in the range [0, {self._num_frames})."
             )
         data, pts_seconds, duration_seconds = core.get_frame_at_index(
-            self._decoder, frame_index=index, stream_index=self.stream_index
+            self._decoder, frame_index=index
         )
         return Frame(
             data=data,
@@ -234,7 +229,7 @@ def get_frames_at(self, indices: list[int]) -> FrameBatch:
         """
 
         data, pts_seconds, duration_seconds = core.get_frames_at_indices(
-            self._decoder, stream_index=self.stream_index, frame_indices=indices
+            self._decoder, frame_indices=indices
         )
         return FrameBatch(
             data=data,
@@ -268,7 +263,6 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc
             raise IndexError(f"Step ({step}) must be greater than 0.")
         frames = core.get_frames_in_range(
             self._decoder,
-            stream_index=self.stream_index,
             start=start,
             stop=stop,
             step=step,
@@ -316,7 +310,7 @@ def get_frames_played_at(self, seconds: list[float]) -> FrameBatch:
             FrameBatch: The frames that are played at ``seconds``.
         """
         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
-            self._decoder, timestamps=seconds, stream_index=self.stream_index
+            self._decoder, timestamps=seconds
         )
         return FrameBatch(
             data=data,
@@ -359,7 +353,6 @@ def get_frames_played_in_range(
             )
         frames = core.get_frames_by_pts_in_range(
             self._decoder,
-            stream_index=self.stream_index,
             start_seconds=start_seconds,
             stop_seconds=stop_seconds,
         )
diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp
@@ -148,7 +148,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) {
   VideoDecoder::VideoStreamOptions videoStreamOptions;
   videoStreamOptions.width = 100;
   videoStreamOptions.height = 120;
-  decoder->addVideoStreamDecoder(-1, videoStreamOptions);
+  decoder->addVideoStream(-1, videoStreamOptions);
   torch::Tensor tensor = decoder->getNextFrame().data;
   EXPECT_EQ(tensor.sizes(), std::vector<long>({3, 120, 100}));
 }
@@ -158,7 +158,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) {
   std::unique_ptr<VideoDecoder> decoder = std::make_unique<VideoDecoder>(path);
   VideoDecoder::VideoStreamOptions videoStreamOptions;
   videoStreamOptions.dimensionOrder = "NHWC";
-  decoder->addVideoStreamDecoder(-1, videoStreamOptions);
+  decoder->addVideoStream(-1, videoStreamOptions);
   torch::Tensor tensor = decoder->getNextFrame().data;
   EXPECT_EQ(tensor.sizes(), std::vector<long>({270, 480, 3}));
 }
@@ -167,7 +167,7 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   auto output = ourDecoder->getNextFrame();
   torch::Tensor tensor0FromOurDecoder = output.data;
   EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
@@ -206,7 +206,7 @@ TEST_P(VideoDecoderTest, DecodesFramesInABatchInNCHW) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(bestVideoStreamIndex);
+  ourDecoder->addVideoStream(bestVideoStreamIndex);
   // Frame with index 180 corresponds to timestamp 6.006.
   auto output = ourDecoder->getFramesAtIndices({0, 180});
   auto tensor = output.data;
@@ -228,7 +228,7 @@ TEST_P(VideoDecoderTest, DecodesFramesInABatchInNHWC) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(
+  ourDecoder->addVideoStream(
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamOptions("dimension_order=NHWC"));
   // Frame with index 180 corresponds to timestamp 6.006.
@@ -250,7 +250,7 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   ourDecoder->setCursorPtsInSeconds(388388. / 30'000);
   auto output = ourDecoder->getNextFrame();
   EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000);
@@ -263,7 +263,7 @@ TEST_P(VideoDecoderTest, GetsFramePlayedAtTimestamp) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   auto output = ourDecoder->getFramePlayedAt(6.006);
   EXPECT_EQ(output.ptsSeconds, 6.006);
   // The frame's duration is 0.033367 according to ffprobe,
@@ -293,7 +293,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   ourDecoder->setCursorPtsInSeconds(6.0);
   auto output = ourDecoder->getNextFrame();
   torch::Tensor tensor6FromOurDecoder = output.data;
@@ -393,7 +393,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorFilterGraph) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(
+  ourDecoder->addVideoStream(
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamOptions("color_conversion_library=filtergraph"));
   auto output =
@@ -410,7 +410,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorSwscale) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(
+  ourDecoder->addVideoStream(
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamOptions("color_conversion_library=swscale"));
   auto output =
diff --git a/test/decoders/manual_smoke_test.py b/test/decoders/manual_smoke_test.py
@@ -16,7 +16,5 @@
 )
 torchcodec.decoders._core.scan_all_streams_to_update_metadata(decoder)
 torchcodec.decoders._core.add_video_stream(decoder, stream_index=3)
-frame, _, _ = torchcodec.decoders._core.get_frame_at_index(
-    decoder, stream_index=3, frame_index=180
-)
+frame, _, _ = torchcodec.decoders._core.get_frame_at_index(decoder, frame_index=180)
 write_png(frame, "frame180.png")
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py

Original file line number	Diff line number	Diff line change
`@@ -242,7 +242,6 @@ def _get_clips_for_index_based_sampling(`
`242`	`242`	`]`
`243`	`243`	`frames, *_ = get_frames_at_indices(`
`244`	`244`	`video_decoder,`
`245`		`- stream_index=metadata_json["bestVideoStreamIndex"],`
`246`	`245`	`frame_indices=batch_indexes,`
`247`	`246`	`)`
`248`	`247`	`clips.append(frames)`
Original file line number	Diff line number	Diff line change
`@@ -418,7 +418,7 @@ VideoDecoder::VideoStreamOptions::VideoStreamOptions(`
`418`	`418`	`}`
`419`	`419`	`}`
`420`	`420`
`421`		`-void VideoDecoder::addVideoStreamDecoder(`
	`421`	`+void VideoDecoder::addVideoStream(`
`422`	`422`	`int streamIndex,`
`423`	`423`	`const VideoStreamOptions& videoStreamOptions) {`
`424`	`424`	`TORCH_CHECK(`
Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,5 @@`
`16`	`16`	`)`
`17`	`17`	`torchcodec.decoders._core.scan_all_streams_to_update_metadata(decoder)`
`18`	`18`	`torchcodec.decoders._core.add_video_stream(decoder, stream_index=3)`
`19`		`-frame, _, _ = torchcodec.decoders._core.get_frame_at_index(`
`20`		`- decoder, stream_index=3, frame_index=180`
`21`		`-)`
	`19`	`+frame, _, _ = torchcodec.decoders._core.get_frame_at_index(decoder, frame_index=180)`
`22`	`20`	`write_png(frame, "frame180.png")`