Skip to content

Commit 59b0d15

Browse files
committed
remove next() support
1 parent ce12f03 commit 59b0d15

File tree

4 files changed

+4
-81
lines changed

4 files changed

+4
-81
lines changed

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,6 @@ void VideoDecoder::addAudioStream(int streamIndex) {
550550
auto& streamInfo = streamInfos_[activeStreamIndex_];
551551
auto& streamMetadata =
552552
containerMetadata_.allStreamMetadata[activeStreamIndex_];
553-
554553
streamMetadata.sampleRate =
555554
static_cast<int64_t>(streamInfo.codecContext->sample_rate);
556555
streamMetadata.numChannels = getNumChannels(streamInfo.codecContext);
@@ -562,12 +561,13 @@ void VideoDecoder::addAudioStream(int streamIndex) {
562561

563562
VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
564563
auto output = getNextFrameInternal();
565-
output.data = maybePermuteOutputTensor(output.data);
564+
output.data = maybePermuteHWC2CHW(output.data);
566565
return output;
567566
}
568567

569568
VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
570569
std::optional<torch::Tensor> preAllocatedOutputTensor) {
570+
validateActiveStream(AVMEDIA_TYPE_VIDEO);
571571
AVFrameStream avFrameStream = decodeAVFrame([this](AVFrame* avFrame) {
572572
StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
573573
return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
@@ -576,7 +576,6 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
576576
}
577577

578578
VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex(int64_t frameIndex) {
579-
validateActiveStream(AVMEDIA_TYPE_VIDEO);
580579
auto frameOutput = getFrameAtIndexInternal(frameIndex);
581580
frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
582581
return frameOutput;
@@ -585,7 +584,7 @@ VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex(int64_t frameIndex) {
585584
VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndexInternal(
586585
int64_t frameIndex,
587586
std::optional<torch::Tensor> preAllocatedOutputTensor) {
588-
validateActiveStream();
587+
validateActiveStream(AVMEDIA_TYPE_VIDEO);
589588

590589
const auto& streamInfo = streamInfos_[activeStreamIndex_];
591590
const auto& streamMetadata =
@@ -1389,17 +1388,6 @@ torch::Tensor allocateEmptyHWCTensor(
13891388
}
13901389
}
13911390

1392-
torch::Tensor VideoDecoder::maybePermuteOutputTensor(
1393-
torch::Tensor& outputTensor) {
1394-
if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
1395-
return maybePermuteHWC2CHW(outputTensor);
1396-
} else {
1397-
// No need to do anything for audio. We always return (numChannels,
1398-
// numSamples) or (numFrames, numChannels, numSamples)
1399-
return outputTensor;
1400-
}
1401-
}
1402-
14031391
// Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so.
14041392
// The [N] leading batch-dimension is optional i.e. the input tensor can be 3D
14051393
// or 4D.

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,6 @@ class VideoDecoder {
376376
FrameOutput getNextFrameInternal(
377377
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
378378

379-
torch::Tensor maybePermuteOutputTensor(torch::Tensor& outputTensor);
380379
torch::Tensor maybePermuteHWC2CHW(torch::Tensor& hwcTensor);
381380

382381
FrameOutput convertAVFrameToFrameOutput(

src/torchcodec/decoders/_core/VideoDecoderOps.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,12 +234,6 @@ void add_audio_stream(
234234
}
235235

236236
void seek_to_pts(at::Tensor& decoder, double seconds) {
237-
// TODO-AUDIO we should prevent more than one call to this op for audio
238-
// streams, for the same reasons we do so for getFramesPlayedInRange(). But we
239-
// can't implement the logic here, because we don't know media type (audio vs
240-
// video). We also can't do it within setCursorPtsInSeconds because it's used
241-
// by all other decoding methods. This isn't un-doable, just not easy with
242-
// the API we currently have.
243237
auto videoDecoder = static_cast<VideoDecoder*>(decoder.mutable_data_ptr());
244238
videoDecoder->setCursorPtsInSeconds(seconds);
245239
}

test/decoders/test_ops.py

Lines changed: 1 addition & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,7 @@ class TestAudioOps:
626626
partial(get_frames_in_range, start=4, stop=5),
627627
partial(get_frame_at_pts, seconds=2),
628628
partial(get_frames_by_pts, timestamps=[0, 1.5]),
629+
partial(get_next_frame),
629630
),
630631
)
631632
def test_audio_bad_method(self, method):
@@ -641,28 +642,6 @@ def test_audio_bad_seek_mode(self):
641642
):
642643
add_audio_stream(decoder)
643644

644-
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
645-
def test_audio_decode_all_samples_with_next(self, asset):
646-
decoder = create_from_file(str(asset.path), seek_mode="approximate")
647-
add_audio_stream(decoder)
648-
649-
reference_frames = [
650-
asset.get_frame_data_by_index(i) for i in range(asset.num_frames)
651-
]
652-
653-
reference_frames = torch.cat(reference_frames, dim=-1)
654-
655-
all_frames = []
656-
while True:
657-
try:
658-
frame, *_ = get_next_frame(decoder)
659-
all_frames.append(frame)
660-
except IndexError:
661-
break
662-
all_frames = torch.cat(all_frames, dim=-1)
663-
664-
assert_frames_equal(all_frames, reference_frames)
665-
666645
@pytest.mark.parametrize(
667646
"range",
668647
(
@@ -736,43 +715,6 @@ def test_decode_just_one_frame_at_boundaries(self, asset, expected_shape):
736715
)
737716
assert frames.shape == expected_shape
738717

739-
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
740-
def test_seek_and_next_audio(self, asset):
741-
decoder = create_from_file(str(asset.path), seek_mode="approximate")
742-
add_audio_stream(decoder)
743-
744-
pts = 2
745-
# Need +1 because we're not at frames boundaries
746-
reference_frame = asset.get_frame_data_by_index(
747-
asset.get_frame_index(pts_seconds=pts) + 1
748-
)
749-
seek_to_pts(decoder, pts)
750-
frame, _, _ = get_next_frame(decoder)
751-
assert_frames_equal(frame, reference_frame)
752-
753-
# Seeking forward is OK
754-
pts = 4
755-
reference_frame = asset.get_frame_data_by_index(
756-
asset.get_frame_index(pts_seconds=pts) + 1
757-
)
758-
seek_to_pts(decoder, pts)
759-
frame, _, _ = get_next_frame(decoder)
760-
assert_frames_equal(frame, reference_frame)
761-
762-
# Seeking backwards doesn't error, but it's wrong. See TODO in
763-
# `seek_to_pts` op.
764-
prev_pts = pts
765-
pts = 1
766-
seek_to_pts(decoder, pts)
767-
frame, _, _ = get_next_frame(decoder)
768-
# the decoder actually didn't seek, so the frame we're getting is just
769-
# the "next" one without seeking. This assertion exists to illustrate
770-
# what currently happens, but it's obviously *wrong*.
771-
reference_frame = asset.get_frame_data_by_index(
772-
asset.get_frame_index(pts_seconds=prev_pts) + 2
773-
)
774-
assert_frames_equal(frame, reference_frame)
775-
776718

777719
if __name__ == "__main__":
778720
pytest.main()

0 commit comments

Comments
 (0)