Skip to content

Commit 6082803

Browse files
committed
Add support for get_frames_in_range
1 parent 73fa225 commit 6082803

File tree

6 files changed

+122
-27
lines changed

6 files changed

+122
-27
lines changed

src/torchcodec/decoders/_core/FFMPEGCommon.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,16 @@ int getNumChannels(const AVFrame* avFrame) {
6969
#endif
7070
}
7171

72+
int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
73+
// TODO not sure about the bounds of the versions here
74+
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
75+
(IBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
76+
return av_get_channel_layout_nb_channels(avCodecContext->channel_layout);
77+
#else
78+
return avCodecContext->channels;
79+
#endif
80+
}
81+
7282
AVIOBytesContext::AVIOBytesContext(
7383
const void* data,
7484
size_t data_size,

src/torchcodec/decoders/_core/FFMPEGCommon.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ int64_t getDuration(const UniqueAVFrame& frame);
140140
int64_t getDuration(const AVFrame* frame);
141141

142142
int getNumChannels(const AVFrame* avFrame);
143+
int getNumChannels(const UniqueAVCodecContext& avCodecContext);
143144

144145
// Returns true if sws_scale can handle unaligned data.
145146
bool canSwsScaleHandleUnalignedData();

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,14 @@ void VideoDecoder::addVideoStream(
617617

618618
void VideoDecoder::addAudioStream(int streamIndex) {
619619
addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
620+
621+
// TODO address this, this is currently super limiting. The main thing we'll
622+
// need to handle is the pre-allocation of the output tensor in batch APIs. We
623+
// probably won't be able to pre-allocate anything.
624+
auto& streamInfo = streamInfos_[activeStreamIndex_];
625+
TORCH_CHECK(
626+
streamInfo.codecContext->frame_size > 0,
627+
"No support for variable framerate yet.");
620628
}
621629

622630
// --------------------------------------------------------------------------
@@ -736,9 +744,18 @@ VideoDecoder::getFramesInRange(int64_t start, int64_t stop, int64_t step) {
736744
step > 0, "Step must be greater than 0; is " + std::to_string(step));
737745

738746
int64_t numOutputFrames = std::ceil((stop - start) / double(step));
739-
const auto& videoStreamOptions = streamInfo.videoStreamOptions;
740-
FrameBatchOutput frameBatchOutput(
741-
numOutputFrames, videoStreamOptions, streamMetadata);
747+
748+
FrameBatchOutput frameBatchOutput;
749+
if (streamInfo.avMediaType == AVMEDIA_TYPE_VIDEO) {
750+
const auto& videoStreamOptions = streamInfo.videoStreamOptions;
751+
frameBatchOutput =
752+
FrameBatchOutput(numOutputFrames, videoStreamOptions, streamMetadata);
753+
} else {
754+
int64_t numSamples = streamInfo.codecContext->frame_size;
755+
int64_t numChannels = getNumChannels(streamInfo.codecContext);
756+
frameBatchOutput =
757+
FrameBatchOutput(numOutputFrames, numChannels, numSamples);
758+
}
742759

743760
for (int64_t i = start, f = 0; i < stop; i += step, ++f) {
744761
FrameOutput frameOutput =
@@ -1200,8 +1217,8 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
12001217
frameOutput.durationSeconds = ptsToSeconds(
12011218
getDuration(avFrame), formatContext_->streams[streamIndex]->time_base);
12021219
if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
1203-
// TODO: handle preAllocatedTensor for audio
1204-
convertAudioAVFrameToFrameOutputOnCPU(avFrameStream, frameOutput);
1220+
convertAudioAVFrameToFrameOutputOnCPU(
1221+
avFrameStream, frameOutput, preAllocatedOutputTensor);
12051222
} else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
12061223
convertAVFrameToFrameOutputOnCPU(
12071224
avFrameStream, frameOutput, preAllocatedOutputTensor);
@@ -1380,14 +1397,21 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
13801397

13811398
void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13821399
VideoDecoder::AVFrameStream& avFrameStream,
1383-
FrameOutput& frameOutput) {
1400+
FrameOutput& frameOutput,
1401+
std::optional<torch::Tensor> preAllocatedOutputTensor) {
13841402
const AVFrame* avFrame = avFrameStream.avFrame.get();
13851403

13861404
auto numSamples = avFrame->nb_samples; // per channel
13871405
auto numChannels = getNumChannels(avFrame);
13881406

13891407
// TODO: dtype should be format-dependent
1390-
torch::Tensor data = torch::empty({numChannels, numSamples}, torch::kFloat32);
1408+
// TODO rename data to something else
1409+
torch::Tensor data;
1410+
if (preAllocatedOutputTensor.has_value()) {
1411+
data = preAllocatedOutputTensor.value();
1412+
} else {
1413+
data = torch::empty({numChannels, numSamples}, torch::kFloat32);
1414+
}
13911415

13921416
AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
13931417
// TODO Implement all formats
@@ -1431,6 +1455,20 @@ VideoDecoder::FrameBatchOutput::FrameBatchOutput(
14311455
height, width, videoStreamOptions.device, numFrames);
14321456
}
14331457

1458+
// Allocates output for an audio batch: data is shaped
// (numFrames, numChannels, numSamples), while ptsSeconds and durationSeconds
// hold one entry per frame.
VideoDecoder::FrameBatchOutput::FrameBatchOutput(
    int64_t numFrames,
    int64_t numChannels,
    int64_t numSamples)
    // ptsSeconds/durationSeconds are per-frame metadata, 1D of shape
    // (numFrames,) — sizing them by numSamples was a bug.
    : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
  // TODO handle dtypes other than float
  auto tensorOptions = torch::TensorOptions()
                           .dtype(torch::kFloat32)
                           .layout(torch::kStrided)
                           .device(torch::kCPU);
  data = torch::empty({numFrames, numChannels, numSamples}, tensorOptions);
}
1471+
14341472
torch::Tensor allocateEmptyHWCTensor(
14351473
int height,
14361474
int width,
@@ -1459,8 +1497,13 @@ torch::Tensor allocateEmptyHWCTensor(
14591497
// https://pytorch.org/docs/stable/generated/torch.permute.html
14601498
torch::Tensor VideoDecoder::maybePermuteHWC2CHW(torch::Tensor& hwcTensor) {
14611499
if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_AUDIO) {
1462-
// TODO: Is this really how we want to handle audio?
1463-
return hwcTensor;
1500+
// TODO: Do something better
1501+
auto shape = hwcTensor.sizes();
1502+
auto numFrames = shape[0];
1503+
auto numChannels = shape[1];
1504+
auto numSamples = shape[2];
1505+
return hwcTensor.permute({1, 0, 2}).reshape(
1506+
{numChannels, numSamples * numFrames});
14641507
}
14651508
if (streamInfos_[activeStreamIndex_].videoStreamOptions.dimensionOrder ==
14661509
"NHWC") {

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,15 @@ class VideoDecoder {
162162
torch::Tensor ptsSeconds; // 1D of shape (N,)
163163
torch::Tensor durationSeconds; // 1D of shape (N,)
164164

165+
FrameBatchOutput(){};
165166
explicit FrameBatchOutput(
166167
int64_t numFrames,
167168
const VideoStreamOptions& videoStreamOptions,
168169
const StreamMetadata& streamMetadata);
170+
explicit FrameBatchOutput(
171+
int64_t numFrames,
172+
int64_t numChannels,
173+
int64_t numSamples);
169174
};
170175

171176
// Places the cursor at the first frame on or after the position in seconds.
@@ -385,7 +390,8 @@ class VideoDecoder {
385390

386391
void convertAudioAVFrameToFrameOutputOnCPU(
387392
AVFrameStream& avFrameStream,
388-
FrameOutput& frameOutput);
393+
FrameOutput& frameOutput,
394+
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
389395

390396
torch::Tensor convertAVFrameToTensorUsingFilterGraph(const AVFrame* avFrame);
391397

test/decoders/test_video_decoder_ops.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -350,45 +350,51 @@ def test_pts_apis_against_index_ref(self, device):
350350
)
351351
torch.testing.assert_close(pts_seconds, all_pts_seconds_ref, atol=0, rtol=0)
352352

353+
@pytest.mark.parametrize("test_ref", (NASA_VIDEO, NASA_AUDIO))
353354
@pytest.mark.parametrize("device", cpu_and_cuda())
354-
def test_get_frames_in_range(self, device):
355-
decoder = create_from_file(str(NASA_VIDEO.path))
356-
add_video_stream(decoder, device=device)
355+
def test_get_frames_in_range(self, test_ref, device):
356+
if device == "cuda" and test_ref is NASA_AUDIO:
357+
pytest.skip(reason="CUDA decoding not supported for audio")
358+
decoder = create_from_file(str(test_ref.path))
359+
_add_stream(decoder=decoder, test_ref=test_ref, device=device)
357360

358361
# ensure that the degenerate case of a range of size 1 works
359-
ref_frame0 = NASA_VIDEO.get_frame_data_by_range(0, 1)
362+
ref_frame0 = test_ref.get_frame_data_by_range(0, 1)
360363
bulk_frame0, *_ = get_frames_in_range(decoder, start=0, stop=1)
361364
assert_frames_equal(bulk_frame0, ref_frame0.to(device))
362365

363-
ref_frame1 = NASA_VIDEO.get_frame_data_by_range(1, 2)
366+
ref_frame1 = test_ref.get_frame_data_by_range(1, 2)
364367
bulk_frame1, *_ = get_frames_in_range(decoder, start=1, stop=2)
365368
assert_frames_equal(bulk_frame1, ref_frame1.to(device))
366369

367-
ref_frame389 = NASA_VIDEO.get_frame_data_by_range(389, 390)
368-
bulk_frame389, *_ = get_frames_in_range(decoder, start=389, stop=390)
370+
last_index = 389 if test_ref is NASA_VIDEO else 203 # TODO ew
371+
ref_frame389 = test_ref.get_frame_data_by_range(last_index, last_index + 1)
372+
bulk_frame389, *_ = get_frames_in_range(
373+
decoder, start=last_index, stop=last_index + 1
374+
)
369375
assert_frames_equal(bulk_frame389, ref_frame389.to(device))
370376

371377
# contiguous ranges
372-
ref_frames0_9 = NASA_VIDEO.get_frame_data_by_range(0, 9)
378+
ref_frames0_9 = test_ref.get_frame_data_by_range(0, 9)
373379
bulk_frames0_9, *_ = get_frames_in_range(decoder, start=0, stop=9)
374380
assert_frames_equal(bulk_frames0_9, ref_frames0_9.to(device))
375381

376-
ref_frames4_8 = NASA_VIDEO.get_frame_data_by_range(4, 8)
382+
ref_frames4_8 = test_ref.get_frame_data_by_range(4, 8)
377383
bulk_frames4_8, *_ = get_frames_in_range(decoder, start=4, stop=8)
378384
assert_frames_equal(bulk_frames4_8, ref_frames4_8.to(device))
379385

380386
# ranges with a stride
381-
ref_frames15_35 = NASA_VIDEO.get_frame_data_by_range(15, 36, 5)
387+
ref_frames15_35 = test_ref.get_frame_data_by_range(15, 36, 5)
382388
bulk_frames15_35, *_ = get_frames_in_range(decoder, start=15, stop=36, step=5)
383389
assert_frames_equal(bulk_frames15_35, ref_frames15_35.to(device))
384390

385-
ref_frames0_9_2 = NASA_VIDEO.get_frame_data_by_range(0, 9, 2)
391+
ref_frames0_9_2 = test_ref.get_frame_data_by_range(0, 9, 2)
386392
bulk_frames0_9_2, *_ = get_frames_in_range(decoder, start=0, stop=9, step=2)
387393
assert_frames_equal(bulk_frames0_9_2, ref_frames0_9_2.to(device))
388394

389395
# an empty range is valid!
390396
empty_frame, *_ = get_frames_in_range(decoder, start=5, stop=5)
391-
assert_frames_equal(empty_frame, NASA_VIDEO.empty_chw_tensor.to(device))
397+
assert_frames_equal(empty_frame, test_ref.empty_chw_tensor.to(device))
392398

393399
@pytest.mark.parametrize(
394400
"test_ref, last_frame_index", ((NASA_VIDEO, 289), (NASA_AUDIO, 203))

test/utils.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,7 @@ def get_frame_data_by_range(
119119
*,
120120
stream_index: Optional[int] = None,
121121
) -> torch.Tensor:
122-
tensors = [
123-
self.get_frame_data_by_index(i, stream_index=stream_index)
124-
for i in range(start, stop, step)
125-
]
126-
return torch.stack(tensors)
122+
raise NotImplementedError("Override in child classes")
127123

128124
def get_pts_seconds_by_range(
129125
self,
@@ -197,6 +193,20 @@ def get_frame_data_by_index(
197193
)
198194
return torch.load(file_path, weights_only=True).permute(2, 0, 1)
199195

196+
def get_frame_data_by_range(
    self,
    start: int,
    stop: int,
    step: int = 1,
    *,
    stream_index: Optional[int] = None,
) -> torch.Tensor:
    """Return frames [start, stop) at the given step, stacked along a new
    leading (frame) dimension."""
    frames = tuple(
        self.get_frame_data_by_index(index, stream_index=stream_index)
        for index in range(start, stop, step)
    )
    return torch.stack(frames)
209+
200210
@property
201211
def width(self) -> int:
202212
return self.stream_infos[self.default_stream_index].width
@@ -337,6 +347,25 @@ def get_frame_data_by_index(
337347

338348
return self._reference_frames[idx]
339349

350+
def get_frame_data_by_range(
    self,
    start: int,
    stop: int,
    step: int = 1,
    *,
    stream_index: Optional[int] = None,
) -> torch.Tensor:
    """Return audio frames [start, stop) at the given step, concatenated
    along the samples dimension (dim=1).

    NOTE(review): ``torch.cat`` raises on an empty list, so an empty range
    fails here — confirm callers never request one.
    """
    frames = [
        self.get_frame_data_by_index(index, stream_index=stream_index)
        for index in range(start, stop, step)
    ]
    return torch.cat(frames, dim=1)
363+
364+
# TODO: this shouldn't be named chw
@property
def empty_chw_tensor(self) -> torch.Tensor:
    """A (2, 0) float32 tensor: the expected decode result for an empty
    audio frame range (presumably 2 channels, zero samples -- confirm)."""
    return torch.empty((2, 0), dtype=torch.float32)
368+
340369

341370
NASA_AUDIO = TestAudio(
342371
filename="nasa_13013.mp4",

0 commit comments

Comments
 (0)