Create AddAudioStream

NicolasHug · NicolasHug · commit ace0bd4bcd66 · 2025-02-10T17:38:52.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -418,8 +418,9 @@ VideoDecoder::VideoStreamOptions::VideoStreamOptions(
   }
 }
 
-void VideoDecoder::addVideoStreamDecoder(
+void VideoDecoder::addStream(
     int streamIndex,
+    AVMediaType mediaType,
     const VideoStreamOptions& videoStreamOptions) {
   TORCH_CHECK(
       activeStreamIndex_ == NO_ACTIVE_STREAM,
@@ -429,30 +430,37 @@ void VideoDecoder::addVideoStreamDecoder(
   AVCodecOnlyUseForCallingAVFindBestStream avCodec = nullptr;
 
   activeStreamIndex_ = av_find_best_stream(
-      formatContext_.get(), AVMEDIA_TYPE_VIDEO, streamIndex, -1, &avCodec, 0);
+      formatContext_.get(), mediaType, streamIndex, -1, &avCodec, 0);
+
   if (activeStreamIndex_ < 0) {
-    throw std::invalid_argument("No valid stream found in input file.");
+    throw std::invalid_argument(
+        "No valid stream found in input file. Is " +
+        std::to_string(streamIndex) + " of the desired media type?");
   }
+
   TORCH_CHECK(avCodec != nullptr);
 
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
   streamInfo.streamIndex = activeStreamIndex_;
   streamInfo.timeBase = formatContext_->streams[activeStreamIndex_]->time_base;
   streamInfo.stream = formatContext_->streams[activeStreamIndex_];
+  streamInfo.avMediaType = mediaType;
 
-  if (streamInfo.stream->codecpar->codec_type != AVMEDIA_TYPE_VIDEO) {
-    throw std::invalid_argument(
-        "Stream with index " + std::to_string(activeStreamIndex_) +
-        " is not a video stream.");
-  }
+  // This should never happen, checking just to be safe.
+  TORCH_CHECK(
+    streamInfo.stream->codecpar->codec_type == mediaType,
+    "FFmpeg found stream with index ", activeStreamIndex_, " which is of the wrong media type.");
 
-  if (videoStreamOptions.device.type() == torch::kCUDA) {
+
+  if (mediaType == AVMEDIA_TYPE_VIDEO &&
+      videoStreamOptions.device.type() == torch::kCUDA) {
     avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
         findCudaCodec(
             videoStreamOptions.device, streamInfo.stream->codecpar->codec_id)
             .value_or(avCodec));
   }
 
+  // TODO figure out whether this should be VIDEO only
   StreamMetadata& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
   if (seekMode_ == SeekMode::approximate &&
@@ -465,37 +473,34 @@ void VideoDecoder::addVideoStreamDecoder(
 
   AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(codecContext != nullptr);
-  codecContext->thread_count = videoStreamOptions.ffmpegThreadCount.value_or(0);
+  codecContext->thread_count =
+      videoStreamOptions.ffmpegThreadCount.value_or(0); // TODO VIDEO ONLY?
   streamInfo.codecContext.reset(codecContext);
 
   int retVal = avcodec_parameters_to_context(
       streamInfo.codecContext.get(), streamInfo.stream->codecpar);
   TORCH_CHECK_EQ(retVal, AVSUCCESS);
 
-  if (videoStreamOptions.device.type() == torch::kCPU) {
-    // No more initialization needed for CPU.
-  } else if (videoStreamOptions.device.type() == torch::kCUDA) {
-    initializeContextOnCuda(videoStreamOptions.device, codecContext);
-  } else {
-    TORCH_CHECK(
-        false, "Invalid device type: " + videoStreamOptions.device.str());
+  if (mediaType == AVMEDIA_TYPE_VIDEO) {
+    if (videoStreamOptions.device.type() == torch::kCPU) {
+      // No more initialization needed for CPU.
+    } else if (videoStreamOptions.device.type() == torch::kCUDA) {
+      initializeContextOnCuda(videoStreamOptions.device, codecContext);
+    } else {
+      TORCH_CHECK(
+          false, "Invalid device type: " + videoStreamOptions.device.str());
+    }
+    streamInfo.videoStreamOptions = videoStreamOptions;
   }
-  streamInfo.videoStreamOptions = videoStreamOptions;
 
   retVal = avcodec_open2(streamInfo.codecContext.get(), avCodec, nullptr);
   if (retVal < AVSUCCESS) {
     throw std::invalid_argument(getFFMPEGErrorStringFromErrorCode(retVal));
   }
 
   codecContext->time_base = streamInfo.stream->time_base;
-
-  containerMetadata_.allStreamMetadata[activeStreamIndex_].width =
-      codecContext->width;
-  containerMetadata_.allStreamMetadata[activeStreamIndex_].height =
-      codecContext->height;
-  auto codedId = codecContext->codec_id;
   containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName =
-      std::string(avcodec_get_name(codedId));
+      std::string(avcodec_get_name(codecContext->codec_id));
 
   // We will only need packets from the active stream, so we tell FFmpeg to
   // discard packets from the other streams. Note that av_read_frame() may still
@@ -506,6 +511,18 @@ void VideoDecoder::addVideoStreamDecoder(
       formatContext_->streams[i]->discard = AVDISCARD_ALL;
     }
   }
+}
+
+void VideoDecoder::addVideoStream(
+    int streamIndex,
+    const VideoStreamOptions& videoStreamOptions) {
+  addStream(streamIndex, AVMEDIA_TYPE_VIDEO, videoStreamOptions);
+
+  auto& streamInfo = streamInfos_[activeStreamIndex_];
+  containerMetadata_.allStreamMetadata[activeStreamIndex_].width =
+      streamInfo.codecContext->width;
+  containerMetadata_.allStreamMetadata[activeStreamIndex_].height =
+      streamInfo.codecContext->height;
 
   // By default, we want to use swscale for color conversion because it is
   // faster. However, it has width requirements, so we may need to fall back
@@ -514,7 +531,7 @@ void VideoDecoder::addVideoStreamDecoder(
   // swscale's width requirements to be violated. We don't expose the ability to
   // choose color conversion library publicly; we only use this ability
   // internally.
-  int width = videoStreamOptions.width.value_or(codecContext->width);
+  int width = videoStreamOptions.width.value_or(streamInfo.codecContext->width);
 
   // swscale requires widths to be multiples of 32:
   // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
@@ -527,6 +544,10 @@ void VideoDecoder::addVideoStreamDecoder(
       videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
 }
 
+void VideoDecoder::addAudioStream(int streamIndex) {
+  addStream(streamIndex, AVMEDIA_TYPE_AUDIO); // uses default stream options
+}
+
 // --------------------------------------------------------------------------
 // HIGH-LEVEL DECODING ENTRY-POINTS
 // --------------------------------------------------------------------------
@@ -1051,7 +1072,6 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
   AVFrame* avFrame = avFrameStream.avFrame.get();
   frameOutput.streamIndex = streamIndex;
   auto& streamInfo = streamInfos_[streamIndex];
-  TORCH_CHECK(streamInfo.stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO);
   frameOutput.ptsSeconds = ptsToSeconds(
       avFrame->pts, formatContext_->streams[streamIndex]->time_base);
   frameOutput.durationSeconds = ptsToSeconds(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -136,12 +136,10 @@ class VideoDecoder {
 
   struct AudioStreamOptions {};
 
-  void addVideoStreamDecoder(
+  void addVideoStream(
       int streamIndex,
       const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
-  void addAudioStreamDecoder(
-      int streamIndex,
-      const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
+  void addAudioStream(int streamIndex);
 
   // --------------------------------------------------------------------------
   // DECODING AND SEEKING APIs
@@ -322,6 +320,8 @@ class VideoDecoder {
   struct StreamInfo {
     int streamIndex = -1;
     AVStream* stream = nullptr;
+    AVMediaType avMediaType = AVMEDIA_TYPE_UNKNOWN;
+
     AVRational timeBase = {};
     UniqueAVCodecContext codecContext;
 
@@ -433,6 +433,11 @@ class VideoDecoder {
   // STREAM AND METADATA APIS
   // --------------------------------------------------------------------------
 
+  void addStream(
+      int streamIndex,
+      AVMediaType mediaType,
+      const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
+
   // Returns the "best" stream index for a given media type. The "best" is
   // determined by various heuristics in FFMPEG.
   // See
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -34,6 +34,8 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, str? color_conversion_library=None) -> ()");
   m.def(
       "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()");
+  m.def(
+      "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
   m.def("get_next_frame(Tensor(a!) decoder) -> (Tensor, Tensor, Tensor)");
   m.def(
@@ -220,8 +222,14 @@ void _add_video_stream(
   }
 
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  videoDecoder->addVideoStreamDecoder(
-      stream_index.value_or(-1), videoStreamOptions);
+  videoDecoder->addVideoStream(stream_index.value_or(-1), videoStreamOptions);
+}
+
+void add_audio_stream(
+    at::Tensor& decoder,
+    std::optional<int64_t> stream_index) {
+  // A missing stream_index is forwarded as -1 (FFmpeg auto-selects the stream).
+  unwrapTensorToGetDecoder(decoder)->addAudioStream(stream_index.value_or(-1));
 }
 
 void seek_to_pts(at::Tensor& decoder, double seconds) {
@@ -533,6 +541,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("seek_to_pts", &seek_to_pts);
   m.impl("add_video_stream", &add_video_stream);
   m.impl("_add_video_stream", &_add_video_stream);
+  m.impl("add_audio_stream", &add_audio_stream);
   m.impl("get_next_frame", &get_next_frame);
   m.impl("_get_key_frame_indices", &_get_key_frame_indices);
   m.impl("get_json_metadata", &get_json_metadata);
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -55,6 +55,10 @@ void _add_video_stream(
     std::optional<std::string_view> device = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt);
 
+void add_audio_stream(
+    at::Tensor& decoder,
+    std::optional<int64_t> stream_index = std::nullopt);
+
 // Seek to a particular presentation timestamp in the video in seconds.
 void seek_to_pts(at::Tensor& decoder, double seconds);
 
diff --git a/src/torchcodec/decoders/_core/__init__.py b/src/torchcodec/decoders/_core/__init__.py
@@ -15,6 +15,7 @@
     _add_video_stream,
     _get_key_frame_indices,
     _test_frame_pts_equality,
+    add_audio_stream,
     add_video_stream,
     create_from_bytes,
     create_from_file,
diff --git a/src/torchcodec/decoders/_core/video_decoder_ops.py b/src/torchcodec/decoders/_core/video_decoder_ops.py
@@ -69,6 +69,7 @@ def load_torchcodec_extension():
 )
 add_video_stream = torch.ops.torchcodec_ns.add_video_stream.default
 _add_video_stream = torch.ops.torchcodec_ns._add_video_stream.default
+add_audio_stream = torch.ops.torchcodec_ns.add_audio_stream.default
 seek_to_pts = torch.ops.torchcodec_ns.seek_to_pts.default
 get_next_frame = torch.ops.torchcodec_ns.get_next_frame.default
 get_frame_at_pts = torch.ops.torchcodec_ns.get_frame_at_pts.default
@@ -150,6 +151,15 @@ def add_video_stream_abstract(
     return
 
 
+@register_fake("torchcodec_ns::add_audio_stream")
+def add_audio_stream_abstract(
+    decoder: torch.Tensor,
+    *,
+    stream_index: Optional[int] = None,
+) -> None:
+    return  # the op's schema returns (), so the fake has nothing to produce
+
+
 @register_fake("torchcodec_ns::seek_to_pts")
 def seek_abstract(decoder: torch.Tensor, seconds: float) -> None:
     return
diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp
@@ -148,7 +148,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) {
   VideoDecoder::VideoStreamOptions videoStreamOptions;
   videoStreamOptions.width = 100;
   videoStreamOptions.height = 120;
-  decoder->addVideoStreamDecoder(-1, videoStreamOptions);
+  decoder->addVideoStream(-1, videoStreamOptions);
   torch::Tensor tensor = decoder->getNextFrame().data;
   EXPECT_EQ(tensor.sizes(), std::vector<long>({3, 120, 100}));
 }
@@ -158,7 +158,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) {
   std::unique_ptr<VideoDecoder> decoder = std::make_unique<VideoDecoder>(path);
   VideoDecoder::VideoStreamOptions videoStreamOptions;
   videoStreamOptions.dimensionOrder = "NHWC";
-  decoder->addVideoStreamDecoder(-1, videoStreamOptions);
+  decoder->addVideoStream(-1, videoStreamOptions);
   torch::Tensor tensor = decoder->getNextFrame().data;
   EXPECT_EQ(tensor.sizes(), std::vector<long>({270, 480, 3}));
 }
@@ -167,7 +167,7 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   auto output = ourDecoder->getNextFrame();
   torch::Tensor tensor0FromOurDecoder = output.data;
   EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
@@ -206,7 +206,7 @@ TEST_P(VideoDecoderTest, DecodesFramesInABatchInNCHW) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(bestVideoStreamIndex);
+  ourDecoder->addVideoStream(bestVideoStreamIndex);
   // Frame with index 180 corresponds to timestamp 6.006.
   auto output = ourDecoder->getFramesAtIndices({0, 180});
   auto tensor = output.data;
@@ -228,7 +228,7 @@ TEST_P(VideoDecoderTest, DecodesFramesInABatchInNHWC) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(
+  ourDecoder->addVideoStream(
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamOptions("dimension_order=NHWC"));
   // Frame with index 180 corresponds to timestamp 6.006.
@@ -250,7 +250,7 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   ourDecoder->setCursorPtsInSeconds(388388. / 30'000);
   auto output = ourDecoder->getNextFrame();
   EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000);
@@ -263,7 +263,7 @@ TEST_P(VideoDecoderTest, GetsFramePlayedAtTimestamp) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   auto output = ourDecoder->getFramePlayedAt(6.006);
   EXPECT_EQ(output.ptsSeconds, 6.006);
   // The frame's duration is 0.033367 according to ffprobe,
@@ -293,7 +293,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
-  ourDecoder->addVideoStreamDecoder(-1);
+  ourDecoder->addVideoStream(-1);
   ourDecoder->setCursorPtsInSeconds(6.0);
   auto output = ourDecoder->getNextFrame();
   torch::Tensor tensor6FromOurDecoder = output.data;
@@ -393,7 +393,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorFilterGraph) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(
+  ourDecoder->addVideoStream(
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamOptions("color_conversion_library=filtergraph"));
   auto output =
@@ -410,7 +410,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorSwscale) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  ourDecoder->addVideoStreamDecoder(
+  ourDecoder->addVideoStream(
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamOptions("color_conversion_library=swscale"));
   auto output =
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -19,6 +19,7 @@
 from torchcodec.decoders._core import (
     _add_video_stream,
     _test_frame_pts_equality,
+    add_audio_stream,
     add_video_stream,
     create_from_bytes,
     create_from_file,
@@ -63,6 +64,31 @@ def seek(self, pts: float):
 
 
 class TestOps:
+    def test_add_stream(self):
+        valid_video_stream, valid_audio_stream = 0, 1
+        # Adding a second video stream to the same decoder must fail.
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        add_video_stream(decoder, stream_index=valid_video_stream)
+        with pytest.raises(RuntimeError, match="Can only add one single stream"):
+            add_video_stream(decoder, stream_index=valid_video_stream)
+        # Same single-stream restriction when the stream added first is audio.
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        add_audio_stream(decoder, stream_index=valid_audio_stream)
+        with pytest.raises(RuntimeError, match="Can only add one single stream"):
+            add_audio_stream(decoder, stream_index=valid_audio_stream)
+        # Asking for a video stream at the audio stream's index is rejected.
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            ValueError, match=f"Is {valid_audio_stream} of the desired media type"
+        ):
+            add_video_stream(decoder, stream_index=valid_audio_stream)
+        # And the reverse: an audio stream at the video stream's index.
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        with pytest.raises(
+            ValueError, match=f"Is {valid_video_stream} of the desired media type"
+        ):
+            add_audio_stream(decoder, stream_index=valid_video_stream)
+
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_seek_and_next(self, device):
         decoder = create_from_file(str(NASA_VIDEO.path))