AudioDecoder: specify desired num_channels

NicolasHug · NicolasHug · commit 6e6600138391 · 2025-05-13T18:38:17.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -297,9 +297,11 @@ void AudioEncoder::encodeInnerLoop(
           AV_SAMPLE_FMT_FLTP,
           avCodecContext_->sample_fmt,
           srcAVFrame->sample_rate, // No sample rate conversion
-          srcAVFrame->sample_rate));
+          srcAVFrame->sample_rate,
+          2 // TODO
+          ));
     }
-    convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
+    convertedAVFrame = convertAudioAVFrameSamples(
         swrContext_,
         srcAVFrame,
         avCodecContext_->sample_fmt,
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -121,17 +121,26 @@ SwrContext* createSwrContext(
     AVSampleFormat sourceSampleFormat,
     AVSampleFormat desiredSampleFormat,
     int sourceSampleRate,
-    int desiredSampleRate) {
+    int desiredSampleRate,
+    int desiredNumChannels) {
   SwrContext* swrContext = nullptr;
   int status = AVSUCCESS;
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
-  AVChannelLayout layout = avCodecContext->ch_layout;
+  AVChannelLayout sourceLayout = avCodecContext->ch_layout;
+  AVChannelLayout desiredLayout = {}; // zero-init: copy() uninits dst first
+  if (desiredNumChannels == getNumChannels(avCodecContext)) {
+    status = av_channel_layout_copy(&desiredLayout, &sourceLayout);
+    TORCH_CHECK(status == AVSUCCESS, "Couldn't copy channel layout.");
+  } else {
+    av_channel_layout_default(&desiredLayout, desiredNumChannels);
+    // TODO: validate desiredNumChannels / resulting layout (call returns void)
+  }
   status = swr_alloc_set_opts2(
       &swrContext,
-      &layout,
+      &desiredLayout,
       desiredSampleFormat,
       desiredSampleRate,
-      &layout,
+      &sourceLayout,
       sourceSampleFormat,
       sourceSampleRate,
       0,
@@ -167,7 +176,7 @@ SwrContext* createSwrContext(
   return swrContext;
 }
 
-UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
+UniqueAVFrame convertAudioAVFrameSamples(
     const UniqueSwrContext& swrContext,
     const UniqueAVFrame& srcAVFrame,
     AVSampleFormat desiredSampleFormat,
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -163,9 +163,10 @@ SwrContext* createSwrContext(
     AVSampleFormat sourceSampleFormat,
     AVSampleFormat desiredSampleFormat,
     int sourceSampleRate,
-    int desiredSampleRate);
+    int desiredSampleRate,
+    int desiredNumChannels);
 
-UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
+UniqueAVFrame convertAudioAVFrameSamples(
     const UniqueSwrContext& swrContext,
     const UniqueAVFrame& srcAVFrame,
     AVSampleFormat desiredSampleFormat,
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1355,9 +1355,14 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
   int desiredSampleRate =
       streamInfo.audioStreamOptions.sampleRate.value_or(sourceSampleRate);
 
+  int sourceNumChannels = getNumChannels(srcAVFrame);
+  int desiredNumChannels =
+      streamInfo.audioStreamOptions.numChannels.value_or(sourceNumChannels);
+
   bool mustConvert =
       (sourceSampleFormat != desiredSampleFormat ||
-       sourceSampleRate != desiredSampleRate);
+       sourceSampleRate != desiredSampleRate ||
+       sourceNumChannels != desiredNumChannels);
 
   UniqueAVFrame convertedAVFrame;
   if (mustConvert) {
@@ -1367,10 +1372,11 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
           sourceSampleFormat,
           desiredSampleFormat,
           sourceSampleRate,
-          desiredSampleRate));
+          desiredSampleRate,
+          desiredNumChannels));
     }
 
-    convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
+    convertedAVFrame = convertAudioAVFrameSamples(
         streamInfo.swrContext,
         srcAVFrame,
         desiredSampleFormat,
@@ -1389,15 +1395,15 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
       av_get_sample_fmt_name(format));
 
   auto numSamples = avFrame->nb_samples; // per channel
-  auto numChannels = getNumChannels(avFrame);
 
-  frameOutput.data = torch::empty({numChannels, numSamples}, torch::kFloat32);
+  frameOutput.data =
+      torch::empty({desiredNumChannels, numSamples}, torch::kFloat32);
 
   if (numSamples > 0) {
     uint8_t* outputChannelData =
         static_cast<uint8_t*>(frameOutput.data.data_ptr());
     auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
-    for (auto channel = 0; channel < numChannels;
+    for (auto channel = 0; channel < desiredNumChannels;
          ++channel, outputChannelData += numBytesPerChannel) {
       std::memcpy(
           outputChannelData,
@@ -1424,7 +1430,8 @@ std::optional<torch::Tensor> SingleStreamDecoder::maybeFlushSwrBuffers() {
     return std::nullopt;
   }
 
-  auto numChannels = getNumChannels(streamInfo.codecContext);
+  int numChannels = streamInfo.audioStreamOptions.numChannels.value_or(
+      getNumChannels(streamInfo.codecContext));
   torch::Tensor lastSamples =
       torch::empty({numChannels, numRemainingSamples}, torch::kFloat32);
 
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -44,6 +44,7 @@ struct AudioStreamOptions {
   AudioStreamOptions() {}
 
   std::optional<int> sampleRate;
+  std::optional<int> numChannels;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -40,7 +40,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()");
   m.def(
-      "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None) -> ()");
+      "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
   m.def("get_next_frame(Tensor(a!) decoder) -> (Tensor, Tensor, Tensor)");
   m.def(
@@ -280,9 +280,11 @@ void add_video_stream(
 void add_audio_stream(
     at::Tensor& decoder,
     std::optional<int64_t> stream_index = std::nullopt,
-    std::optional<int64_t> sample_rate = std::nullopt) {
+    std::optional<int64_t> sample_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt) {
   AudioStreamOptions audioStreamOptions;
   audioStreamOptions.sampleRate = sample_rate;
+  audioStreamOptions.numChannels = num_channels;
 
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   videoDecoder->addAudioStream(stream_index.value_or(-1), audioStreamOptions);
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -221,6 +221,8 @@ def add_audio_stream_abstract(
     decoder: torch.Tensor,
     *,
     stream_index: Optional[int] = None,
+    sample_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
 ) -> None:
     return
 
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -40,6 +40,8 @@ class AudioDecoder:
             the :term:`best stream` is used.
         sample_rate (int, optional): The desired output sample rate of the decoded samples.
             By default, the samples are returned in their original sample rate.
+        num_channels (int, optional): The desired number of channels of the decoded samples.
+            By default, the original number of channels is used.
 
     Attributes:
         metadata (AudioStreamMetadata): Metadata of the audio stream.
@@ -54,11 +56,15 @@ def __init__(
         *,
         stream_index: Optional[int] = None,
         sample_rate: Optional[int] = None,
+        num_channels: Optional[int] = None,
     ):
         self._decoder = create_decoder(source=source, seek_mode="approximate")
 
         core.add_audio_stream(
-            self._decoder, stream_index=stream_index, sample_rate=sample_rate
+            self._decoder,
+            stream_index=stream_index,
+            sample_rate=sample_rate,
+            num_channels=num_channels,
         )
 
         container_metadata = core.get_container_metadata(self._decoder)