Add num_channels parameter to AudioEncoder

NicolasHug · NicolasHug · commit 52d624ba69ec · 2025-05-22T10:18:31.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -55,6 +55,20 @@ void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
       supportedRates.str());
 }
 
+// Debug helper: prints to stdout every channel layout the encoder advertises.
+// AVCodec::ch_layouts ends with a zeroed entry, so stop at nb_channels == 0.
+void print_supported_channel_layouts(const AVCodec *codec) {
+    if (!codec->ch_layouts) {
+        printf("No specific channel layouts supported by this encoder.\n");
+        return;
+    }
+    for (const AVChannelLayout *l = codec->ch_layouts; l->nb_channels; ++l) {
+        char layout_name[256];
+        if (av_channel_layout_describe(l, layout_name, sizeof(layout_name)) >= 0)
+            printf("Supported channel layout: %s\n", layout_name);
+    }
+}
+
 static const std::vector<AVSampleFormat> preferredFormatsOrder = {
     AV_SAMPLE_FMT_FLTP,
     AV_SAMPLE_FMT_FLT,
@@ -101,7 +115,8 @@ AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
     std::string_view fileName,
-    std::optional<int64_t> bitRate)
+    std::optional<int64_t> bitRate,
+    std::optional<int64_t> numChannels)
     : wf_(validateWf(wf)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
@@ -121,15 +136,16 @@ AudioEncoder::AudioEncoder(
       "avio_open failed: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
-  initializeEncoder(sampleRate, bitRate);
+  initializeEncoder(sampleRate, bitRate, numChannels);
 }
 
 AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
     std::string_view formatName,
     std::unique_ptr<AVIOToTensorContext> avioContextHolder,
-    std::optional<int64_t> bitRate)
+    std::optional<int64_t> bitRate,
+    std::optional<int64_t> numChannels)
     : wf_(validateWf(wf)), avioContextHolder_(std::move(avioContextHolder)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
@@ -145,17 +161,19 @@ AudioEncoder::AudioEncoder(
 
   avFormatContext_->pb = avioContextHolder_->getAVIOContext();
 
-  initializeEncoder(sampleRate, bitRate);
+  initializeEncoder(sampleRate, bitRate, numChannels);
 }
 
 void AudioEncoder::initializeEncoder(
     int sampleRate,
-    std::optional<int64_t> bitRate) {
+    std::optional<int64_t> bitRate,
+    [[maybe_unused]] std::optional<int64_t> numChannels) {
   // We use the AVFormatContext's default codec for that
   // specific format/container.
   const AVCodec* avCodec =
       avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
   TORCH_CHECK(avCodec != nullptr, "Codec not found");
+  print_supported_channel_layouts(avCodec);
 
   AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
@@ -168,6 +186,10 @@ void AudioEncoder::initializeEncoder(
   // well when "-b:a" isn't specified.
   avCodecContext_->bit_rate = bitRate.value_or(0);
 
+  desiredNumChannels_ = static_cast<int>(numChannels.value_or(wf_.sizes()[0]));
+
+  setDefaultChannelLayout(avCodecContext_, desiredNumChannels_);
+
   validateSampleRate(*avCodec, sampleRate);
   avCodecContext_->sample_rate = sampleRate;
 
@@ -176,8 +198,6 @@ void AudioEncoder::initializeEncoder(
   // what the `.sample_fmt` defines.
   avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
 
-  setDefaultChannelLayout(avCodecContext_, static_cast<int>(wf_.sizes()[0]));
-
   int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -222,7 +242,7 @@ void AudioEncoder::encode() {
   avFrame->format = AV_SAMPLE_FMT_FLTP;
   avFrame->sample_rate = avCodecContext_->sample_rate;
   avFrame->pts = 0;
-  setChannelLayout(avFrame, avCodecContext_);
+  setDefaultChannelLayout(avFrame, static_cast<int>(wf_.sizes()[0]));
 
   auto status = av_frame_get_buffer(avFrame.get(), 0);
   TORCH_CHECK(
@@ -287,8 +307,10 @@ void AudioEncoder::encodeInnerLoop(
     AutoAVPacket& autoAVPacket,
     const UniqueAVFrame& srcAVFrame) {
   bool mustConvert =
-      (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP &&
-       srcAVFrame != nullptr);
+      (srcAVFrame != nullptr &&
+       (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP ||
+        getNumChannels(srcAVFrame) != desiredNumChannels_));
+
   UniqueAVFrame convertedAVFrame;
   if (mustConvert) {
     if (!swrContext_) {
@@ -298,15 +320,14 @@ void AudioEncoder::encodeInnerLoop(
           srcAVFrame->sample_rate, // No sample rate conversion
           srcAVFrame->sample_rate,
           srcAVFrame,
-          getNumChannels(srcAVFrame) // No num_channel conversion
-          ));
+          desiredNumChannels_));
     }
     convertedAVFrame = convertAudioAVFrameSamples(
         swrContext_,
         srcAVFrame,
         avCodecContext_->sample_fmt,
         srcAVFrame->sample_rate, // No sample rate conversion
-        getNumChannels(srcAVFrame)); // No num_channel conversion
+        desiredNumChannels_);
     TORCH_CHECK(
         convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
         "convertedAVFrame->nb_samples=",
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -13,6 +13,9 @@ class AudioEncoder {
   // like passing 0, which results in choosing the minimum supported bit rate.
   // Passing 44_100 could result in output being 44000 if only 44000 is
   // supported.
+  //
+  // TODO-ENCODING: bundle the optional params like bitRate, numChannels, etc.
+  // into an AudioStreamOptions struct, or similar.
   AudioEncoder(
       const torch::Tensor wf,
       // The *output* sample rate. We can't really decide for the user what it
@@ -21,20 +24,23 @@ class AudioEncoder {
       // encoding will still work but audio will be distorted.
       int sampleRate,
       std::string_view fileName,
-      std::optional<int64_t> bitRate = std::nullopt);
+      std::optional<int64_t> bitRate = std::nullopt,
+      std::optional<int64_t> numChannels = std::nullopt);
   AudioEncoder(
       const torch::Tensor wf,
       int sampleRate,
       std::string_view formatName,
       std::unique_ptr<AVIOToTensorContext> avioContextHolder,
-      std::optional<int64_t> bitRate = std::nullopt);
+      std::optional<int64_t> bitRate = std::nullopt,
+      std::optional<int64_t> numChannels = std::nullopt);
   void encode();
   torch::Tensor encodeToTensor();
 
  private:
   void initializeEncoder(
       int sampleRate,
-      std::optional<int64_t> bitRate = std::nullopt);
+      std::optional<int64_t> bitRate = std::nullopt,
+      std::optional<int64_t> numChannels = std::nullopt);
   void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
       const UniqueAVFrame& srcAVFrame);
@@ -44,6 +50,9 @@ class AudioEncoder {
   UniqueAVCodecContext avCodecContext_;
   int streamIndex_;
   UniqueSwrContext swrContext_;
+  // TODO-ENCODING: desiredNumChannels should just be part of an options struct,
+  // see other TODO above.
+  int desiredNumChannels_ = -1;
 
   const torch::Tensor wf_;
 
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -88,23 +88,35 @@ void setDefaultChannelLayout(
 #endif
 }
 
-void setChannelLayout(
-    UniqueAVFrame& dstAVFrame,
-    const UniqueAVCodecContext& avCodecContext) {
+void setDefaultChannelLayout(UniqueAVFrame& avFrame, int numChannels) {
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
-  auto status = av_channel_layout_copy(
-      &dstAVFrame->ch_layout, &avCodecContext->ch_layout);
-  TORCH_CHECK(
-      status == AVSUCCESS,
-      "Couldn't copy channel layout to avFrame: ",
-      getFFMPEGErrorStringFromErrorCode(status));
+  // Write FFmpeg's default layout for numChannels directly into the frame
+  // rather than copying it out of an uninitialized local AVChannelLayout.
+  av_channel_layout_default(&avFrame->ch_layout, numChannels);
 #else
-  dstAVFrame->channel_layout = avCodecContext->channel_layout;
-  dstAVFrame->channels = avCodecContext->channels;
-
+  // Older API: store the default layout mask along with the channel count.
+  avFrame->channel_layout = av_get_default_channel_layout(numChannels);
+  avFrame->channels = numChannels;
 #endif
 }
 
+// void setChannelLayout(
+//     UniqueAVFrame& dstAVFrame,
+//     const UniqueAVCodecContext& avCodecContext) {
+// #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+//   auto status = av_channel_layout_copy(
+//       &dstAVFrame->ch_layout, &avCodecContext->ch_layout);
+//   TORCH_CHECK(
+//       status == AVSUCCESS,
+//       "Couldn't copy channel layout to avFrame: ",
+//       getFFMPEGErrorStringFromErrorCode(status));
+// #else
+//   dstAVFrame->channel_layout = avCodecContext->channel_layout;
+//   dstAVFrame->channels = avCodecContext->channels;
+
+// #endif
+// }
+
 namespace {
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
 
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -151,9 +151,11 @@ void setDefaultChannelLayout(
     UniqueAVCodecContext& avCodecContext,
     int numChannels);
 
-void setChannelLayout(
-    UniqueAVFrame& dstAVFrame,
-    const UniqueAVCodecContext& avCodecContext);
+void setDefaultChannelLayout(UniqueAVFrame& avFrame, int numChannels);
+
+// void setChannelLayout(
+//     UniqueAVFrame& dstAVFrame,
+//     const UniqueAVCodecContext& avCodecContext);
 
 void setChannelLayout(
     UniqueAVFrame& dstAVFrame,
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -29,9 +29,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "torchcodec._core.ops", "//pytorch/torchcodec:torchcodec");
   m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
   m.def(
-      "encode_audio_to_file(Tensor wf, int sample_rate, str filename, int? bit_rate=None) -> ()");
+      "encode_audio_to_file(Tensor wf, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None) -> ()");
   m.def(
-      "encode_audio_to_tensor(Tensor wf, int sample_rate, str format, int? bit_rate=None) -> Tensor");
+      "encode_audio_to_tensor(Tensor wf, int sample_rate, str format, int? bit_rate=None, int? num_channels=None) -> Tensor");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
@@ -391,23 +391,27 @@ void encode_audio_to_file(
     const at::Tensor wf,
     int64_t sample_rate,
     std::string_view file_name,
-    std::optional<int64_t> bit_rate = std::nullopt) {
-  AudioEncoder(wf, validateSampleRate(sample_rate), file_name, bit_rate)
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt) {
+  AudioEncoder(
+      wf, validateSampleRate(sample_rate), file_name, bit_rate, num_channels)
       .encode();
 }
 
 at::Tensor encode_audio_to_tensor(
     const at::Tensor wf,
     int64_t sample_rate,
     std::string_view format,
-    std::optional<int64_t> bit_rate = std::nullopt) {
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   return AudioEncoder(
              wf,
              validateSampleRate(sample_rate),
              format,
              std::move(avioContextHolder),
-             bit_rate)
+             bit_rate,
+             num_channels)
       .encodeToTensor();
 }
 
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -163,14 +163,22 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
 
 @register_fake("torchcodec_ns::encode_audio_to_file")
 def encode_audio_to_file_abstract(
-    wf: torch.Tensor, sample_rate: int, filename: str, bit_rate: Optional[int] = None
+    wf: torch.Tensor,
+    sample_rate: int,
+    filename: str,
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
 ) -> None:
     return
 
 
 @register_fake("torchcodec_ns::encode_audio_to_tensor")
 def encode_audio_to_tensor_abstract(
-    wf: torch.Tensor, sample_rate: int, format: str, bit_rate: Optional[int] = None
+    wf: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
diff --git a/test/test_ops.py b/test/test_ops.py