Merge branch 'move-conversion-out' into encoding_sample_rate_lezzzgo

NicolasHug · NicolasHug · commit b2eed2f85f63 · 2025-05-29T10:29:18.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -215,9 +215,6 @@ void AudioEncoder::initializeEncoder(
       status == AVSUCCESS,
       "avcodec_open2 failed: ",
       getFFMPEGErrorStringFromErrorCode(status));
-  
-  bool supportsVariableFrameSize = avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE;
-  printf("supportsVariableFrameSize = %d\n", supportsVariableFrameSize);
 
   // We're allocating the stream here. Streams are meant to be freed by
   // avformat_free_context(avFormatContext), which we call in the
@@ -232,11 +229,19 @@ void AudioEncoder::initializeEncoder(
       getFFMPEGErrorStringFromErrorCode(status));
   streamIndex_ = avStream->index;
 
-  // frame_size * 2 is a decent default size. FFmpeg automatically re-allocates
-  // the fifo if more space is needed.
-  auto avAudioFifo = av_audio_fifo_alloc(avCodecContext_->sample_fmt, outNumChannels_, avCodecContext_->frame_size * 2);
-  TORCH_CHECK(avAudioFifo!= nullptr, "Couldn't create AVAudioFifo.");
-  avAudioFifo_.reset(avAudioFifo);
+  //   bool supportsVariableFrameSize =
+  //       avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE;
+  //   printf("supportsVariableFrameSize = %d\n", supportsVariableFrameSize);
+
+  //   // frame_size * 2 is a decent default size.
+  //   // FFmpeg automatically re-allocates the fifo
+  //   // if more space is needed.
+  //   auto avAudioFifo = av_audio_fifo_alloc(
+  //       avCodecContext_->sample_fmt,
+  //       outNumChannels_,
+  //       avCodecContext_->frame_size * 2);
+  //   TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo.");
+  //   avAudioFifo_.reset(avAudioFifo);
 }
 
 torch::Tensor AudioEncoder::encodeToTensor() {
@@ -300,10 +305,13 @@ void AudioEncoder::encode() {
     // encoded frame would contain more samples than necessary and our results
     // wouldn't match the ffmpeg CLI.
     avFrame->nb_samples = numSamplesToEncode;
-    encodeInnerLoop(autoAVPacket, avFrame);
 
-    avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
+    UniqueAVFrame convertedAVFrame = maybeConvertAVFrame(avFrame);
+    encodeInnerLoop(autoAVPacket, convertedAVFrame);
+
     numEncodedSamples += numSamplesToEncode;
+    // TODO-ENCODING set frame pts correctly, and test against it.
+    // avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
   }
   TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
 
@@ -316,67 +324,69 @@ void AudioEncoder::encode() {
       getFFMPEGErrorStringFromErrorCode(status));
 }
 
-void AudioEncoder::encodeInnerLoop(
-    AutoAVPacket& autoAVPacket,
-    UniqueAVFrame& srcAVFrame,
-    bool allowConvert) {
-  // TODO: Probably makes more sense to move the conversion away? It shouldn't
-  // be in inner loop in any case. We should also remove allowConvert.
-  bool mustConvert =
-      (allowConvert && srcAVFrame != nullptr &&
-       (static_cast<AVSampleFormat>(srcAVFrame->format) !=
-            avCodecContext_->sample_fmt ||
-        getNumChannels(srcAVFrame) != outNumChannels_ ||
-        srcAVFrame->sample_rate != outSampleRate_));
-
-  UniqueAVFrame convertedAVFrame;
-  if (mustConvert) {
-    if (!swrContext_) {
-      swrContext_.reset(createSwrContext(
-          AV_SAMPLE_FMT_FLTP,
-          avCodecContext_->sample_fmt,
-          srcAVFrame->sample_rate,
-          outSampleRate_,
-          srcAVFrame,
-          outNumChannels_));
-    }
-    convertedAVFrame = convertAudioAVFrameSamples(
-        swrContext_,
-        srcAVFrame,
+// Returns avFrame in the output sample format, channel count and sample rate:
+// a cheap clone sharing the same data if it already matches, else a conversion.
+UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
+  if (static_cast<AVSampleFormat>(avFrame->format) ==
+          avCodecContext_->sample_fmt &&
+      getNumChannels(avFrame) == outNumChannels_ &&
+      avFrame->sample_rate == outSampleRate_) {
+    UniqueAVFrame clonedAVFrame(av_frame_clone(avFrame.get()));
+    TORCH_CHECK(clonedAVFrame != nullptr, "Couldn't clone AVFrame.");
+    return clonedAVFrame;
+  }
+  if (!swrContext_) {
+    swrContext_.reset(createSwrContext(
+        static_cast<AVSampleFormat>(avFrame->format),
         avCodecContext_->sample_fmt,
+        avFrame->sample_rate,
         outSampleRate_,
-        outNumChannels_);
-    if (outSampleRate_ == sampleRateInput_) {
-      TORCH_CHECK(
-          convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
-          "convertedAVFrame->nb_samples=",
-          convertedAVFrame->nb_samples,
-          " differs from ",
-          "srcAVFrame->nb_samples=",
-          srcAVFrame->nb_samples,
-          "This is unexpected, please report on the TorchCodec bug tracker.");
-    }
+        avFrame,
+        outNumChannels_));
   }
-  UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
-
-  if (avFrame != nullptr) {
-    // TODO static cast
-    int numSamplesWritten = av_audio_fifo_write(avAudioFifo_.get(), (void**)avFrame->data, avFrame->nb_samples);
-    TORCH_CHECK(numSamplesWritten == avFrame->nb_samples, "Tried to write  TODO");
-    printf("Writing %d samples to fifo (size = %d)\n", avFrame->nb_samples, av_audio_fifo_size(avAudioFifo_.get()));
-
-    avFrame = allocateAVFrame(avCodecContext_->frame_size, outSampleRate_, outNumChannels_);
-    // TODO cast
-    int numSamplesRead = av_audio_fifo_read(avAudioFifo_.get(), (void**)avFrame->data, avFrame->nb_samples);
-    printf("Read %d from fifo\n", numSamplesRead);
-    TORCH_CHECK(numSamplesRead > 0, "Tried to read TODO");
+  UniqueAVFrame convertedAVFrame = convertAudioAVFrameSamples(
+      swrContext_,
+      avFrame,
+      avCodecContext_->sample_fmt,
+      outSampleRate_,
+      outNumChannels_);
+  if (avFrame->sample_rate == outSampleRate_) {
+    TORCH_CHECK(
+        convertedAVFrame->nb_samples == avFrame->nb_samples,
+        "convertedAVFrame->nb_samples=",
+        convertedAVFrame->nb_samples,
+        " differs from avFrame->nb_samples=",
+        avFrame->nb_samples,
+        ". This is unexpected, please report on the TorchCodec bug tracker.");
+  }
+  return convertedAVFrame;
+}
 
-  if (avFrame != nullptr) {
-    printf("Sending frame with %d samples\n", avFrame->nb_samples);
-  } else{
-    printf("AVFrame is empty\n");
-  }
+void AudioEncoder::encodeInnerLoop(
+    AutoAVPacket& autoAVPacket,
+    const UniqueAVFrame& avFrame) {
+  //   if (avFrame != nullptr) {
+  //     // TODO static cast
+  //     int numSamplesWritten = av_audio_fifo_write(avAudioFifo_.get(),
+  //     (void**)avFrame->data, avFrame->nb_samples);
+  //     TORCH_CHECK(numSamplesWritten == avFrame->nb_samples, "Tried to write
+  //     TODO"); printf("Writing %d samples to fifo (size = %d)\n",
+  //     avFrame->nb_samples, av_audio_fifo_size(avAudioFifo_.get()));
+
+  //     avFrame = allocateAVFrame(avCodecContext_->frame_size, outSampleRate_,
+  //     outNumChannels_);
+  //     // TODO cast
+  //     int numSamplesRead = av_audio_fifo_read(avAudioFifo_.get(),
+  //     (void**)avFrame->data, avFrame->nb_samples); printf("Read %d from
+  //     fifo\n", numSamplesRead); TORCH_CHECK(numSamplesRead > 0, "Tried to
+  //     read TODO");
+  //   }
+
+  //   if (avFrame != nullptr) {
+  //     printf("Sending frame with %d samples\n", avFrame->nb_samples);
+  //   } else{
+  //     printf("AVFrame is empty\n");
+  //   }
   auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -434,13 +444,12 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
       swrContext_.get(), avFrame->data, avFrame->nb_samples, NULL, 0);
   avFrame->nb_samples = actualNumRemainingSamples;
 
-  encodeInnerLoop(autoAVPacket, avFrame, false);
+  encodeInnerLoop(autoAVPacket, avFrame);
 }
 
 void AudioEncoder::flushBuffers() {
   AutoAVPacket autoAVPacket;
   maybeFlushSwrBuffers(autoAVPacket);
-  auto zob = UniqueAVFrame(nullptr);
-  encodeInnerLoop(autoAVPacket, zob);
+  encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
 }
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -36,10 +36,10 @@ class AudioEncoder {
 
  private:
   void initializeEncoder(const AudioStreamOptions& audioStreamOptions);
+  UniqueAVFrame maybeConvertAVFrame(const UniqueAVFrame& avFrame);
   void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
-      UniqueAVFrame& srcAVFrame,
-      bool allowConvert = true);
+      const UniqueAVFrame& avFrame);
   void maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket);
   void flushBuffers();
 
@@ -55,7 +55,6 @@ class AudioEncoder {
   const torch::Tensor wf_;
   int sampleRateInput_ = -1;
 
-
   UniqueAVAudioFifo avAudioFifo_;
 
   // Stores the AVIOContext for the output tensor buffer.
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -15,14 +15,14 @@ extern "C" {
 #include <libavfilter/avfilter.h>
 #include <libavformat/avformat.h>
 #include <libavformat/avio.h>
+#include <libavutil/audio_fifo.h>
 #include <libavutil/avutil.h>
 #include <libavutil/dict.h>
 #include <libavutil/display.h>
 #include <libavutil/file.h>
 #include <libavutil/opt.h>
 #include <libavutil/pixfmt.h>
 #include <libavutil/version.h>
-#include <libavutil/audio_fifo.h>
 #include <libswresample/swresample.h>
 #include <libswscale/swscale.h>
 }
@@ -74,9 +74,8 @@ using UniqueSwsContext =
     std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
 using UniqueSwrContext =
     std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;
-using UniqueAVAudioFifo = std::unique_ptr<
-    AVAudioFifo,
-    Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
+using UniqueAVAudioFifo = std::
+    unique_ptr<AVAudioFifo, Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
 
 // These 2 classes share the same underlying AVPacket object. They are meant to
 // be used in tandem, like so: