meta-pytorch
diff --git a/src/torchcodec/_core/DeviceInterface.cpp b/src/torchcodec/_core/DeviceInterface.cpp
Lines changed: 19 additions & 8 deletions
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
Lines changed: 88 additions & 23 deletions
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
Lines changed: 3 additions & 2 deletions
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
Lines changed: 71 additions & 2 deletions
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
Lines changed: 8 additions & 1 deletion
@@ -11,8 +11,9 @@
 namespace facebook::torchcodec {
 
 namespace {
+using DeviceInterfaceMap = std::map<torch::DeviceType, CreateDeviceInterfaceFn>;
 std::mutex g_interface_mutex;
-std::map<torch::DeviceType, CreateDeviceInterfaceFn> g_interface_map;
+std::unique_ptr<DeviceInterfaceMap> g_interface_map;
 
 std::string getDeviceType(const std::string& device) {
   size_t pos = device.find(':');
@@ -28,11 +29,18 @@ bool registerDeviceInterface(
     torch::DeviceType deviceType,
     CreateDeviceInterfaceFn createInterface) {
   std::scoped_lock lock(g_interface_mutex);
+  if (!g_interface_map) {
+    // We delay this initialization until runtime to avoid the Static
+    // Initialization Order Fiasco:
+    //
+    //   https://en.cppreference.com/w/cpp/language/siof
+    g_interface_map = std::make_unique<DeviceInterfaceMap>();
+  }
   TORCH_CHECK(
-      g_interface_map.find(deviceType) == g_interface_map.end(),
+      g_interface_map->find(deviceType) == g_interface_map->end(),
       "Device interface already registered for ",
       deviceType);
-  g_interface_map.insert({deviceType, createInterface});
+  g_interface_map->insert({deviceType, createInterface});
   return true;
 }
 
@@ -45,14 +53,16 @@ torch::Device createTorchDevice(const std::string device) {
   std::scoped_lock lock(g_interface_mutex);
   std::string deviceType = getDeviceType(device);
   auto deviceInterface = std::find_if(
-      g_interface_map.begin(),
-      g_interface_map.end(),
+      g_interface_map->begin(),
+      g_interface_map->end(),
       [&](const std::pair<torch::DeviceType, CreateDeviceInterfaceFn>& arg) {
         return device.rfind(
                    torch::DeviceTypeName(arg.first, /*lcase*/ true), 0) == 0;
       });
   TORCH_CHECK(
-      deviceInterface != g_interface_map.end(), "Unsupported device: ", device);
+      deviceInterface != g_interface_map->end(),
+      "Unsupported device: ",
+      device);
 
   return torch::Device(device);
 }
@@ -67,11 +77,12 @@ std::unique_ptr<DeviceInterface> createDeviceInterface(
 
   std::scoped_lock lock(g_interface_mutex);
   TORCH_CHECK(
-      g_interface_map.find(deviceType) != g_interface_map.end(),
+      g_interface_map->find(deviceType) != g_interface_map->end(),
       "Unsupported device: ",
       device);
 
-  return std::unique_ptr<DeviceInterface>(g_interface_map[deviceType](device));
+  return std::unique_ptr<DeviceInterface>(
+      (*g_interface_map)[deviceType](device));
 }
 
 } // namespace facebook::torchcodec
@@ -33,6 +33,44 @@ void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
       supportedRates.str());
 }
 
+// Candidate output formats, most preferred first: FLT[P] matches the input
+// waveform (no conversion needed), then the rest from highest resolution down.
+static const std::vector<AVSampleFormat> preferredFormatsOrder = {
+    AV_SAMPLE_FMT_FLTP,
+    AV_SAMPLE_FMT_FLT,
+    AV_SAMPLE_FMT_DBLP,
+    AV_SAMPLE_FMT_DBL,
+    AV_SAMPLE_FMT_S64P,
+    AV_SAMPLE_FMT_S64,
+    AV_SAMPLE_FMT_S32P,
+    AV_SAMPLE_FMT_S32,
+    AV_SAMPLE_FMT_S16P,
+    AV_SAMPLE_FMT_S16,
+    AV_SAMPLE_FMT_U8P,
+    AV_SAMPLE_FMT_U8};
+
+// Returns the first entry of preferredFormatsOrder found in `avCodec`'s
+// AV_SAMPLE_FMT_NONE-terminated `sample_fmts` list.
+AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
+  const AVSampleFormat* supported = avCodec.sample_fmts;
+  if (supported == nullptr || supported[0] == AV_SAMPLE_FMT_NONE) {
+    // No list, or an empty one: nothing to validate against. Best we can do is
+    // hope that FLTP is supported by the encoder. If not, FFmpeg will raise.
+    return AV_SAMPLE_FMT_FLTP;
+  }
+
+  for (AVSampleFormat preferredFormat : preferredFormatsOrder) {
+    for (int i = 0; supported[i] != AV_SAMPLE_FMT_NONE; ++i) {
+      if (supported[i] == preferredFormat) {
+        return preferredFormat;
+      }
+    }
+  }
+  // Reached only if a future FFmpeg version defines a sample format that isn't
+  // in preferredFormatsOrder: fall back to the first format the codec lists.
+  return supported[0];
+}
+
 } // namespace
 
 AudioEncoder::~AudioEncoder() {}
@@ -41,12 +79,14 @@ AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
     std::string_view fileName,
-    std::optional<int64_t> bit_rate)
+    std::optional<int64_t> bitRate)
     : wf_(wf) {
   TORCH_CHECK(
       wf_.dtype() == torch::kFloat32,
       "waveform must have float32 dtype, got ",
       wf_.dtype());
+  // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
+  // planar (fltp).
   TORCH_CHECK(
       wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());
 
@@ -82,24 +122,20 @@ AudioEncoder::AudioEncoder(
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
   avCodecContext_.reset(avCodecContext);
 
-  if (bit_rate.has_value()) {
-    TORCH_CHECK(*bit_rate >= 0, "bit_rate=", *bit_rate, " must be >= 0.");
+  if (bitRate.has_value()) {
+    TORCH_CHECK(*bitRate >= 0, "bit_rate=", *bitRate, " must be >= 0.");
   }
   // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
   // well when "-b:a" isn't specified.
-  avCodecContext_->bit_rate = bit_rate.value_or(0);
+  avCodecContext_->bit_rate = bitRate.value_or(0);
 
   validateSampleRate(*avCodec, sampleRate);
   avCodecContext_->sample_rate = sampleRate;
 
-  // Note: This is the format of the **input** waveform. This doesn't determine
-  // the output.
-  // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
-  // planar.
-  // TODO-ENCODING If the encoder doesn't support FLTP (like flac), FFmpeg will
-  // raise. We need to handle this, probably converting the format with
-  // libswresample.
-  avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP;
+  // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
+  // may need to convert the wf into a supported output sample format, which is
+  // what the `.sample_fmt` defines.
+  avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
 
   int numChannels = static_cast<int>(wf_.sizes()[0]);
   TORCH_CHECK(
@@ -120,12 +156,6 @@ AudioEncoder::AudioEncoder(
       "avcodec_open2 failed: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
-  TORCH_CHECK(
-      avCodecContext_->frame_size > 0,
-      "frame_size is ",
-      avCodecContext_->frame_size,
-      ". Cannot encode. This should probably never happen?");
-
   // We're allocating the stream here. Streams are meant to be freed by
   // avformat_free_context(avFormatContext), which we call in the
   // avFormatContext_'s destructor.
@@ -143,8 +173,11 @@ AudioEncoder::AudioEncoder(
 void AudioEncoder::encode() {
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
-  avFrame->nb_samples = avCodecContext_->frame_size;
-  avFrame->format = avCodecContext_->sample_fmt;
+  //  Default to 256 like in torchaudio
+  int numSamplesAllocatedPerFrame =
+      avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256;
+  avFrame->nb_samples = numSamplesAllocatedPerFrame;
+  avFrame->format = AV_SAMPLE_FMT_FLTP;
   avFrame->sample_rate = avCodecContext_->sample_rate;
   avFrame->pts = 0;
   setChannelLayout(avFrame, avCodecContext_);
@@ -160,7 +193,6 @@ void AudioEncoder::encode() {
   uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
   int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
   int numEncodedSamples = 0; // per channel
-  int numSamplesPerFrame = avCodecContext_->frame_size; // per channel
   int numBytesPerSample = static_cast<int>(wf_.element_size());
   int numBytesPerChannel = numSamples * numBytesPerSample;
 
@@ -178,7 +210,7 @@ void AudioEncoder::encode() {
         getFFMPEGErrorStringFromErrorCode(status));
 
     int numSamplesToEncode =
-        std::min(numSamplesPerFrame, numSamples - numEncodedSamples);
+        std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
     int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
 
     for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
@@ -211,7 +243,37 @@ void AudioEncoder::encode() {
 
 void AudioEncoder::encodeInnerLoop(
     AutoAVPacket& autoAVPacket,
-    const UniqueAVFrame& avFrame) {
+    const UniqueAVFrame& srcAVFrame) {
+  bool mustConvert =
+      (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP &&
+       srcAVFrame != nullptr);
+  UniqueAVFrame convertedAVFrame;
+  if (mustConvert) {
+    if (!swrContext_) {
+      swrContext_.reset(createSwrContext(
+          avCodecContext_,
+          AV_SAMPLE_FMT_FLTP,
+          avCodecContext_->sample_fmt,
+          srcAVFrame->sample_rate, // No sample rate conversion
+          srcAVFrame->sample_rate));
+    }
+    convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
+        swrContext_,
+        srcAVFrame,
+        avCodecContext_->sample_fmt,
+        srcAVFrame->sample_rate, // No sample rate conversion
+        srcAVFrame->sample_rate);
+    TORCH_CHECK(
+        convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
+        "convertedAVFrame->nb_samples=",
+        convertedAVFrame->nb_samples,
+        " differs from ",
+        "srcAVFrame->nb_samples=",
+        srcAVFrame->nb_samples,
+        "This is unexpected, please report on the TorchCodec bug tracker.");
+  }
+  const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
+
   auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -248,6 +310,9 @@ void AudioEncoder::encodeInnerLoop(
 }
 
 void AudioEncoder::flushBuffers() {
+  // We flush the main FFmpeg buffers by sending a null frame, but not the
+  // swresample buffers. Flushing swresample is only necessary when converting
+  // sample rates, which we don't do for encoding.
   AutoAVPacket autoAVPacket;
   encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
 }
 
@@ -20,18 +20,19 @@ class AudioEncoder {
       // encoding will still work but audio will be distorted.
       int sampleRate,
       std::string_view fileName,
-      std::optional<int64_t> bit_rate = std::nullopt);
+      std::optional<int64_t> bitRate = std::nullopt);
   void encode();
 
  private:
   void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
-      const UniqueAVFrame& avFrame);
+      const UniqueAVFrame& srcAVFrame);
   void flushBuffers();
 
   UniqueEncodingAVFormatContext avFormatContext_;
   UniqueAVCodecContext avCodecContext_;
   int streamIndex_;
+  UniqueSwrContext swrContext_;
 
   const torch::Tensor wf_;
 };
 
@@ -116,16 +116,17 @@ void setChannelLayout(
 #endif
 }
 
-SwrContext* allocateSwrContext(
+SwrContext* createSwrContext(
     UniqueAVCodecContext& avCodecContext,
     AVSampleFormat sourceSampleFormat,
     AVSampleFormat desiredSampleFormat,
     int sourceSampleRate,
     int desiredSampleRate) {
   SwrContext* swrContext = nullptr;
+  int status = AVSUCCESS;
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
   AVChannelLayout layout = avCodecContext->ch_layout;
-  auto status = swr_alloc_set_opts2(
+  status = swr_alloc_set_opts2(
       &swrContext,
       &layout,
       desiredSampleFormat,
@@ -155,9 +156,77 @@ SwrContext* allocateSwrContext(
 #endif
 
   TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");
+  status = swr_init(swrContext);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Couldn't initialize SwrContext: ",
+      getFFMPEGErrorStringFromErrorCode(status),
+      ". If the error says 'Invalid argument', it's likely that you are using "
+      "a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
+      "valid scenarios. Try to upgrade FFmpeg?");
   return swrContext;
 }
 
+// Converts `srcAVFrame` to `desiredSampleFormat` (and, if the rates differ, to
+// `desiredSampleRate`) with swr_convert(), returning a newly allocated frame.
+// The returned frame's nb_samples is the number of samples actually produced.
+// Raises via TORCH_CHECK if an allocation or swr_convert() fails.
+UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
+    const UniqueSwrContext& swrContext,
+    const UniqueAVFrame& srcAVFrame,
+    AVSampleFormat desiredSampleFormat,
+    int sourceSampleRate,
+    int desiredSampleRate) {
+  UniqueAVFrame convertedAVFrame(av_frame_alloc());
+  TORCH_CHECK(
+      convertedAVFrame,
+      "Could not allocate frame for sample format conversion.");
+
+  setChannelLayout(convertedAVFrame, srcAVFrame);
+  convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
+  convertedAVFrame->sample_rate = desiredSampleRate;
+  if (sourceSampleRate != desiredSampleRate) {
+    // Upper bound on the number of output samples: when resampling,
+    // `swr_convert()` buffers the last few samples because they require future
+    // input, so it will likely produce fewer. That's why nb_samples is reset
+    // after the call. `swr_get_out_samples()` would also work, but empirically
+    // `av_rescale_rnd()` seems to provide a tighter bound.
+    convertedAVFrame->nb_samples = static_cast<int>(av_rescale_rnd(
+        swr_get_delay(swrContext.get(), sourceSampleRate) +
+            srcAVFrame->nb_samples,
+        desiredSampleRate,
+        sourceSampleRate,
+        AV_ROUND_UP));
+  } else {
+    convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
+  }
+
+  auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Could not allocate frame buffers for sample format conversion: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  auto numConvertedSamples = swr_convert(
+      swrContext.get(),
+      convertedAVFrame->data,
+      convertedAVFrame->nb_samples,
+      const_cast<const uint8_t**>(srcAVFrame->data),
+      srcAVFrame->nb_samples);
+  // numConvertedSamples can be 0 if we're downsampling by a great factor and
+  // the first frame doesn't contain a lot of samples. It should be handled
+  // properly by the caller.
+  TORCH_CHECK(
+      numConvertedSamples >= 0,
+      "Error in swr_convert: ",
+      getFFMPEGErrorStringFromErrorCode(numConvertedSamples));
+
+  // See comment above about nb_samples
+  convertedAVFrame->nb_samples = numConvertedSamples;
+
+  return convertedAVFrame;
+}
+
 void setFFmpegLogLevel() {
   auto logLevel = AV_LOG_QUIET;
   const char* logLevelEnvPtr = std::getenv("TORCHCODEC_FFMPEG_LOG_LEVEL");
 
@@ -158,13 +158,20 @@ void setChannelLayout(
 void setChannelLayout(
     UniqueAVFrame& dstAVFrame,
     const UniqueAVFrame& srcAVFrame);
-SwrContext* allocateSwrContext(
+SwrContext* createSwrContext(
     UniqueAVCodecContext& avCodecContext,
     AVSampleFormat sourceSampleFormat,
     AVSampleFormat desiredSampleFormat,
     int sourceSampleRate,
     int desiredSampleRate);
 
+UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
+    const UniqueSwrContext& swrContext,
+    const UniqueAVFrame& srcAVFrame,
+    AVSampleFormat desiredSampleFormat,
+    int sourceSampleRate,
+    int desiredSampleRate);
+
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();