
Commit 20e811f

Commit message: 2025-03-18 nightly release (5713507)
Authored and committed by pytorchbot
1 parent 9d01356, commit 20e811f

File tree: 9 files changed, +250 additions, -49 deletions

src/torchcodec/decoders/_core/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -77,10 +77,10 @@ if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})
   )


-  make_torchcodec_library(libtorchcodec4 ffmpeg4)
   make_torchcodec_library(libtorchcodec7 ffmpeg7)
   make_torchcodec_library(libtorchcodec6 ffmpeg6)
   make_torchcodec_library(libtorchcodec5 ffmpeg5)
+  make_torchcodec_library(libtorchcodec4 ffmpeg4)

 else()
   message(
@@ -97,6 +97,7 @@ else()
     libavformat
     libavcodec
     libavutil
+    libswresample
     libswscale
   )


src/torchcodec/decoders/_core/FFMPEGCommon.cpp

Lines changed: 52 additions & 1 deletion
@@ -60,7 +60,7 @@ int64_t getDuration(const AVFrame* frame) {
 #endif
 }

-int getNumChannels(const AVFrame* avFrame) {
+int getNumChannels(const UniqueAVFrame& avFrame) {
 #if LIBAVFILTER_VERSION_MAJOR > 8 || \
     (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
   return avFrame->ch_layout.nb_channels;
@@ -78,6 +78,57 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
 #endif
 }

+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVFrame& srcAVFrame) {
+#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+  dstAVFrame->ch_layout = srcAVFrame->ch_layout;
+#else
+  dstAVFrame->channel_layout = srcAVFrame->channel_layout;
+#endif
+}
+
+SwrContext* allocateSwrContext(
+    UniqueAVCodecContext& avCodecContext,
+    int sampleRate,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat) {
+  SwrContext* swrContext = nullptr;
+#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+  AVChannelLayout layout = avCodecContext->ch_layout;
+  auto status = swr_alloc_set_opts2(
+      &swrContext,
+      &layout,
+      desiredSampleFormat,
+      sampleRate,
+      &layout,
+      sourceSampleFormat,
+      sampleRate,
+      0,
+      nullptr);
+
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Couldn't create SwrContext: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+#else
+  int64_t layout = static_cast<int64_t>(avCodecContext->channel_layout);
+  swrContext = swr_alloc_set_opts(
+      nullptr,
+      layout,
+      desiredSampleFormat,
+      sampleRate,
+      layout,
+      sourceSampleFormat,
+      sampleRate,
+      0,
+      nullptr);
+#endif
+
+  TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");
+  return swrContext;
+}
+
 AVIOBytesContext::AVIOBytesContext(
     const void* data,
     size_t dataSize,
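
Editorial note, not part of the commit: allocateSwrContext only allocates and configures the resampler; the caller still has to run swr_init and eventually free it (the VideoDecoder changes below wrap it in a UniqueSwrContext). As a self-contained illustration of the libswresample sequence these helpers build on, here is a minimal sketch against the FFmpeg 5+ API (swr_alloc_set_opts2); the stereo layout, 44100 Hz rate, and S16 input format are illustrative assumptions, not values taken from the commit.

// Minimal sketch (FFmpeg 5+ API): allocate, init, use, and free a SwrContext
// that converts interleaved S16 to planar float at the same rate and layout.
extern "C" {
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
#include <cstdio>

int main() {
  SwrContext* swrContext = nullptr;
  AVChannelLayout layout;
  av_channel_layout_default(&layout, 2); // stereo, illustrative only

  // Output options come first, input options second; only the sample format
  // differs here, mirroring what allocateSwrContext sets up in the diff.
  int status = swr_alloc_set_opts2(
      &swrContext,
      &layout, AV_SAMPLE_FMT_FLTP, 44100, // output
      &layout, AV_SAMPLE_FMT_S16, 44100, // input
      0, nullptr);
  if (status < 0 || swr_init(swrContext) < 0) {
    std::fprintf(stderr, "failed to set up SwrContext\n");
    swr_free(&swrContext);
    return 1;
  }

  // swr_convert(swrContext, outPlanes, outNumSamples, inPlanes, inNumSamples)
  // would run here per decoded frame; it returns the number of samples
  // written, or a negative AVERROR code.

  swr_free(&swrContext);
  av_channel_layout_uninit(&layout);
  return 0;
}

On FFmpeg 4, the same setup goes through swr_alloc_set_opts with an integer channel-layout mask, which is exactly the split allocateSwrContext hides behind its LIBAVFILTER_VERSION_MAJOR check.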

src/torchcodec/decoders/_core/FFMPEGCommon.h

Lines changed: 13 additions & 1 deletion
@@ -22,6 +22,7 @@ extern "C" {
 #include <libavutil/opt.h>
 #include <libavutil/pixfmt.h>
 #include <libavutil/version.h>
+#include <libswresample/swresample.h>
 #include <libswscale/swscale.h>
 }

@@ -67,6 +68,8 @@ using UniqueAVIOContext = std::
     unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
 using UniqueSwsContext =
     std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
+using UniqueSwrContext =
+    std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;

 // These 2 classes share the same underlying AVPacket object. They are meant to
 // be used in tandem, like so:
@@ -139,9 +142,18 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
 int64_t getDuration(const UniqueAVFrame& frame);
 int64_t getDuration(const AVFrame* frame);

-int getNumChannels(const AVFrame* avFrame);
+int getNumChannels(const UniqueAVFrame& avFrame);
 int getNumChannels(const UniqueAVCodecContext& avCodecContext);

+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVFrame& srcAVFrame);
+SwrContext* allocateSwrContext(
+    UniqueAVCodecContext& avCodecContext,
+    int sampleRate,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat);
+
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 110 additions & 24 deletions
@@ -23,6 +23,7 @@ extern "C" {
 #include <libavutil/imgutils.h>
 #include <libavutil/log.h>
 #include <libavutil/pixdesc.h>
+#include <libswresample/swresample.h>
 #include <libswscale/swscale.h>
 }

@@ -467,6 +468,7 @@ void VideoDecoder::addStream(
   TORCH_CHECK_EQ(retVal, AVSUCCESS);

   streamInfo.codecContext->thread_count = ffmpegThreadCount.value_or(0);
+  streamInfo.codecContext->pkt_timebase = streamInfo.stream->time_base;

   // TODO_CODE_QUALITY same as above.
   if (mediaType == AVMEDIA_TYPE_VIDEO && device.type() == torch::kCUDA) {
@@ -558,6 +560,12 @@ void VideoDecoder::addAudioStream(int streamIndex) {
       static_cast<int64_t>(streamInfo.codecContext->sample_rate);
   streamMetadata.numChannels =
       static_cast<int64_t>(getNumChannels(streamInfo.codecContext));
+
+  // FFmpeg docs say that the decoder will try to decode natively in this
+  // format, if it can. Docs don't say what the decoder does when it doesn't
+  // support that format, but it looks like it does nothing, so this probably
+  // doesn't hurt.
+  streamInfo.codecContext->request_sample_fmt = AV_SAMPLE_FMT_FLTP;
 }

 // --------------------------------------------------------------------------
@@ -566,13 +574,15 @@ void VideoDecoder::addAudioStream(int streamIndex) {

 VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
   auto output = getNextFrameInternal();
-  output.data = maybePermuteHWC2CHW(output.data);
+  if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
+    output.data = maybePermuteHWC2CHW(output.data);
+  }
   return output;
 }

 VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
   AVFrameStream avFrameStream = decodeAVFrame(
       [this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
   return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
@@ -868,7 +878,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
     // If we need to seek backwards, then we have to seek back to the beginning
     // of the stream.
     // TODO-AUDIO: document why this is needed in a big comment.
-    setCursorPtsInSeconds(INT64_MIN);
+    setCursorPtsInSecondsInternal(INT64_MIN);
   }

   // TODO-AUDIO Pre-allocate a long-enough tensor instead of creating a vec +
@@ -914,6 +924,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
 // --------------------------------------------------------------------------

 void VideoDecoder::setCursorPtsInSeconds(double seconds) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  setCursorPtsInSecondsInternal(seconds);
+}
+
+void VideoDecoder::setCursorPtsInSecondsInternal(double seconds) {
   cursorWasJustSet_ = true;
   cursor_ =
       secondsToClosestPts(seconds, streamInfos_[activeStreamIndex_].timeBase);
@@ -1342,37 +1357,89 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
       !preAllocatedOutputTensor.has_value(),
       "pre-allocated audio tensor not supported yet.");

-  const AVFrame* avFrame = avFrameStream.avFrame.get();
+  AVSampleFormat sourceSampleFormat =
+      static_cast<AVSampleFormat>(avFrameStream.avFrame->format);
+  AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
+
+  UniqueAVFrame convertedAVFrame;
+  if (sourceSampleFormat != desiredSampleFormat) {
+    convertedAVFrame = convertAudioAVFrameSampleFormat(
+        avFrameStream.avFrame, sourceSampleFormat, desiredSampleFormat);
+  }
+  const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat)
+      ? convertedAVFrame
+      : avFrameStream.avFrame;
+
+  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
+  TORCH_CHECK(
+      format == desiredSampleFormat,
+      "Something went wrong, the frame didn't get converted to the desired format. ",
+      "Desired format = ",
+      av_get_sample_fmt_name(desiredSampleFormat),
+      "source format = ",
+      av_get_sample_fmt_name(format));

   auto numSamples = avFrame->nb_samples; // per channel
   auto numChannels = getNumChannels(avFrame);
   torch::Tensor outputData =
       torch::empty({numChannels, numSamples}, torch::kFloat32);

-  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
-  // TODO-AUDIO Implement all formats.
-  switch (format) {
-    case AV_SAMPLE_FMT_FLTP: {
-      uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
-      auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
-      for (auto channel = 0; channel < numChannels;
-           ++channel, outputChannelData += numBytesPerChannel) {
-        memcpy(
-            outputChannelData,
-            avFrame->extended_data[channel],
-            numBytesPerChannel);
-      }
-      break;
-    }
-    default:
-      TORCH_CHECK(
-          false,
-          "Unsupported audio format (yet!): ",
-          av_get_sample_fmt_name(format));
+  uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
+  auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
+  for (auto channel = 0; channel < numChannels;
+       ++channel, outputChannelData += numBytesPerChannel) {
+    memcpy(
+        outputChannelData, avFrame->extended_data[channel], numBytesPerChannel);
   }
   frameOutput.data = outputData;
 }

+UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
+    const UniqueAVFrame& avFrame,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat
+
+) {
+  auto& streamInfo = streamInfos_[activeStreamIndex_];
+  const auto& streamMetadata =
+      containerMetadata_.allStreamMetadata[activeStreamIndex_];
+  int sampleRate = static_cast<int>(streamMetadata.sampleRate.value());
+
+  if (!streamInfo.swrContext) {
+    createSwrContext(
+        streamInfo, sampleRate, sourceSampleFormat, desiredSampleFormat);
+  }
+
+  UniqueAVFrame convertedAVFrame(av_frame_alloc());
+  TORCH_CHECK(
+      convertedAVFrame,
+      "Could not allocate frame for sample format conversion.");
+
+  setChannelLayout(convertedAVFrame, avFrame);
+  convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
+  convertedAVFrame->sample_rate = avFrame->sample_rate;
+  convertedAVFrame->nb_samples = avFrame->nb_samples;
+
+  auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Could not allocate frame buffers for sample format conversion: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  auto numSampleConverted = swr_convert(
+      streamInfo.swrContext.get(),
+      convertedAVFrame->data,
+      convertedAVFrame->nb_samples,
+      static_cast<const uint8_t**>(const_cast<const uint8_t**>(avFrame->data)),
+      avFrame->nb_samples);
+  TORCH_CHECK(
+      numSampleConverted > 0,
+      "Error in swr_convert: ",
+      getFFMPEGErrorStringFromErrorCode(numSampleConverted));
+
+  return convertedAVFrame;
+}
+
 // --------------------------------------------------------------------------
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
@@ -1606,6 +1673,25 @@ void VideoDecoder::createSwsContext(
   streamInfo.swsContext.reset(swsContext);
 }

+void VideoDecoder::createSwrContext(
+    StreamInfo& streamInfo,
+    int sampleRate,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat) {
+  auto swrContext = allocateSwrContext(
+      streamInfo.codecContext,
+      sampleRate,
+      sourceSampleFormat,
+      desiredSampleFormat);
+
+  auto status = swr_init(swrContext);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Couldn't initialize SwrContext: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  streamInfo.swrContext.reset(swrContext);
+}
+
 // --------------------------------------------------------------------------
 // PTS <-> INDEX CONVERSIONS
 // --------------------------------------------------------------------------
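
Editorial note, not part of the commit: once the frame is (or has been converted to) AV_SAMPLE_FMT_FLTP, each entry of extended_data points to one contiguous float plane per channel, which is why the rewritten convertAudioAVFrameToFrameOutputOnCPU can fill a (numChannels, numSamples) float32 tensor with one memcpy per channel and no longer needs the per-format switch. A standalone restatement of that copy; the helper name is hypothetical and the frame is assumed to already be planar float.

// Sketch: copy a planar-float AVFrame into a (numChannels, numSamples)
// float32 tensor, one contiguous channel plane at a time.
#include <torch/torch.h>
extern "C" {
#include <libavutil/frame.h>
#include <libavutil/samplefmt.h>
}
#include <cstring>

torch::Tensor planarFloatFrameToTensor(const AVFrame* frame, int numChannels) {
  const int numSamples = frame->nb_samples; // per channel
  torch::Tensor out = torch::empty({numChannels, numSamples}, torch::kFloat32);

  auto* dst = static_cast<uint8_t*>(out.data_ptr());
  const size_t numBytesPerChannel =
      numSamples * av_get_bytes_per_sample(AV_SAMPLE_FMT_FLTP);
  for (int channel = 0; channel < numChannels;
       ++channel, dst += numBytesPerChannel) {
    std::memcpy(dst, frame->extended_data[channel], numBytesPerChannel);
  }
  return out;
}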

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 13 additions & 0 deletions
@@ -355,6 +355,7 @@ class VideoDecoder {
     FilterGraphContext filterGraphContext;
     ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
     UniqueSwsContext swsContext;
+    UniqueSwrContext swrContext;

     // Used to know whether a new FilterGraphContext or UniqueSwsContext should
     // be created before decoding a new frame.
@@ -370,6 +371,7 @@ class VideoDecoder {
   // DECODING APIS AND RELATED UTILS
   // --------------------------------------------------------------------------

+  void setCursorPtsInSecondsInternal(double seconds);
   bool canWeAvoidSeeking() const;

   void maybeSeekToBeforeDesiredPts();
@@ -401,6 +403,11 @@ class VideoDecoder {
       const AVFrame* avFrame,
       torch::Tensor& outputTensor);

+  UniqueAVFrame convertAudioAVFrameSampleFormat(
+      const UniqueAVFrame& avFrame,
+      AVSampleFormat sourceSampleFormat,
+      AVSampleFormat desiredSampleFormat);
+
   // --------------------------------------------------------------------------
   // COLOR CONVERSION LIBRARIES HANDLERS CREATION
   // --------------------------------------------------------------------------
@@ -415,6 +422,12 @@ class VideoDecoder {
       const DecodedFrameContext& frameContext,
       const enum AVColorSpace colorspace);

+  void createSwrContext(
+      StreamInfo& streamInfo,
+      int sampleRate,
+      AVSampleFormat sourceSampleFormat,
+      AVSampleFormat desiredSampleFormat);
+
   // --------------------------------------------------------------------------
   // PTS <-> INDEX CONVERSIONS
   // --------------------------------------------------------------------------
