WIP

NicolasHug · NicolasHug · commit 823e9bba3e36 · 2025-03-13T14:43:29.000Z
diff --git a/src/torchcodec/decoders/_core/CMakeLists.txt b/src/torchcodec/decoders/_core/CMakeLists.txt
@@ -4,7 +4,8 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 find_package(Torch REQUIRED)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic -Werror ${TORCH_CXX_FLAGS}")
+# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic -Werror ${TORCH_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra ${TORCH_CXX_FLAGS}")
 find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
 
 function(make_torchcodec_library library_name ffmpeg_target)
@@ -97,6 +98,7 @@ else()
         libavformat
         libavcodec
         libavutil
+        libswresample
         libswscale
     )
 
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -23,6 +23,7 @@ extern "C" {
 #include <libavutil/imgutils.h>
 #include <libavutil/log.h>
 #include <libavutil/pixdesc.h>
+#include <libswresample/swresample.h>
 #include <libswscale/swscale.h>
 }
 
@@ -1341,6 +1342,71 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
 
   const AVFrame* avFrame = avFrameStream.avFrame.get();
 
+   AVFrame* output_frame = nullptr;
+   SwrContext* swr_ctx = NULL; // TODO RAII
+
+    const auto& streamInfo = streamInfos_[activeStreamIndex_];
+    const auto& streamMetadata =
+        containerMetadata_.allStreamMetadata[activeStreamIndex_];
+    int sampleRate = static_cast<int>(streamMetadata.sampleRate.value());
+
+    AVSampleFormat sampleFormat = AV_SAMPLE_FMT_FLTP;
+    AVChannelLayout layout = streamInfo.codecContext->ch_layout;
+
+    int status = swr_alloc_set_opts2(
+        &swr_ctx,
+        &layout,
+        sampleFormat,
+        sampleRate,
+        &layout,
+        sampleFormat,
+        sampleRate,
+        0,
+        NULL);
+
+    TORCH_CHECK(status == 0, "IS NULL");
+
+    if (swr_init(swr_ctx) < 0) {
+      swr_free(&swr_ctx);
+      TORCH_CHECK(false, "Failed to initialize the resampling context\n");
+    }
+
+    // Allocate output frame
+    output_frame = av_frame_alloc();
+    if (!output_frame) {
+      swr_free(&swr_ctx);
+      TORCH_CHECK(false, "Could not allocate output frame\n");
+    }
+    output_frame->ch_layout = layout;
+    output_frame->sample_rate = sampleRate;
+    output_frame->format = sampleFormat;
+
+    output_frame->nb_samples = av_rescale_rnd(
+        swr_get_delay(swr_ctx, sampleRate) + avFrame->nb_samples,
+        sampleRate,
+        sampleRate,
+        AV_ROUND_UP);
+
+    if (av_frame_get_buffer(output_frame, 0) < 0) {
+      av_frame_free(&output_frame);
+      swr_free(&swr_ctx);
+      TORCH_CHECK(false, "Could not allocate output frame samples");
+    }
+
+    int ret = swr_convert(
+        swr_ctx,
+        output_frame->data,
+        output_frame->nb_samples,
+        (const uint8_t**)avFrame->data,
+        avFrame->nb_samples);
+    if (ret < 0) {
+      av_frame_free(&output_frame);
+      swr_free(&swr_ctx);
+      TORCH_CHECK(false, "Error while converting\n");
+    }
+
+    avFrame = output_frame; // lmao
+
   auto numSamples = avFrame->nb_samples; // per channel
   auto numChannels = getNumChannels(avFrame);
   torch::Tensor outputData =
@@ -1368,6 +1434,9 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
           av_get_sample_fmt_name(format));
   }
   frameOutput.data = outputData;
+    // TODO
+  av_frame_free(&output_frame);
+  swr_free(&swr_ctx);
 }
 
 // --------------------------------------------------------------------------