@@ -23,6 +23,7 @@ extern "C" {
2323#include < libavutil/imgutils.h>
2424#include < libavutil/log.h>
2525#include < libavutil/pixdesc.h>
26+ #include < libswresample/swresample.h>
2627#include < libswscale/swscale.h>
2728}
2829
@@ -1341,6 +1342,71 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13411342
13421343 const AVFrame* avFrame = avFrameStream.avFrame .get ();
13431344
1345+ AVFrame* output_frame = nullptr ;
1346+ SwrContext* swr_ctx = NULL ; // TODO: own this through an RAII wrapper instead of calling swr_free() by hand
1347+
1348+ const auto & streamInfo = streamInfos_[activeStreamIndex_];
1349+ const auto & streamMetadata =
1350+ containerMetadata_.allStreamMetadata [activeStreamIndex_];
1351+ int sampleRate = static_cast <int >(streamMetadata.sampleRate .value ());
1352+
1353+ AVSampleFormat sampleFormat = AV_SAMPLE_FMT_FLTP;
1354+ AVChannelLayout layout = streamInfo.codecContext ->ch_layout ;
1355+
1356+ int status = swr_alloc_set_opts2 (
1357+ &swr_ctx,
1358+ &layout,
1359+ sampleFormat,
1360+ sampleRate,
1361+ &layout,
1362+ sampleFormat,
1363+ sampleRate,
1364+ 0 ,
1365+ NULL );
1366+
1367+ TORCH_CHECK (status == 0 , " IS NULL" );
1368+
1369+ if (swr_init (swr_ctx) < 0 ) {
1370+ swr_free (&swr_ctx);
1371+ TORCH_CHECK (false , " Failed to initialize the resampling context\n " );
1372+ }
1373+
1374+ // Allocate output frame
1375+ output_frame = av_frame_alloc ();
1376+ if (!output_frame) {
1377+ swr_free (&swr_ctx);
1378+ TORCH_CHECK (false , " Could not allocate output frame\n " );
1379+ }
1380+ output_frame->ch_layout = layout;
1381+ output_frame->sample_rate = sampleRate;
1382+ output_frame->format = sampleFormat;
1383+
1384+ output_frame->nb_samples = av_rescale_rnd (
1385+ swr_get_delay (swr_ctx, sampleRate) + avFrame->nb_samples ,
1386+ sampleRate,
1387+ sampleRate,
1388+ AV_ROUND_UP);
1389+
1390+ if (av_frame_get_buffer (output_frame, 0 ) < 0 ) {
1391+ av_frame_free (&output_frame);
1392+ swr_free (&swr_ctx);
1393+ TORCH_CHECK (false , " Could not allocate output frame samples" );
1394+ }
1395+
1396+ int ret = swr_convert (
1397+ swr_ctx,
1398+ output_frame->data ,
1399+ output_frame->nb_samples ,
1400+ (const uint8_t **)avFrame->data ,
1401+ avFrame->nb_samples );
1402+ if (ret < 0 ) {
1403+ av_frame_free (&output_frame);
1404+ swr_free (&swr_ctx);
1405+ TORCH_CHECK (false , " Error while converting\n " );
1406+ }
1407+
1408+ avFrame = output_frame; // From here on, sample count and channel data are read from the converted frame
1409+
13441410 auto numSamples = avFrame->nb_samples ; // per channel
13451411 auto numChannels = getNumChannels (avFrame);
13461412 torch::Tensor outputData =
@@ -1368,6 +1434,9 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13681434 av_get_sample_fmt_name (format));
13691435 }
13701436 frameOutput.data = outputData;
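1437+ // TODO: replace this manual cleanup with RAII owners; it is skipped if anything above exits early (presumably a failing TORCH_CHECK throws; confirm)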
1438+ av_frame_free (&output_frame);
1439+ swr_free (&swr_ctx);
13711440}
13721441
13731442// --------------------------------------------------------------------------
0 commit comments