@@ -23,6 +23,7 @@ extern "C" {
2323#include < libavutil/imgutils.h>
2424#include < libavutil/log.h>
2525#include < libavutil/pixdesc.h>
26+ #include < libswresample/swresample.h>
2627#include < libswscale/swscale.h>
2728}
2829
@@ -541,14 +542,18 @@ void VideoDecoder::addVideoStream(
541542 videoStreamOptions.colorConversionLibrary .value_or (defaultLibrary);
542543}
543544
544- void VideoDecoder::addAudioStream (int streamIndex) {
545+ void VideoDecoder::addAudioStream (
546+ int streamIndex,
547+ const AudioStreamOptions& audioStreamOptions) {
545548 TORCH_CHECK (
546549 seekMode_ == SeekMode::approximate,
547550 " seek_mode must be 'approximate' for audio streams." );
548551
549552 addStream (streamIndex, AVMEDIA_TYPE_AUDIO);
550553
551554 auto & streamInfo = streamInfos_[activeStreamIndex_];
555+ streamInfo.audioStreamOptions = audioStreamOptions;
556+
552557 auto & streamMetadata =
553558 containerMetadata_.allStreamMetadata [activeStreamIndex_];
554559 streamMetadata.sampleRate =
@@ -1332,6 +1337,82 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13321337 " pre-allocated audio tensor not supported yet." );
13331338
13341339 const AVFrame* avFrame = avFrameStream.avFrame .get ();
1340+ AVFrame* output_frame = nullptr ;
1341+ SwrContext* swr_ctx = NULL ; // TODO RAII
1342+
1343+ const auto sampleRate =
1344+ streamInfos_[activeStreamIndex_].audioStreamOptions .sampleRate ;
1345+ if (sampleRate.has_value ()) {
1346+ int outRate = static_cast <int >(*sampleRate);
1347+ auto & streamMetadata =
1348+ containerMetadata_.allStreamMetadata [activeStreamIndex_];
1349+ int inRate = static_cast <int >(streamMetadata.sampleRate .value ());
1350+
1351+ printf (" RESAMPLEING FROM %d to %d\n " , outRate, inRate);
1352+ AVSampleFormat sampleFormat = AV_SAMPLE_FMT_FLTP;
1353+
1354+ AVChannelLayout stereoLayout = AV_CHANNEL_LAYOUT_STEREO;
1355+ const AVChannelLayout* chl = &stereoLayout;
1356+
1357+ int status = swr_alloc_set_opts2 (
1358+ &swr_ctx,
1359+ chl,
1360+ sampleFormat,
1361+ outRate,
1362+ chl,
1363+ sampleFormat,
1364+ inRate,
1365+ 0 ,
1366+ NULL );
1367+
1368+ TORCH_CHECK (status == 0 , " IS NULL" );
1369+
1370+ if (swr_init (swr_ctx) < 0 ) {
1371+ swr_free (&swr_ctx);
1372+ TORCH_CHECK (false , " Failed to initialize the resampling context\n " );
1373+ }
1374+
1375+ // Allocate output frame
1376+ output_frame = av_frame_alloc ();
1377+ if (!output_frame) {
1378+ swr_free (&swr_ctx);
1379+ TORCH_CHECK (false , " Could not allocate output frame\n " );
1380+ }
1381+ output_frame->ch_layout = stereoLayout;
1382+ output_frame->sample_rate = outRate;
1383+ output_frame->format = sampleFormat;
1384+
1385+ output_frame->nb_samples = av_rescale_rnd (
1386+ swr_get_delay (swr_ctx, inRate) + avFrame->nb_samples ,
1387+ outRate,
1388+ inRate,
1389+ AV_ROUND_UP);
1390+
1391+ if (av_frame_get_buffer (output_frame, 0 ) < 0 ) {
1392+ av_frame_free (&output_frame);
1393+ swr_free (&swr_ctx);
1394+ TORCH_CHECK (false , " Could not allocate output frame samples" );
1395+ }
1396+
1397+ int ret = swr_convert (
1398+ swr_ctx,
1399+ output_frame->data ,
1400+ output_frame->nb_samples ,
1401+ (const uint8_t **)avFrame->data ,
1402+ avFrame->nb_samples );
1403+ if (ret < 0 ) {
1404+ av_frame_free (&output_frame);
1405+ swr_free (&swr_ctx);
1406+ TORCH_CHECK (false , " Error while converting\n " );
1407+ }
1408+
1409+ printf (
1410+ " nb_samples: %d %d\n " , avFrame->nb_samples , output_frame->nb_samples );
1411+
1412+ avFrame = output_frame; // lmao
1413+ } else {
1414+ printf (" NO RESAMPLING\n " );
1415+ }
13351416
13361417 auto numSamples = avFrame->nb_samples ; // per channel
13371418 auto numChannels = getNumChannels (avFrame);
@@ -1360,6 +1441,10 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13601441 av_get_sample_fmt_name (format));
13611442 }
13621443 frameOutput.data = outputData;
1444+
1445+ // TODO
1446+ av_frame_free (&output_frame);
1447+ swr_free (&swr_ctx);
13631448}
13641449
13651450// --------------------------------------------------------------------------
0 commit comments