Skip to content

Commit 979e72a

Browse files
committed
WIP
1 parent 846b7b8 commit 979e72a

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,12 @@ void VideoDecoder::addAudioStream(int streamIndex) {
556556
static_cast<int64_t>(streamInfo.codecContext->sample_rate);
557557
streamMetadata.numChannels =
558558
static_cast<int64_t>(getNumChannels(streamInfo.codecContext));
559+
560+
// FFmpeg docs say that the decoder will try to decode natively in this
561+
// format, if it can. Docs don't say what the decoder does when it doesn't
562+
// support that format, but it looks like it does nothing, so this probably
563+
// doesn't hurt.
564+
streamInfo.codecContext->request_sample_fmt = AV_SAMPLE_FMT_FLTP;
559565
}
560566

561567
// --------------------------------------------------------------------------
@@ -1342,24 +1348,28 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13421348

13431349
const AVFrame* avFrame = avFrameStream.avFrame.get();
13441350

1345-
AVFrame* output_frame = nullptr;
1346-
SwrContext* swr_ctx = NULL; // TODO RAII
1351+
AVSampleFormat sourceSampleFormat =
1352+
static_cast<AVSampleFormat>(avFrame->format);
1353+
AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
1354+
1355+
AVFrame* output_frame = nullptr;
1356+
SwrContext* swr_ctx = NULL; // TODO RAII
1357+
if (sourceSampleFormat != desiredSampleFormat) {
13471358

13481359
const auto& streamInfo = streamInfos_[activeStreamIndex_];
13491360
const auto& streamMetadata =
13501361
containerMetadata_.allStreamMetadata[activeStreamIndex_];
13511362
int sampleRate = static_cast<int>(streamMetadata.sampleRate.value());
13521363

1353-
AVSampleFormat sampleFormat = AV_SAMPLE_FMT_FLTP;
13541364
AVChannelLayout layout = streamInfo.codecContext->ch_layout;
13551365

13561366
int status = swr_alloc_set_opts2(
13571367
&swr_ctx,
13581368
&layout,
1359-
sampleFormat,
1369+
desiredSampleFormat,
13601370
sampleRate,
13611371
&layout,
1362-
sampleFormat,
1372+
sourceSampleFormat,
13631373
sampleRate,
13641374
0,
13651375
NULL);
@@ -1379,7 +1389,7 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13791389
}
13801390
output_frame->ch_layout = layout;
13811391
output_frame->sample_rate = sampleRate;
1382-
output_frame->format = sampleFormat;
1392+
output_frame->format = desiredSampleFormat;
13831393

13841394
output_frame->nb_samples = av_rescale_rnd(
13851395
swr_get_delay(swr_ctx, sampleRate) + avFrame->nb_samples,
@@ -1406,6 +1416,7 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
14061416
}
14071417

14081418
avFrame = output_frame; // lmao
1419+
}
14091420

14101421
auto numSamples = avFrame->nb_samples; // per channel
14111422
auto numChannels = getNumChannels(avFrame);
@@ -1434,7 +1445,7 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
14341445
av_get_sample_fmt_name(format));
14351446
}
14361447
frameOutput.data = outputData;
1437-
// TODO
1448+
// TODO
14381449
av_frame_free(&output_frame);
14391450
swr_free(&swr_ctx);
14401451
}

0 commit comments

Comments
 (0)