@@ -1355,9 +1355,14 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13551355 int desiredSampleRate =
13561356 streamInfo.audioStreamOptions .sampleRate .value_or (sourceSampleRate);
13571357
1358+ int sourceNumChannels = getNumChannels (srcAVFrame);
1359+ int desiredNumChannels =
1360+ streamInfo.audioStreamOptions .numChannels .value_or (sourceNumChannels);
1361+
13581362 bool mustConvert =
13591363 (sourceSampleFormat != desiredSampleFormat ||
1360- sourceSampleRate != desiredSampleRate);
1364+ sourceSampleRate != desiredSampleRate ||
1365+ sourceNumChannels != desiredNumChannels);
13611366
13621367 UniqueAVFrame convertedAVFrame;
13631368 if (mustConvert) {
@@ -1367,10 +1372,11 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13671372 sourceSampleFormat,
13681373 desiredSampleFormat,
13691374 sourceSampleRate,
1370- desiredSampleRate));
1375+ desiredSampleRate,
1376+ desiredNumChannels));
13711377 }
13721378
1373- convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate (
1379+ convertedAVFrame = convertAudioAVFrameSamples (
13741380 streamInfo.swrContext ,
13751381 srcAVFrame,
13761382 desiredSampleFormat,
@@ -1389,15 +1395,15 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13891395 av_get_sample_fmt_name (format));
13901396
13911397 auto numSamples = avFrame->nb_samples ; // per channel
1392- auto numChannels = getNumChannels (avFrame);
13931398
1394- frameOutput.data = torch::empty ({numChannels, numSamples}, torch::kFloat32 );
1399+ frameOutput.data =
1400+ torch::empty ({desiredNumChannels, numSamples}, torch::kFloat32 );
13951401
13961402 if (numSamples > 0 ) {
13971403 uint8_t * outputChannelData =
13981404 static_cast <uint8_t *>(frameOutput.data .data_ptr ());
13991405 auto numBytesPerChannel = numSamples * av_get_bytes_per_sample (format);
1400- for (auto channel = 0 ; channel < numChannels ;
1406+ for (auto channel = 0 ; channel < desiredNumChannels ;
14011407 ++channel, outputChannelData += numBytesPerChannel) {
14021408 std::memcpy (
14031409 outputChannelData,
@@ -1424,7 +1430,8 @@ std::optional<torch::Tensor> SingleStreamDecoder::maybeFlushSwrBuffers() {
14241430 return std::nullopt ;
14251431 }
14261432
1427- auto numChannels = getNumChannels (streamInfo.codecContext );
1433+ int numChannels = streamInfo.audioStreamOptions .numChannels .value_or (
1434+ getNumChannels (streamInfo.codecContext ));
14281435 torch::Tensor lastSamples =
14291436 torch::empty ({numChannels, numRemainingSamples}, torch::kFloat32 );
14301437
0 commit comments