-
Notifications
You must be signed in to change notification settings - Fork 75
Encoding: allow user-defined encoded sample rate #700
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 34 commits
52d624b
aad9c7d
2d76a7b
7d643f2
5d9eb54
c40deef
96e5e60
952af0f
88a87c4
e0ba0c5
2c559b2
70ae1a1
75e23b9
b6e3c27
387328a
823e7f0
4be2953
639d5ab
3ce4612
6c91450
b2eed2f
8fdb6ed
3399b34
6d7908f
f30d0ff
ef1b461
6d2aef1
17cd1d8
17340a6
e74d72d
5ef60d7
51e80a3
b6f8478
0af41b7
00ca07e
fa0856a
61bbe4f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -105,7 +105,7 @@ AudioEncoder::AudioEncoder( | |
| int sampleRate, | ||
| std::string_view fileName, | ||
| const AudioStreamOptions& audioStreamOptions) | ||
| : samples_(validateSamples(samples)) { | ||
| : samples_(validateSamples(samples)), inSampleRate_(sampleRate) { | ||
| setFFmpegLogLevel(); | ||
| AVFormatContext* avFormatContext = nullptr; | ||
| int status = avformat_alloc_output_context2( | ||
|
|
@@ -128,7 +128,7 @@ AudioEncoder::AudioEncoder( | |
| ", make sure it's a valid path? ", | ||
| getFFMPEGErrorStringFromErrorCode(status)); | ||
|
|
||
| initializeEncoder(sampleRate, audioStreamOptions); | ||
| initializeEncoder(audioStreamOptions); | ||
| } | ||
|
|
||
| AudioEncoder::AudioEncoder( | ||
|
|
@@ -138,6 +138,7 @@ AudioEncoder::AudioEncoder( | |
| std::unique_ptr<AVIOToTensorContext> avioContextHolder, | ||
| const AudioStreamOptions& audioStreamOptions) | ||
| : samples_(validateSamples(samples)), | ||
| inSampleRate_(sampleRate), | ||
| avioContextHolder_(std::move(avioContextHolder)) { | ||
| setFFmpegLogLevel(); | ||
| AVFormatContext* avFormatContext = nullptr; | ||
|
|
@@ -155,11 +156,10 @@ AudioEncoder::AudioEncoder( | |
|
|
||
| avFormatContext_->pb = avioContextHolder_->getAVIOContext(); | ||
|
|
||
| initializeEncoder(sampleRate, audioStreamOptions); | ||
| initializeEncoder(audioStreamOptions); | ||
| } | ||
|
|
||
| void AudioEncoder::initializeEncoder( | ||
| int sampleRate, | ||
| const AudioStreamOptions& audioStreamOptions) { | ||
| // We use the AVFormatContext's default codec for that | ||
| // specific format/container. | ||
|
|
@@ -187,8 +187,9 @@ void AudioEncoder::initializeEncoder( | |
| // not related to the input samples. | ||
| setDefaultChannelLayout(avCodecContext_, outNumChannels_); | ||
|
|
||
| validateSampleRate(*avCodec, sampleRate); | ||
| avCodecContext_->sample_rate = sampleRate; | ||
| outSampleRate_ = audioStreamOptions.sampleRate.value_or(inSampleRate_); | ||
| validateSampleRate(*avCodec, outSampleRate_); | ||
| avCodecContext_->sample_rate = outSampleRate_; | ||
|
|
||
| // Input samples are expected to be FLTP. Not all encoders support FLTP, so we | ||
| // may need to convert the samples into a supported output sample format, | ||
|
|
@@ -213,6 +214,21 @@ void AudioEncoder::initializeEncoder( | |
| "avcodec_parameters_from_context failed: ", | ||
| getFFMPEGErrorStringFromErrorCode(status)); | ||
| streamIndex_ = avStream->index; | ||
|
|
||
| // If sample rate conversion is needed and the encoder doesn't support | ||
| // variable frame size, we need to create an intermediate FIFO. See | ||
| // [Encoding loop, sample rate conversion and FIFO]. | ||
| if (((avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE) == 0) && | ||
| (inSampleRate_ != outSampleRate_)) { | ||
| // frame_size * 2 is a decent default size. FFmpeg automatically | ||
| // re-allocates the fifo if more space is needed. | ||
| auto avAudioFifo = av_audio_fifo_alloc( | ||
| avCodecContext_->sample_fmt, | ||
| outNumChannels_, | ||
| avCodecContext_->frame_size * 2); | ||
| TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo."); | ||
| avAudioFifo_.reset(avAudioFifo); | ||
| } | ||
| } | ||
|
|
||
| torch::Tensor AudioEncoder::encodeToTensor() { | ||
|
|
@@ -230,24 +246,15 @@ void AudioEncoder::encode() { | |
| TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice."); | ||
| encodeWasCalled_ = true; | ||
|
|
||
| UniqueAVFrame avFrame(av_frame_alloc()); | ||
| TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame."); | ||
| // Default to 256 like in torchaudio | ||
| int numSamplesAllocatedPerFrame = | ||
| avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256; | ||
| avFrame->nb_samples = numSamplesAllocatedPerFrame; | ||
| avFrame->format = AV_SAMPLE_FMT_FLTP; | ||
| avFrame->sample_rate = avCodecContext_->sample_rate; | ||
| UniqueAVFrame avFrame = allocateAVFrame( | ||
| numSamplesAllocatedPerFrame, | ||
| inSampleRate_, | ||
| static_cast<int>(samples_.sizes()[0]), | ||
| AV_SAMPLE_FMT_FLTP); | ||
| avFrame->pts = 0; | ||
| // We set the channel layout of the frame to the default layout corresponding | ||
| // to the input samples' number of channels | ||
| setDefaultChannelLayout(avFrame, static_cast<int>(samples_.sizes()[0])); | ||
|
|
||
| auto status = av_frame_get_buffer(avFrame.get(), 0); | ||
| TORCH_CHECK( | ||
| status == AVSUCCESS, | ||
| "Couldn't allocate avFrame's buffers: ", | ||
| getFFMPEGErrorStringFromErrorCode(status)); | ||
|
|
||
| AutoAVPacket autoAVPacket; | ||
|
|
||
|
|
@@ -257,19 +264,13 @@ void AudioEncoder::encode() { | |
| int numBytesPerSample = static_cast<int>(samples_.element_size()); | ||
| int numBytesPerChannel = numSamples * numBytesPerSample; | ||
|
|
||
| status = avformat_write_header(avFormatContext_.get(), nullptr); | ||
| auto status = avformat_write_header(avFormatContext_.get(), nullptr); | ||
| TORCH_CHECK( | ||
| status == AVSUCCESS, | ||
| "Error in avformat_write_header: ", | ||
| getFFMPEGErrorStringFromErrorCode(status)); | ||
|
|
||
| while (numEncodedSamples < numSamples) { | ||
| status = av_frame_make_writable(avFrame.get()); | ||
| TORCH_CHECK( | ||
| status == AVSUCCESS, | ||
| "Couldn't make AVFrame writable: ", | ||
| getFFMPEGErrorStringFromErrorCode(status)); | ||
|
|
||
| int numSamplesToEncode = | ||
| std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples); | ||
| int numBytesToEncode = numSamplesToEncode * numBytesPerSample; | ||
|
|
@@ -290,10 +291,9 @@ void AudioEncoder::encode() { | |
| avFrame->nb_samples = numSamplesToEncode; | ||
|
|
||
| UniqueAVFrame convertedAVFrame = maybeConvertAVFrame(avFrame); | ||
| encodeInnerLoop(autoAVPacket, convertedAVFrame); | ||
| encodeFrameThroughFifo(autoAVPacket, convertedAVFrame); | ||
|
|
||
| numEncodedSamples += numSamplesToEncode; | ||
| avFrame->pts += static_cast<int64_t>(numSamplesToEncode); | ||
| } | ||
| TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong."); | ||
|
|
||
|
|
@@ -309,7 +309,8 @@ void AudioEncoder::encode() { | |
| UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) { | ||
| if (static_cast<AVSampleFormat>(avFrame->format) == | ||
| avCodecContext_->sample_fmt && | ||
| getNumChannels(avFrame) == outNumChannels_) { | ||
| getNumChannels(avFrame) == outNumChannels_ && | ||
| avFrame->sample_rate == outSampleRate_) { | ||
| // Note: the clone references the same underlying data, it's a cheap copy. | ||
| return UniqueAVFrame(av_frame_clone(avFrame.get())); | ||
| } | ||
|
|
@@ -318,31 +319,84 @@ UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) { | |
| swrContext_.reset(createSwrContext( | ||
| static_cast<AVSampleFormat>(avFrame->format), | ||
| avCodecContext_->sample_fmt, | ||
| avFrame->sample_rate, // No sample rate conversion | ||
| avFrame->sample_rate, | ||
| outSampleRate_, | ||
| avFrame, | ||
| outNumChannels_)); | ||
| } | ||
| UniqueAVFrame convertedAVFrame = convertAudioAVFrameSamples( | ||
| swrContext_, | ||
| avFrame, | ||
| avCodecContext_->sample_fmt, | ||
| avFrame->sample_rate, // No sample rate conversion | ||
| outSampleRate_, | ||
| outNumChannels_); | ||
|
|
||
| if (avFrame->sample_rate == outSampleRate_) { | ||
| TORCH_CHECK( | ||
| convertedAVFrame->nb_samples == avFrame->nb_samples, | ||
| "convertedAVFrame->nb_samples=", | ||
| convertedAVFrame->nb_samples, | ||
| " differs from ", | ||
| "avFrame->nb_samples=", | ||
| avFrame->nb_samples, | ||
| "This is unexpected, please report on the TorchCodec bug tracker."); | ||
| } | ||
| return convertedAVFrame; | ||
| } | ||
|
|
||
| void AudioEncoder::encodeFrameThroughFifo( | ||
| AutoAVPacket& autoAVPacket, | ||
| const UniqueAVFrame& avFrame, | ||
| bool andFlushFifo) { | ||
| if (avAudioFifo_ == nullptr) { | ||
| encodeFrame(autoAVPacket, avFrame); | ||
| return; | ||
| } | ||
| int numSamplesWritten = av_audio_fifo_write( | ||
| avAudioFifo_.get(), | ||
| reinterpret_cast<void**>(avFrame->data), | ||
| avFrame->nb_samples); | ||
| TORCH_CHECK( | ||
| convertedAVFrame->nb_samples == avFrame->nb_samples, | ||
| "convertedAVFrame->nb_samples=", | ||
| convertedAVFrame->nb_samples, | ||
| " differs from ", | ||
| "avFrame->nb_samples=", | ||
| numSamplesWritten == avFrame->nb_samples, | ||
| "Tried to write ", | ||
| avFrame->nb_samples, | ||
| "This is unexpected, please report on the TorchCodec bug tracker."); | ||
| return convertedAVFrame; | ||
| " samples, but only wrote ", | ||
| numSamplesWritten); | ||
|
|
||
| UniqueAVFrame newavFrame = allocateAVFrame( | ||
| avCodecContext_->frame_size, | ||
| outSampleRate_, | ||
| outNumChannels_, | ||
| avCodecContext_->sample_fmt); | ||
|
|
||
| while (av_audio_fifo_size(avAudioFifo_.get()) >= | ||
| (andFlushFifo ? 1 : avCodecContext_->frame_size)) { | ||
| int samplesToRead = std::min( | ||
| av_audio_fifo_size(avAudioFifo_.get()), newavFrame->nb_samples); | ||
| int numSamplesRead = av_audio_fifo_read( | ||
| avAudioFifo_.get(), | ||
| reinterpret_cast<void**>(newavFrame->data), | ||
| samplesToRead); | ||
| TORCH_CHECK( | ||
| numSamplesRead == samplesToRead, | ||
| "Tried to read ", | ||
| samplesToRead, | ||
| " samples, but only read ", | ||
| numSamplesRead); | ||
|
|
||
| newavFrame->nb_samples = numSamplesRead; | ||
| encodeFrame(autoAVPacket, newavFrame); | ||
| } | ||
| } | ||
|
|
||
| void AudioEncoder::encodeInnerLoop( | ||
| void AudioEncoder::encodeFrame( | ||
| AutoAVPacket& autoAVPacket, | ||
| const UniqueAVFrame& avFrame) { | ||
| if (avFrame != nullptr) { | ||
| avFrame->pts = lastEncodedAVFramePts_; | ||
| lastEncodedAVFramePts_ += avFrame->nb_samples; | ||
| } | ||
|
|
||
| auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get()); | ||
| TORCH_CHECK( | ||
| status == AVSUCCESS, | ||
|
|
@@ -381,11 +435,39 @@ void AudioEncoder::encodeInnerLoop( | |
| } | ||
| } | ||
|
|
||
| void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) { | ||
| // Similar to the decoder's method with the same name, but for encoding this | ||
| // time. That is, when sample conversion is involved, libswresample may have | ||
| // buffered some samples that we now need to flush and send to the encoder. | ||
| if (swrContext_ == nullptr && inSampleRate_ == outSampleRate_) { | ||
| return; | ||
| } | ||
| TORCH_CHECK( | ||
| swrContext_ != nullptr, | ||
| "swrContext is null, but sample rate conversion is needed. ", | ||
| "This is unexpected, please report on the TorchCodec bug tracker."); | ||
|
|
||
| int numRemainingSamples = // this is an upper bound | ||
| swr_get_out_samples(swrContext_.get(), 0); | ||
| if (numRemainingSamples == 0) { | ||
| return; | ||
| } | ||
|
|
||
| UniqueAVFrame avFrame = allocateAVFrame( | ||
| numRemainingSamples, | ||
| outSampleRate_, | ||
| outNumChannels_, | ||
| avCodecContext_->sample_fmt); | ||
| int actualNumRemainingSamples = swr_convert( | ||
| swrContext_.get(), avFrame->data, avFrame->nb_samples, NULL, 0); | ||
| avFrame->nb_samples = actualNumRemainingSamples; | ||
|
|
||
| encodeFrameThroughFifo(autoAVPacket, avFrame, /*andFlushFifo=*/true); | ||
|
||
| } | ||
|
|
||
| void AudioEncoder::flushBuffers() { | ||
| // We flush the main FFmpeg buffers, but not swresample buffers. Flushing | ||
| // swresample is only necessary when converting sample rates, which we don't | ||
| // do for encoding. | ||
| AutoAVPacket autoAVPacket; | ||
| encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr)); | ||
| maybeFlushSwrBuffers(autoAVPacket); | ||
| encodeFrame(autoAVPacket, UniqueAVFrame(nullptr)); | ||
| } | ||
| } // namespace facebook::torchcodec | ||
Uh oh!
There was an error while loading. Please reload this page.