-
Notifications
You must be signed in to change notification settings - Fork 75
Encoding: support wav, flac etc. #630
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
7921558
2a19014
73bdc85
54f5543
24842b6
c3ac80a
5b39c8f
1f9f904
f525848
9150137
872b569
a0dcafd
f49d507
ee3a199
485ee2e
27fdbac
8467b92
9ab1bd1
67dba5a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -92,14 +92,13 @@ AudioEncoder::AudioEncoder( | |
| validateSampleRate(*avCodec, sampleRate); | ||
| avCodecContext_->sample_rate = sampleRate; | ||
|
|
||
| // Note: This is the format of the **input** waveform. This doesn't determine | ||
| // the output. | ||
| // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we | ||
| // may need to convert the wf into a supported output sample format, which is | ||
| // what the `.sample_fmt` defines. | ||
| avCodecContext_->sample_fmt = findOutputSampleFormat(*avCodec); | ||
|
|
||
| // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed | ||
| // planar. | ||
| // TODO-ENCODING If the encoder doesn't support FLTP (like flac), FFmpeg will | ||
| // raise. We need to handle this, probably converting the format with | ||
| // libswresample. | ||
| avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP; | ||
| // planar (fltp). | ||
|
|
||
| int numChannels = static_cast<int>(wf_.sizes()[0]); | ||
| TORCH_CHECK( | ||
|
|
@@ -120,12 +119,6 @@ AudioEncoder::AudioEncoder( | |
| "avcodec_open2 failed: ", | ||
| getFFMPEGErrorStringFromErrorCode(status)); | ||
|
|
||
| TORCH_CHECK( | ||
| avCodecContext_->frame_size > 0, | ||
| "frame_size is ", | ||
| avCodecContext_->frame_size, | ||
| ". Cannot encode. This should probably never happen?"); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `frame_size` won't always be non-zero after `avcodec_open2`, so this check has to go; see below.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interesting - where we turn a `frame_size` of 0 into our default, it might be worth adding a note that explains why it can be 0, citing the docs (https://ffmpeg.org/doxygen/6.0/structAVCodecContext.html#aec57f0d859a6df8b479cd93ca3a44a33, which I admit to not understanding). |
||
|
|
||
| // We're allocating the stream here. Streams are meant to be freed by | ||
| // avformat_free_context(avFormatContext), which we call in the | ||
| // avFormatContext_'s destructor. | ||
|
|
@@ -140,11 +133,37 @@ AudioEncoder::AudioEncoder( | |
| streamIndex_ = avStream->index; | ||
| } | ||
|
|
||
| AVSampleFormat AudioEncoder::findOutputSampleFormat(const AVCodec& avCodec) { | ||
| // Find a sample format that the encoder supports. If FLTP is supported then | ||
| // we use that, since this is the expected format of the input waveform. | ||
| // Otherwise, we'll need to convert the waveform before passing it to the | ||
| // encoder. Right now, the output format we'll choose is just the first format | ||
| // in the `sample_fmts` list that the AVCodec defines. Eventually, we may | ||
| // allow the user to choose. | ||
| // TODO-ENCODING: a better default would probably be to choose the highest | ||
| // available precision | ||
| if (avCodec.sample_fmts == nullptr) { | ||
| // Can't really validate anything in this case, best we can do is hope that | ||
| // FLTP is supported by the encoder. If not, FFmpeg will raise. | ||
| return AV_SAMPLE_FMT_FLTP; | ||
| } | ||
|
|
||
| for (auto i = 0; avCodec.sample_fmts[i] != -1; ++i) { | ||
| if (avCodec.sample_fmts[i] == AV_SAMPLE_FMT_FLTP) { | ||
| return AV_SAMPLE_FMT_FLTP; | ||
| } | ||
| } | ||
| return avCodec.sample_fmts[0]; | ||
NicolasHug marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| void AudioEncoder::encode() { | ||
| UniqueAVFrame avFrame(av_frame_alloc()); | ||
| TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame."); | ||
| avFrame->nb_samples = avCodecContext_->frame_size; | ||
| avFrame->format = avCodecContext_->sample_fmt; | ||
| // Default to 256 like in torchaudio | ||
| int numSamplesAllocatedPerFrame = | ||
| avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256; | ||
| avFrame->nb_samples = numSamplesAllocatedPerFrame; | ||
| avFrame->format = AV_SAMPLE_FMT_FLTP; | ||
| avFrame->sample_rate = avCodecContext_->sample_rate; | ||
| avFrame->pts = 0; | ||
| setChannelLayout(avFrame, avCodecContext_); | ||
|
|
@@ -160,7 +179,6 @@ void AudioEncoder::encode() { | |
| uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr()); | ||
| int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel | ||
| int numEncodedSamples = 0; // per channel | ||
| int numSamplesPerFrame = avCodecContext_->frame_size; // per channel | ||
| int numBytesPerSample = static_cast<int>(wf_.element_size()); | ||
| int numBytesPerChannel = numSamples * numBytesPerSample; | ||
|
|
||
|
|
@@ -178,7 +196,7 @@ void AudioEncoder::encode() { | |
| getFFMPEGErrorStringFromErrorCode(status)); | ||
|
|
||
| int numSamplesToEncode = | ||
| std::min(numSamplesPerFrame, numSamples - numEncodedSamples); | ||
| std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples); | ||
| int numBytesToEncode = numSamplesToEncode * numBytesPerSample; | ||
|
|
||
| for (int ch = 0; ch < wf_.sizes()[0]; ch++) { | ||
|
|
@@ -211,7 +229,37 @@ void AudioEncoder::encode() { | |
|
|
||
| void AudioEncoder::encodeInnerLoop( | ||
| AutoAVPacket& autoAVPacket, | ||
| const UniqueAVFrame& avFrame) { | ||
| const UniqueAVFrame& srcAVFrame) { | ||
| bool mustConvert = | ||
| (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP && | ||
| srcAVFrame != nullptr); | ||
| UniqueAVFrame convertedAVFrame; | ||
| if (mustConvert) { | ||
| if (!swrContext_) { | ||
| swrContext_.reset(createSwrContext( | ||
| avCodecContext_, | ||
| AV_SAMPLE_FMT_FLTP, | ||
| avCodecContext_->sample_fmt, | ||
| srcAVFrame->sample_rate, // No sample rate conversion | ||
| srcAVFrame->sample_rate)); | ||
| } | ||
| convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate( | ||
| swrContext_, | ||
| srcAVFrame, | ||
| avCodecContext_->sample_fmt, | ||
| srcAVFrame->sample_rate, // No sample rate conversion | ||
| srcAVFrame->sample_rate); | ||
| TORCH_CHECK( | ||
| convertedAVFrame->nb_samples == srcAVFrame->nb_samples, | ||
| "convertedAVFrame->nb_samples=", | ||
| convertedAVFrame->nb_samples, | ||
| " differs from ", | ||
| "srcAVFrame->nb_samples=", | ||
| srcAVFrame->nb_samples, | ||
| "This is unexpected, please report on the TorchCodec bug tracker."); | ||
| } | ||
| const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame; | ||
|
|
||
| auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get()); | ||
| TORCH_CHECK( | ||
| status == AVSUCCESS, | ||
|
|
@@ -248,6 +296,9 @@ void AudioEncoder::encodeInnerLoop( | |
| } | ||
|
|
||
| void AudioEncoder::flushBuffers() { | ||
| // We flush the main FFmpeg buffers, but not swresample buffers. Flushing | ||
| // swresample is only necessary when converting sample rates, which we don't | ||
| // do for encoding. | ||
| AutoAVPacket autoAVPacket; | ||
| encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr)); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My original comment was wrong: that's not the format of the input waveform. It's the format of the input AVFrame that we pass to avcodec_send_frame(), and it needs to be a format that the codec supports.