Skip to content

Commit b2eed2f

Browse files
committed
Merge branch 'move-conversion-out' into encoding_sample_rate_lezzzgo
2 parents 3ce4612 + 6c91450 commit b2eed2f

File tree

3 files changed

+83
-76
lines changed

3 files changed

+83
-76
lines changed

src/torchcodec/_core/Encoder.cpp

Lines changed: 78 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -215,9 +215,6 @@ void AudioEncoder::initializeEncoder(
215215
status == AVSUCCESS,
216216
"avcodec_open2 failed: ",
217217
getFFMPEGErrorStringFromErrorCode(status));
218-
219-
bool supportsVariableFrameSize = avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE;
220-
printf("supportsVariableFrameSize = %d\n", supportsVariableFrameSize);
221218

222219
// We're allocating the stream here. Streams are meant to be freed by
223220
// avformat_free_context(avFormatContext), which we call in the
@@ -232,11 +229,19 @@ void AudioEncoder::initializeEncoder(
232229
getFFMPEGErrorStringFromErrorCode(status));
233230
streamIndex_ = avStream->index;
234231

235-
// frame_size * 2 is a decent default size. FFmpeg automatically re-allocates
236-
// the fifo if more space is needed.
237-
auto avAudioFifo = av_audio_fifo_alloc(avCodecContext_->sample_fmt, outNumChannels_, avCodecContext_->frame_size * 2);
238-
TORCH_CHECK(avAudioFifo!= nullptr, "Couldn't create AVAudioFifo.");
239-
avAudioFifo_.reset(avAudioFifo);
232+
// bool supportsVariableFrameSize =
233+
// avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE;
234+
// printf("supportsVariableFrameSize = %d\n", supportsVariableFrameSize);
235+
236+
// // frame_size * 2 is a decent default size. FFmpeg automatically
237+
// re-allocates
238+
// // the fifo if more space is needed.
239+
// auto avAudioFifo = av_audio_fifo_alloc(
240+
// avCodecContext_->sample_fmt,
241+
// outNumChannels_,
242+
// avCodecContext_->frame_size * 2);
243+
// TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo.");
244+
// avAudioFifo_.reset(avAudioFifo);
240245
}
241246

242247
torch::Tensor AudioEncoder::encodeToTensor() {
@@ -300,10 +305,13 @@ void AudioEncoder::encode() {
300305
// encoded frame would contain more samples than necessary and our results
301306
// wouldn't match the ffmpeg CLI.
302307
avFrame->nb_samples = numSamplesToEncode;
303-
encodeInnerLoop(autoAVPacket, avFrame);
304308

305-
avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
309+
UniqueAVFrame convertedAVFrame = maybeConvertAVFrame(avFrame);
310+
encodeInnerLoop(autoAVPacket, convertedAVFrame);
311+
306312
numEncodedSamples += numSamplesToEncode;
313+
// TODO-ENCODING set frame pts correctly, and test against it.
314+
// avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
307315
}
308316
TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
309317

@@ -316,67 +324,69 @@ void AudioEncoder::encode() {
316324
getFFMPEGErrorStringFromErrorCode(status));
317325
}
318326

319-
void AudioEncoder::encodeInnerLoop(
320-
AutoAVPacket& autoAVPacket,
321-
UniqueAVFrame& srcAVFrame,
322-
bool allowConvert) {
323-
// TODO: Probably makes more sense to move the conversion away? It shouldn't
324-
// be in inner loop in any case. We should also remove allowConvert.
325-
bool mustConvert =
326-
(allowConvert && srcAVFrame != nullptr &&
327-
(static_cast<AVSampleFormat>(srcAVFrame->format) !=
328-
avCodecContext_->sample_fmt ||
329-
getNumChannels(srcAVFrame) != outNumChannels_ ||
330-
srcAVFrame->sample_rate != outSampleRate_));
331-
332-
UniqueAVFrame convertedAVFrame;
333-
if (mustConvert) {
334-
if (!swrContext_) {
335-
swrContext_.reset(createSwrContext(
336-
AV_SAMPLE_FMT_FLTP,
337-
avCodecContext_->sample_fmt,
338-
srcAVFrame->sample_rate,
339-
outSampleRate_,
340-
srcAVFrame,
341-
outNumChannels_));
342-
}
343-
convertedAVFrame = convertAudioAVFrameSamples(
344-
swrContext_,
345-
srcAVFrame,
327+
UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
328+
if (static_cast<AVSampleFormat>(avFrame->format) ==
329+
avCodecContext_->sample_fmt &&
330+
getNumChannels(avFrame) == outNumChannels_ &&
331+
avFrame->sample_rate == outSampleRate_) {
332+
// Note: the clone references the same underlying data, it's a cheap copy.
333+
return UniqueAVFrame(av_frame_clone(avFrame.get()));
334+
}
335+
336+
if (!swrContext_) {
337+
swrContext_.reset(createSwrContext(
338+
static_cast<AVSampleFormat>(avFrame->format),
346339
avCodecContext_->sample_fmt,
340+
avFrame->sample_rate,
347341
outSampleRate_,
348-
outNumChannels_);
349-
if (outSampleRate_ == sampleRateInput_) {
350-
TORCH_CHECK(
351-
convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
352-
"convertedAVFrame->nb_samples=",
353-
convertedAVFrame->nb_samples,
354-
" differs from ",
355-
"srcAVFrame->nb_samples=",
356-
srcAVFrame->nb_samples,
357-
"This is unexpected, please report on the TorchCodec bug tracker.");
358-
}
342+
avFrame,
343+
outNumChannels_));
359344
}
360-
UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
361-
362-
if (avFrame != nullptr) {
363-
// TODO static cast
364-
int numSamplesWritten = av_audio_fifo_write(avAudioFifo_.get(), (void**)avFrame->data, avFrame->nb_samples);
365-
TORCH_CHECK(numSamplesWritten == avFrame->nb_samples, "Tried to write TODO");
366-
printf("Writing %d samples to fifo (size = %d)\n", avFrame->nb_samples, av_audio_fifo_size(avAudioFifo_.get()));
367-
368-
avFrame = allocateAVFrame(avCodecContext_->frame_size, outSampleRate_, outNumChannels_);
369-
// TODO cast
370-
int numSamplesRead = av_audio_fifo_read(avAudioFifo_.get(), (void**)avFrame->data, avFrame->nb_samples);
371-
printf("Read %d from fifo\n", numSamplesRead);
372-
TORCH_CHECK(numSamplesRead > 0, "Tried to read TODO");
345+
UniqueAVFrame convertedAVFrame = convertAudioAVFrameSamples(
346+
swrContext_,
347+
avFrame,
348+
avCodecContext_->sample_fmt,
349+
outSampleRate_,
350+
outNumChannels_);
351+
352+
if (avFrame->sample_rate == outSampleRate_) {
353+
TORCH_CHECK(
354+
convertedAVFrame->nb_samples == avFrame->nb_samples,
355+
"convertedAVFrame->nb_samples=",
356+
convertedAVFrame->nb_samples,
357+
" differs from ",
358+
"avFrame->nb_samples=",
359+
avFrame->nb_samples,
360+
"This is unexpected, please report on the TorchCodec bug tracker.");
373361
}
362+
return convertedAVFrame;
363+
}
374364

375-
if (avFrame != nullptr) {
376-
printf("Sending frame with %d samples\n", avFrame->nb_samples);
377-
} else{
378-
printf("AVFrame is empty\n");
379-
}
365+
void AudioEncoder::encodeInnerLoop(
366+
AutoAVPacket& autoAVPacket,
367+
const UniqueAVFrame& avFrame) {
368+
// if (avFrame != nullptr) {
369+
// // TODO static cast
370+
// int numSamplesWritten = av_audio_fifo_write(avAudioFifo_.get(),
371+
// (void**)avFrame->data, avFrame->nb_samples);
372+
// TORCH_CHECK(numSamplesWritten == avFrame->nb_samples, "Tried to write
373+
// TODO"); printf("Writing %d samples to fifo (size = %d)\n",
374+
// avFrame->nb_samples, av_audio_fifo_size(avAudioFifo_.get()));
375+
376+
// avFrame = allocateAVFrame(avCodecContext_->frame_size, outSampleRate_,
377+
// outNumChannels_);
378+
// // TODO cast
379+
// int numSamplesRead = av_audio_fifo_read(avAudioFifo_.get(),
380+
// (void**)avFrame->data, avFrame->nb_samples); printf("Read %d from
381+
// fifo\n", numSamplesRead); TORCH_CHECK(numSamplesRead > 0, "Tried to
382+
// read TODO");
383+
// }
384+
385+
// if (avFrame != nullptr) {
386+
// printf("Sending frame with %d samples\n", avFrame->nb_samples);
387+
// } else{
388+
// printf("AVFrame is empty\n");
389+
// }
380390
auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
381391
TORCH_CHECK(
382392
status == AVSUCCESS,
@@ -434,13 +444,12 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
434444
swrContext_.get(), avFrame->data, avFrame->nb_samples, NULL, 0);
435445
avFrame->nb_samples = actualNumRemainingSamples;
436446

437-
encodeInnerLoop(autoAVPacket, avFrame, false);
447+
encodeInnerLoop(autoAVPacket, avFrame);
438448
}
439449

440450
void AudioEncoder::flushBuffers() {
441451
AutoAVPacket autoAVPacket;
442452
maybeFlushSwrBuffers(autoAVPacket);
443-
auto zob = UniqueAVFrame(nullptr);
444-
encodeInnerLoop(autoAVPacket, zob);
453+
encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
445454
}
446455
} // namespace facebook::torchcodec

src/torchcodec/_core/Encoder.h

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,10 @@ class AudioEncoder {
3636

3737
private:
3838
void initializeEncoder(const AudioStreamOptions& audioStreamOptions);
39+
UniqueAVFrame maybeConvertAVFrame(const UniqueAVFrame& avFrame);
3940
void encodeInnerLoop(
4041
AutoAVPacket& autoAVPacket,
41-
UniqueAVFrame& srcAVFrame,
42-
bool allowConvert = true);
42+
const UniqueAVFrame& srcAVFrame);
4343
void maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket);
4444
void flushBuffers();
4545

@@ -55,7 +55,6 @@ class AudioEncoder {
5555
const torch::Tensor wf_;
5656
int sampleRateInput_ = -1;
5757

58-
5958
UniqueAVAudioFifo avAudioFifo_;
6059

6160
// Stores the AVIOContext for the output tensor buffer.

src/torchcodec/_core/FFMPEGCommon.h

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,14 +15,14 @@ extern "C" {
1515
#include <libavfilter/avfilter.h>
1616
#include <libavformat/avformat.h>
1717
#include <libavformat/avio.h>
18+
#include <libavutil/audio_fifo.h>
1819
#include <libavutil/avutil.h>
1920
#include <libavutil/dict.h>
2021
#include <libavutil/display.h>
2122
#include <libavutil/file.h>
2223
#include <libavutil/opt.h>
2324
#include <libavutil/pixfmt.h>
2425
#include <libavutil/version.h>
25-
#include <libavutil/audio_fifo.h>
2626
#include <libswresample/swresample.h>
2727
#include <libswscale/swscale.h>
2828
}
@@ -74,9 +74,8 @@ using UniqueSwsContext =
7474
std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
7575
using UniqueSwrContext =
7676
std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;
77-
using UniqueAVAudioFifo = std::unique_ptr<
78-
AVAudioFifo,
79-
Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
77+
using UniqueAVAudioFifo = std::
78+
unique_ptr<AVAudioFifo, Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
8079

8180
// These 2 classes share the same underlying AVPacket object. They are meant to
8281
// be used in tandem, like so:

0 commit comments

Comments
 (0)