Skip to content

Commit 6d7908f

Browse files
committed
Use intermediate FIFO, WIP
1 parent 3399b34 commit 6d7908f

File tree

2 files changed

+51
-35
lines changed

2 files changed

+51
-35
lines changed

src/torchcodec/_core/Encoder.cpp

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -96,14 +96,18 @@ AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
9696
return avCodec.sample_fmts[0];
9797
}
9898

99-
UniqueAVFrame allocateAVFrame(int numSamples, int sampleRate, int numChannels) {
99+
UniqueAVFrame allocateAVFrame(
100+
int numSamples,
101+
int sampleRate,
102+
int numChannels,
103+
AVSampleFormat sampleFormat) {
100104
auto avFrame = UniqueAVFrame(av_frame_alloc());
101105
TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
102106

103107
avFrame->nb_samples = numSamples;
104-
avFrame->format = AV_SAMPLE_FMT_FLTP;
105108
avFrame->sample_rate = sampleRate;
106109
av_channel_layout_default(&avFrame->ch_layout, numChannels);
110+
avFrame->format = sampleFormat;
107111
auto status = av_frame_get_buffer(avFrame.get(), 0);
108112
TORCH_CHECK(
109113
status == AVSUCCESS,
@@ -239,12 +243,12 @@ void AudioEncoder::initializeEncoder(
239243
// // frame_size * 2 is a decent default size. FFmpeg automatically
240244
// re-allocates
241245
// // the fifo if more space is needed.
242-
// auto avAudioFifo = av_audio_fifo_alloc(
243-
// avCodecContext_->sample_fmt,
244-
// outNumChannels_,
245-
// avCodecContext_->frame_size * 2);
246-
// TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo.");
247-
// avAudioFifo_.reset(avAudioFifo);
246+
auto avAudioFifo = av_audio_fifo_alloc(
247+
avCodecContext_->sample_fmt,
248+
outNumChannels_,
249+
avCodecContext_->frame_size * 2);
250+
TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo.");
251+
avAudioFifo_.reset(avAudioFifo);
248252
}
249253

250254
torch::Tensor AudioEncoder::encodeToTensor() {
@@ -268,7 +272,8 @@ void AudioEncoder::encode() {
268272
UniqueAVFrame avFrame = allocateAVFrame(
269273
numSamplesAllocatedPerFrame,
270274
sampleRateInput_,
271-
static_cast<int>(samples_.sizes()[0]));
275+
static_cast<int>(samples_.sizes()[0]),
276+
AV_SAMPLE_FMT_FLTP);
272277
avFrame->pts = 0;
273278

274279
AutoAVPacket autoAVPacket;
@@ -312,7 +317,34 @@ void AudioEncoder::encode() {
312317
avFrame->nb_samples = numSamplesToEncode;
313318

314319
UniqueAVFrame convertedAVFrame = maybeConvertAVFrame(avFrame);
315-
encodeInnerLoop(autoAVPacket, convertedAVFrame);
320+
// TODO static cast
321+
int numSamplesWritten = av_audio_fifo_write(
322+
avAudioFifo_.get(),
323+
(void**)convertedAVFrame->data,
324+
convertedAVFrame->nb_samples);
325+
TORCH_CHECK(
326+
numSamplesWritten == convertedAVFrame->nb_samples,
327+
"Tried to write TODO");
328+
329+
UniqueAVFrame newavFrame = allocateAVFrame(
330+
avCodecContext_->frame_size,
331+
outSampleRate_,
332+
outNumChannels_,
333+
avCodecContext_->sample_fmt);
334+
while (av_audio_fifo_size(avAudioFifo_.get()) >=
335+
avCodecContext_->frame_size) {
336+
337+
// TODO cast
338+
int numSamplesRead = av_audio_fifo_read(
339+
avAudioFifo_.get(), (void**)newavFrame->data, newavFrame->nb_samples);
340+
TORCH_CHECK(numSamplesRead > 0, "Tried to read TODO");
341+
342+
// UniqueAVFrame clonedFrame(av_frame_clone(newavFrame.get()));
343+
// UniqueAVFrame refFrame(av_frame_alloc());
344+
// av_frame_ref(refFrame.get(), newavFrame.get());
345+
346+
encodeInnerLoop(autoAVPacket, newavFrame);
347+
}
316348

317349
numEncodedSamples += numSamplesToEncode;
318350
// TODO-ENCODING set frame pts correctly, and test against it.
@@ -335,6 +367,7 @@ UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
335367
getNumChannels(avFrame) == outNumChannels_ &&
336368
avFrame->sample_rate == outSampleRate_) {
337369
// Note: the clone references the same underlying data, it's a cheap copy.
370+
TORCH_CHECK(false, "unexpected");
338371
return UniqueAVFrame(av_frame_clone(avFrame.get()));
339372
}
340373

@@ -370,28 +403,6 @@ UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
370403
void AudioEncoder::encodeInnerLoop(
371404
AutoAVPacket& autoAVPacket,
372405
const UniqueAVFrame& avFrame) {
373-
// if (avFrame != nullptr) {
374-
// // TODO static cast
375-
// int numSamplesWritten = av_audio_fifo_write(avAudioFifo_.get(),
376-
// (void**)avFrame->data, avFrame->nb_samples);
377-
// TORCH_CHECK(numSamplesWritten == avFrame->nb_samples, "Tried to write
378-
// TODO"); printf("Writing %d samples to fifo (size = %d)\n",
379-
// avFrame->nb_samples, av_audio_fifo_size(avAudioFifo_.get()));
380-
381-
// avFrame = allocateAVFrame(avCodecContext_->frame_size, outSampleRate_,
382-
// outNumChannels_);
383-
// // TODO cast
384-
// int numSamplesRead = av_audio_fifo_read(avAudioFifo_.get(),
385-
// (void**)avFrame->data, avFrame->nb_samples); printf("Read %d from
386-
// fifo\n", numSamplesRead); TORCH_CHECK(numSamplesRead > 0, "Tried to
387-
// read TODO");
388-
// }
389-
390-
// if (avFrame != nullptr) {
391-
// printf("Sending frame with %d samples\n", avFrame->nb_samples);
392-
// } else{
393-
// printf("AVFrame is empty\n");
394-
// }
395406
auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
396407
TORCH_CHECK(
397408
status == AVSUCCESS,
@@ -443,8 +454,11 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
443454
return;
444455
}
445456

446-
UniqueAVFrame avFrame =
447-
allocateAVFrame(numRemainingSamples, outSampleRate_, outNumChannels_);
457+
UniqueAVFrame avFrame = allocateAVFrame(
458+
numRemainingSamples,
459+
outSampleRate_,
460+
outNumChannels_,
461+
avCodecContext_->sample_fmt);
448462
int actualNumRemainingSamples = swr_convert(
449463
swrContext_.get(), avFrame->data, avFrame->nb_samples, NULL, 0);
450464
avFrame->nb_samples = actualNumRemainingSamples;
@@ -453,8 +467,10 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
453467
}
454468

455469
void AudioEncoder::flushBuffers() {
470+
printf("Flushing, there are %d samples in fifo\n", av_audio_fifo_size(avAudioFifo_.get()));
456471
AutoAVPacket autoAVPacket;
457472
maybeFlushSwrBuffers(autoAVPacket);
458473
encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
474+
printf("Done flushing, there are %d samples in fifo\n", av_audio_fifo_size(avAudioFifo_.get()));
459475
}
460476
} // namespace facebook::torchcodec

src/torchcodec/_core/Encoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class AudioEncoder {
3939
UniqueAVFrame maybeConvertAVFrame(const UniqueAVFrame& avFrame);
4040
void encodeInnerLoop(
4141
AutoAVPacket& autoAVPacket,
42-
const UniqueAVFrame& srcAVFrame);
42+
const UniqueAVFrame& avFrame);
4343
void maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket);
4444
void flushBuffers();
4545

0 commit comments

Comments
 (0)