Skip to content

Commit fa0856a

Browse files
committed
Add comments, address review
1 parent 00ca07e commit fa0856a

File tree

3 files changed

+33
-7
lines changed

3 files changed

+33
-7
lines changed

src/torchcodec/_core/Encoder.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,10 @@ UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
351351
void AudioEncoder::encodeFrameThroughFifo(
352352
AutoAVPacket& autoAVPacket,
353353
const UniqueAVFrame& avFrame,
354-
bool andFlushFifo) {
354+
// flushFifo is only set to true in maybeFlushSwrBuffers(), i.e. at the very
355+
// end of the encoding process when we're flushing buffers. We also want to
356+
// flush the FIFO so as to not leave any remaining samples in it.
357+
bool flushFifo) {
355358
if (avAudioFifo_ == nullptr) {
356359
encodeFrame(autoAVPacket, avFrame);
357360
return;
@@ -373,8 +376,20 @@ void AudioEncoder::encodeFrameThroughFifo(
373376
outNumChannels_,
374377
avCodecContext_->sample_fmt);
375378

379+
// Explaining the while bound:
380+
// - if we're not flushing the FIFO, i.e. in most cases, we want to pull
381+
// exactly `frame_size` samples from the FIFO, so we have to stop before it
382+
// contains less than `frame_size` samples.
383+
// - if we're flushing the FIFO, we want to read from the FIFO until the very
384+
// last sample it contains.
385+
//
386+
// In both cases, for as long as we can, we're trying to pull exatly
387+
// `frame_size` samples from the FIFO and send each `frame_size`-sized avFrame
388+
// to encodeFrame(). Only the very last avFrame of the encoding process is
389+
// allowed to contained less than frame_size samples. That only happens when
390+
// flushFifo is true.
376391
while (av_audio_fifo_size(avAudioFifo_.get()) >=
377-
(andFlushFifo ? 1 : avCodecContext_->frame_size)) {
392+
(flushFifo ? 1 : avCodecContext_->frame_size)) {
378393
int samplesToRead = std::min(
379394
av_audio_fifo_size(avAudioFifo_.get()), newavFrame->nb_samples);
380395
int numSamplesRead = av_audio_fifo_read(
@@ -466,7 +481,9 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
466481
swrContext_.get(), avFrame->data, avFrame->nb_samples, NULL, 0);
467482
avFrame->nb_samples = actualNumRemainingSamples;
468483

469-
encodeFrameThroughFifo(autoAVPacket, avFrame, /*andFlushFifo=*/true);
484+
// We're potentially sending avFrame through the FIFO (if it exists), in which
485+
// case we also want to flush the FIFO itself.
486+
encodeFrameThroughFifo(autoAVPacket, avFrame, /*flushFifo=*/true);
470487
}
471488

472489
void AudioEncoder::flushBuffers() {

src/torchcodec/_core/Encoder.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class AudioEncoder {
2929
void encodeFrameThroughFifo(
3030
AutoAVPacket& autoAVPacket,
3131
const UniqueAVFrame& avFrame,
32-
bool andFlushFifo = false);
32+
bool flushFifo = false);
3333
void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame);
3434
void maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket);
3535
void flushBuffers();
@@ -80,6 +80,15 @@ class AudioEncoder {
8080
// - the encoder expects a specific number of samples per AVFrame (fixed frame size)
8181
// This is not the case for all encoders, e.g. WAV doesn't care about frame size.
8282
//
83+
// Also, the FIFO is either:
84+
// - used for every single frame during the encoding process, or
85+
// - not used at all.
86+
// There is no scenario where a given Encoder() instance would sometimes use a
87+
// FIFO, sometimes not.
88+
//
89+
// Drawing made with https://asciiflow.com/, can be copy/pasted there if it
90+
// needs editing:
91+
//
8392
// ┌─One─iteration─of─main─encoding─loop─(encode())───────────────────────────────────────────┐
8493
// │ │
8594
// │ Converts: │
@@ -95,12 +104,12 @@ class AudioEncoder {
95104
// │ ▲ │
96105
// │ │ ┌─EncodeFrameThroughFifo()──────────────┐│
97106
// │ │ │ ││
98-
// │ AVFrame ──────► MaybeConvertAVFrame()───▲──│─┬──────────────┬──▲────►encodeFrame() ││
107+
// │ AVFrame ──────► MaybeConvertAVFrame()───▲──│─┬──► NO FIFO ─►┬──▲────►encodeFrame() ││
99108
// │ with │ │ │ │ │ ││
100109
// │ input │ │ │ │ │ ││
101110
// │ samples │ │ │ │ │ ││
102111
// │ │ │ │ │ │ ││
103-
// │ │ │ └────► FIFO ──┘ │ ││
112+
// │ │ │ └────► FIFO ──┘ │ ││
104113
// │ │ └───────────────────┼───────────────────┘│
105114
// └──────────────────────────────────────────────┼──────────────────────┼────────────────────┘
106115
// │ │

src/torchcodec/encoders/_audio_encoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class AudioEncoder:
1616
which case ``num_channels = 1`` is assumed. Values must be float
1717
values in ``[-1, 1]``.
1818
sample_rate (int): The sample rate of the **input** ``samples``. The
19-
sample rate of the necoded output can be specified using the
19+
sample rate of the encoded output can be specified using the
2020
encoding methods (``to_file``, etc.).
2121
"""
2222

0 commit comments

Comments
 (0)