@@ -61,3 +61,57 @@ class AudioEncoder {
6161 int64_t lastEncodedAVFramePts_ = 0 ;
6262};
6363} // namespace facebook::torchcodec
64+
65+ /* clang-format off */
66+ //
67+ // Note: [Encoding loop, sample rate conversion and FIFO]
68+ //
69+ // The input samples are in a given format, sample rate, and number of channels.
70+ // We may want to change these properties before encoding. The conversion is
71+ // done in maybeConvertAVFrame() and we rely on libswresample. When sample rate
72+ // conversion is needed, this means two things:
73+ // - swr will be storing samples in its internal buffers, which we'll need to
74+ //   flush at the very end of the encoding process.
75+ // - the converted AVFrame we get back from maybeConvertAVFrame() typically
76+ //   won't have the same number of samples as the original AVFrame. And that's
77+ //   a problem, because some encoders expect AVFrames with a specific and
78+ //   constant number of samples. If we were to send it as-is, we'd get an error
79+ //   in avcodec_send_frame(). In order to feed the encoder with AVFrames
80+ //   with the expected number of samples, we go through an intermediate FIFO
81+ //   from which we can pull the exact number of samples that we need. Note that
82+ //   this involves at least 2 additional copies.
83+ //
84+ // To be clear, the FIFO is only used if BOTH of the following conditions are met:
85+ // - sample rate conversion is needed (inSampleRate_ != outSampleRate_)
86+ // - the encoder expects a specific number of samples per AVFrame (fixed frame size)
87+ //   This is not the case for all encoders, e.g. WAV doesn't care about frame size.
88+ //
89+ // ┌─One─iteration─of─main─encoding─loop─(encode())───────────────────────────────────────────┐
90+ // │                                                                                          │
91+ // │                         Converts:                                                        │
92+ // │                         - num channels                                                   │
93+ // │                         - format                                                         │
94+ // │                         - sample rate                                                    │
95+ // │                         If sample rate,                                                  │
96+ // │                         stores data in                                                   │
97+ // │                         swr buffers                                                      │
98+ // │                         which will need                                                  │
99+ // │                         to be flushed                                                    │
100+ // │                                                                                          │
101+ // │                                ▲                                                         │
102+ // │                                │                ┌─encodeFrameThroughFifo()──────────────┐│
103+ // │                                │                │                                       ││
104+ // │   AVFrame ──────►    maybeConvertAVFrame()───▲──│─┬──────────────┬──▲────►encodeFrame() ││
105+ // │   with                                       │  │ │              │  │                   ││
106+ // │   input                                      │  │ │              │  │                   ││
107+ // │   samples                                    │  │ │              │  │                   ││
108+ // │                                              │  │ │              │  │                   ││
109+ // │                                              │  │ └────► FIFO ───┘  │                   ││
110+ // │                                              │  └───────────────────┼───────────────────┘│
111+ // └──────────────────────────────────────────────┼──────────────────────┼────────────────────┘
112+ //                                                │                      │
113+ //         AVFrame from maybeFlushSwrBuffers() ───┘                      │
114+ //         Only if sample rate conversion was needed
115+ //                                                               nullptr, to flush
116+ //                                                               FFmpeg buffers
117+ /* clang-format on */
0 commit comments