Skip to content

Commit 51e80a3

Browse files
committed
Add comments
1 parent 5ef60d7 commit 51e80a3

File tree

2 files changed

+57
-0
lines changed

2 files changed

+57
-0
lines changed

src/torchcodec/_core/Encoder.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,9 @@ void AudioEncoder::initializeEncoder(
215215
getFFMPEGErrorStringFromErrorCode(status));
216216
streamIndex_ = avStream->index;
217217

218+
// If sample rate conversion is needed and the encoder doesn't support
219+
// variable frame size, we need to create an intermediate FIFO. See
220+
// Note: [Encoding loop, sample rate conversion and FIFO] in Encoder.h.
218221
if (((avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE) == 0) &&
219222
(inSampleRate_ != outSampleRate_)) {
220223
// frame_size * 2 is a decent default size. FFmpeg automatically

src/torchcodec/_core/Encoder.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,57 @@ class AudioEncoder {
6161
int64_t lastEncodedAVFramePts_ = 0;
6262
};
6363
} // namespace facebook::torchcodec
64+
65+
/* clang-format off */
66+
//
67+
// Note: [Encoding loop, sample rate conversion and FIFO]
68+
//
69+
// The input samples are in a given format, sample rate, and number of channels.
70+
// We may want to change these properties before encoding. The conversion is
71+
// done in maybeConvertAVFrame() and we rely on libswresample. When sample rate
72+
// conversion is needed, this means two things:
73+
// - swr will be storing samples in its internal buffers, which we'll need to
74+
// flush at the very end of the encoding process.
75+
// - the converted AVFrame we get back from maybeConvertAVFrame() typically
76+
// won't have the same number of samples as the original AVFrame. And that's
77+
// a problem, because some encoders expect AVFrames with a specific and
78+
// constant number of samples. If we were to send it as-is, we'd get an error
79+
// in avcodec_send_frame(). In order to feed the encoder with AVFrames
80+
// with the expected number of samples, we go through an intermediate FIFO
81+
// from which we can pull the exact number of samples that we need. Note that
82+
// this involves at least 2 additional copies.
83+
//
84+
// To be clear, the FIFO is only used if BOTH the following conditions are met:
85+
// - sample rate conversion is needed (inSampleRate_ != outSampleRate_)
86+
// - the encoder expects a specific number of samples per AVFrame (fixed frame size)
87+
// This is not the case for all encoders, e.g. WAV doesn't care about frame size.
88+
//
89+
// ┌─One─iteration─of─main─encoding─loop─(encode())───────────────────────────────────────────┐
90+
// │ │
91+
// │ Converts: │
92+
// │ - num channels │
93+
// │ - format │
94+
// │ - sample rate │
95+
// │ If sample rate, │
96+
// │ stores data in │
97+
// │ swr buffers │
98+
// │ which will need │
99+
// │ to be flushed │
100+
// │ │
101+
// │ ▲ │
102+
// │ │ ┌─encodeFrameThroughFifo()──────────────┐│
103+
// │ │ │ ││
104+
// │ AVFrame ──────► maybeConvertAVFrame()───▲──│─┬──────────────┬──▲────►encodeFrame() ││
105+
// │ with │ │ │ │ │ ││
106+
// │ input │ │ │ │ │ ││
107+
// │ samples │ │ │ │ │ ││
108+
// │ │ │ │ │ │ ││
109+
// │ │ │ └────► FIFO ───┘ │ ││
110+
// │ │ └───────────────────┼───────────────────┘│
111+
// └──────────────────────────────────────────────┼──────────────────────┼────────────────────┘
112+
// │ │
113+
// AVFrame from maybeFlushSwrBuffers() ───┘ │
114+
// Only if sample rate conversion was needed
115+
// nullptr, to flush
116+
// FFmpeg buffers
117+
/* clang-format on */

0 commit comments

Comments
 (0)