Add comments

NicolasHug · NicolasHug · commit 51e80a3cd0bf · 2025-07-04T19:33:02.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -215,6 +215,9 @@ void AudioEncoder::initializeEncoder(
       getFFMPEGErrorStringFromErrorCode(status));
   streamIndex_ = avStream->index;
 
+  // If sample rate conversion is needed and the encoder doesn't support
+  // variable frame size, we need to create an intermediate FIFO. See
+  // Note: [Encoding loop, sample rate conversion and FIFO] in Encoder.h.
   if (((avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE) == 0) &&
       (inSampleRate_ != outSampleRate_)) {
     // frame_size * 2 is a decent default size. FFmpeg automatically
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -61,3 +61,57 @@ class AudioEncoder {
   int64_t lastEncodedAVFramePts_ = 0;
 };
 } // namespace facebook::torchcodec
+
+/* clang-format off */
+//
+// Note: [Encoding loop, sample rate conversion and FIFO]
+//
+// The input samples are in a given format, sample rate, and number of channels.
+// We may want to change these properties before encoding. The conversion is
+// done in maybeConvertAVFrame() and we rely on libswresample. When sample rate
+// conversion is needed, this means two things:
+// - swr will be storing samples in its internal buffers, which we'll need to
+//   flush at the very end of the encoding process.
+// - the converted AVFrame we get back from maybeConvertAVFrame() typically
+//   won't have the same number of samples as the original AVFrame. And that's
+//   a problem, because some encoders expect AVFrames with a specific and
+//   constant number of samples. If we were to send it as-is, we'd get an error
+//   in avcodec_send_frame(). In order to feed the encoder with AVFrames
+//   with the expected number of samples, we go through an intermediate FIFO
+//   from which we can pull the exact number of samples that we need. Note that
+//   this involves at least 2 additional copies.
+//
+// To be clear, the FIFO is only used if BOTH of the following conditions are met:
+//  - sample rate conversion is needed (inSampleRate_ != outSampleRate_)
+//  - the encoder expects a specific number of samples per AVFrame (fixed frame size).
+//    This is not the case for all encoders, e.g. WAV doesn't care about frame size.
+//
+// ┌─One─iteration─of─main─encoding─loop─(encode())───────────────────────────────────────────┐
+// │                                                                                          │
+// │                        Converts:                                                         │
+// │                         - num channels                                                   │
+// │                         - format                                                         │
+// │                         - sample rate                                                    │
+// │                        If sample rate,                                                   │
+// │                        stores data in                                                    │
+// │                        swr buffers                                                       │
+// │                        which will need                                                   │
+// │                        to be flushed                                                     │
+// │                                                                                          │
+// │                               ▲                                                          │
+// │                               │                 ┌─encodeFrameThroughFifo()──────────────┐│
+// │                               │                 │                                       ││
+// │    AVFrame  ──────►  maybeConvertAVFrame()───▲──│─┬──────────────┬──▲────►encodeFrame() ││
+// │    with                                      │  │ │              │  │                   ││
+// │    input                                     │  │ │              │  │                   ││
+// │    samples                                   │  │ │              │  │                   ││
+// │                                              │  │ │              │  │                   ││
+// │                                              │  │ └────► FIFO ───┘  │                   ││
+// │                                              │  └───────────────────┼───────────────────┘│
+// └──────────────────────────────────────────────┼──────────────────────┼────────────────────┘
+//                                                │                      │
+//  AVFrame from  maybeFlushSwrBuffers()       ───┘                      │
+//  Only if sample rate conversion was needed
+//                                                                 nullptr, to flush
+//                                                                 FFmpeg buffers
+/* clang-format on */