Pass the waveform to the Encoder constructor and add encoding tests

NicolasHug · NicolasHug · commit 3cec7611b887 · 2025-04-02T14:03:11.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -12,14 +12,19 @@ Encoder::~Encoder() {}
 
 // TODO-ENCODING: disable ffmpeg logs by default
 
-Encoder::Encoder(int sampleRate, std::string_view fileName)
-    : sampleRate_(sampleRate) {
+Encoder::Encoder(
+    const torch::Tensor wf,
+    int sampleRate,
+    std::string_view fileName)
+    : wf_(wf), sampleRate_(sampleRate) {
   AVFormatContext* avFormatContext = nullptr;
   avformat_alloc_output_context2(
       &avFormatContext, nullptr, nullptr, fileName.data());
   TORCH_CHECK(avFormatContext != nullptr, "Couldn't allocate AVFormatContext.");
   avFormatContext_.reset(avFormatContext);
 
+  // TODO-ENCODING: Should also support encoding into bytes (use
+  // AVIOBytesContext)
   TORCH_CHECK(
       !(avFormatContext->oformat->flags & AVFMT_NOFILE),
       "AVFMT_NOFILE is set. We only support writing to a file.");
@@ -31,7 +36,7 @@ Encoder::Encoder(int sampleRate, std::string_view fileName)
       getFFMPEGErrorStringFromErrorCode(status));
 
   // We use the AVFormatContext's default codec for that
-  // specificavcodec_parameters_from_context format/container.
+  // specific format/container.
   const AVCodec* avCodec =
       avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
   TORCH_CHECK(avCodec != nullptr, "Codec not found");
@@ -40,9 +45,10 @@ Encoder::Encoder(int sampleRate, std::string_view fileName)
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
   avCodecContext_.reset(avCodecContext);
 
-  // This will use the default bit rate
-  // TODO-ENCODING Should let user choose for compressed formats like mp3.
-    // avCodecContext_->bit_rate = 64000;
+  // TODO-ENCODING I think this sets the bit rate to the minimum supported.
+  // That's not what the ffmpeg CLI would choose by default, so we should try to
+  // match the CLI's default bit rate instead.
+  // TODO-ENCODING Should also let user choose for compressed formats like mp3.
   avCodecContext_->bit_rate = 0;
 
   // FFmpeg will raise a reasonably informative error if the desired sample rate
@@ -58,8 +64,19 @@ Encoder::Encoder(int sampleRate, std::string_view fileName)
   // libswresample.
   avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
+  auto numChannels = wf_.sizes()[0];
+  TORCH_CHECK(
+      // TODO-ENCODING is this even true / needed? We can probably support more
+      // with non-planar data?
+      numChannels <= AV_NUM_DATA_POINTERS,
+      "Trying to encode ",
+      numChannels,
+      " channels, but FFmpeg only supports ",
+      AV_NUM_DATA_POINTERS,
+      " channels per frame.");
+
   AVChannelLayout channel_layout;
-  av_channel_layout_default(&channel_layout, 2);
+  av_channel_layout_default(&channel_layout, numChannels);
   avCodecContext_->ch_layout = channel_layout;
 
   status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
@@ -79,7 +96,7 @@ Encoder::Encoder(int sampleRate, std::string_view fileName)
   avcodec_parameters_from_context(avStream_->codecpar, avCodecContext_.get());
 }
 
-void Encoder::encode(const torch::Tensor& wf) {
+void Encoder::encode() {
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
   avFrame->nb_samples = avCodecContext_->frame_size;
@@ -101,24 +118,13 @@ void Encoder::encode(const torch::Tensor& wf) {
 
   AutoAVPacket autoAVPacket;
 
-  uint8_t* pWf = static_cast<uint8_t*>(wf.data_ptr());
-  auto numChannels = wf.sizes()[0];
-  auto numSamples = wf.sizes()[1]; // per channel
+  uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
+  auto numSamples = wf_.sizes()[1]; // per channel
   auto numEncodedSamples = 0; // per channel
   auto numSamplesPerFrame =
       static_cast<long>(avCodecContext_->frame_size); // per channel
-  auto numBytesPerSample = wf.element_size();
-  auto numBytesPerChannel = wf.sizes()[1] * numBytesPerSample;
-
-  TORCH_CHECK(
-      // TODO-ENCODING is this even true / needed? We can probably support more
-      // with non-planar data?
-      numChannels <= AV_NUM_DATA_POINTERS,
-      "Trying to encode ",
-      numChannels,
-      " channels, but FFmpeg only supports ",
-      AV_NUM_DATA_POINTERS,
-      " channels per frame.");
+  auto numBytesPerSample = wf_.element_size();
+  auto numBytesPerChannel = numSamples * numBytesPerSample;
 
   status = avformat_write_header(avFormatContext_.get(), nullptr);
   TORCH_CHECK(
@@ -136,16 +142,22 @@ void Encoder::encode(const torch::Tensor& wf) {
     auto numSamplesToEncode =
         std::min(numSamplesPerFrame, numSamples - numEncodedSamples);
     auto numBytesToEncode = numSamplesToEncode * numBytesPerSample;
-    avFrame->nb_samples = std::min(static_cast<int64_t>(avCodecContext_->frame_size), numSamplesToEncode);
 
-    for (int ch = 0; ch < numChannels; ch++) {
+    for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
       memcpy(
-          avFrame->data[ch], pWf + ch * numBytesPerChannel, numBytesToEncode);
+          avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
     }
-    pWf += numBytesToEncode;
+    pwf += numBytesToEncode;
+
+    // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
+    // that the frame buffers are allocated to a big enough size. Here, we reset
+    // it to the exact number of samples that need to be encoded, otherwise the
+    // encoded frame would contain more samples than necessary and our results
+    // wouldn't match the ffmpeg CLI.
+    avFrame->nb_samples = numSamplesToEncode;
     encode_inner_loop(autoAVPacket, avFrame);
 
-    avFrame->pts += avFrame->nb_samples;
+    avFrame->pts += numSamplesToEncode;
     numEncodedSamples += numSamplesToEncode;
   }
   TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
@@ -163,11 +175,6 @@ void Encoder::encode_inner_loop(
     AutoAVPacket& autoAVPacket,
     const UniqueAVFrame& avFrame) {
   auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
-//   if (avFrame.get()) {
-//     printf("Sending frame with %d samples\n", avFrame->nb_samples);
-//   } else {
-//     printf("Flushing\n");
-//   }
   TORCH_CHECK(
       status == AVSUCCESS,
       "Error while sending frame: ",
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -7,12 +7,8 @@ class Encoder {
  public:
   ~Encoder();
 
-  // TODO Are we OK passing a string_view to the constructor?
-  // TODO fileName should be optional.
-  // TODO doesn't make much sense to pass fileName and the wf tensor in 2
-  // different calls. Same with sampleRate.
-  Encoder(int sampleRate, std::string_view fileName);
-  void encode(const torch::Tensor& wf);
+  Encoder(const torch::Tensor wf, int sampleRate, std::string_view fileName);
+  void encode();
 
  private:
   void encode_inner_loop(
@@ -31,5 +27,6 @@ class Encoder {
   // resample the waveform internally to match them, but that's not in scope for
   // an initial version (if at all).
   int sampleRate_;
+  const torch::Tensor wf_;
 };
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -28,8 +28,8 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.impl_abstract_pystub(
       "torchcodec._core.ops", "//pytorch/torchcodec:torchcodec");
   m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
-  m.def("create_encoder(int sample_rate, str filename) -> Tensor");
-  m.def("encode(Tensor(a!) encoder, Tensor wf) -> ()");
+  m.def("create_encoder(Tensor wf, int sample_rate, str filename) -> Tensor");
+  m.def("encode(Tensor(a!) encoder) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
@@ -194,15 +194,18 @@ at::Tensor create_from_file(
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 
-at::Tensor create_encoder(int64_t sample_rate, std::string_view file_name) {
+at::Tensor create_encoder(
+    const at::Tensor wf,
+    int64_t sample_rate,
+    std::string_view file_name) {
   std::unique_ptr<Encoder> uniqueEncoder =
-      std::make_unique<Encoder>(static_cast<int>(sample_rate), file_name);
+      std::make_unique<Encoder>(wf, static_cast<int>(sample_rate), file_name);
   return wrapEncoderPointerToTensor(std::move(uniqueEncoder));
 }
 
-void encode(at::Tensor& encoder, const at::Tensor& wf) {
+void encode(at::Tensor& encoder) {
   auto encoder_ = unwrapTensorToGetEncoder(encoder);
-  encoder_->encode(wf);
+  encoder_->encode();
 }
 
 // Create a VideoDecoder from the actual bytes of a video and wrap the pointer
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -160,12 +160,14 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
 
 
 @register_fake("torchcodec_ns::create_encoder")
-def create_encoder_abstract(sample_rate: int, filename: str) -> torch.Tensor:
+def create_encoder_abstract(
+    wf: torch.Tensor, sample_rate: int, filename: str
+) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
 
 @register_fake("torchcodec_ns::encode")
-def encode_abstract(encoder: torch.Tensor, wf: torch.Tensor) -> torch.Tensor:
-    return torch.empty([], dtype=torch.long)
+def encode_abstract(encoder: torch.Tensor) -> None:
+    return None  # fake impl must match the op schema `encode(...) -> ()`: no output
 
 
diff --git a/test/decoders/test_ops.py b/test/decoders/test_ops.py
@@ -940,48 +940,55 @@ def decode(self, source) -> torch.Tensor:
         )
         return frames
 
-    # def test_round_trip(self, tmp_path):
-    #     asset = NASA_AUDIO_MP3
-
-    #     encoded_path = tmp_path / "output.mp3"
-    #     encoder = create_encoder(
-    #         sample_rate=asset.sample_rate, filename=str(encoded_path)
-    #     )
-
-    #     source_samples = self.decode(asset)
-    #     encode(encoder, source_samples)
+    def test_round_trip(self, tmp_path):
+        # Check that decode(encode(samples)) == samples
+        asset = NASA_AUDIO_MP3
+        source_samples = self.decode(asset)
 
-    #     torch.testing.assert_close(self.decode(encoded_path), source_samples)
+        encoded_path = tmp_path / "output.mp3"
+        encoder = create_encoder(
+            wf=source_samples, sample_rate=asset.sample_rate, filename=str(encoded_path)
+        )
+        encode(encoder)
 
-    def test_against_cli(self, tmp_path):
+        # TODO-ENCODING: tol should be stricter. We need to increase the encoded
+        # bitrate, and / or encode into a lossless format.
+        torch.testing.assert_close(
+            self.decode(encoded_path), source_samples, rtol=0, atol=0.07
+        )
 
-        asset = NASA_AUDIO_MP3
+    # TODO-ENCODING: test more encoding formats
+    @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
+    def test_against_cli(self, asset, tmp_path):
+        # Encodes samples with our encoder and with the FFmpeg CLI, and checks
+        # that both decoded outputs are equal
 
         encoded_by_ffmpeg = tmp_path / "ffmpeg_output.mp3"
         encoded_by_us = tmp_path / "our_output.mp3"
 
-        command = [
-            "ffmpeg",
-            "-i",
-            str(asset.path),
-            # '-vn',
-            # '-ar', '16000',    # Set audio sampling rate
-            # '-ac', '2',        # Set number of audio channels
-            # '-b:a', '192k',    # Set audio bitrate
-            '-b:a', '0',    # Set audio bitrate
-            str(encoded_by_ffmpeg),
-        ]
-        subprocess.run(command, check=True)
+        subprocess.run(
+            [
+                "ffmpeg",
+                "-i",
+                str(asset.path),
+                "-b:a",
+                "0",  # bitrate hardcoded to 0, see corresponding TODO.
+                str(encoded_by_ffmpeg),
+            ],
+            capture_output=True,
+            check=True,
+        )
 
         encoder = create_encoder(
-            sample_rate=asset.sample_rate, filename=str(encoded_by_us)
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            filename=str(encoded_by_us),
         )
+        encode(encoder)
 
-        encode(encoder, self.decode(asset))
-
-        from_ffmpeg = self.decode(encoded_by_ffmpeg)
-        from_us = self.decode(encoded_by_us)
-        torch.testing.assert_close(from_us, from_ffmpeg)
+        torch.testing.assert_close(
+            self.decode(encoded_by_ffmpeg), self.decode(encoded_by_us)
+        )
 
 
 if __name__ == "__main__":