Fix mp3 tests

NicolasHug · NicolasHug · commit 83c75b529ff3 · 2025-04-11T13:58:55.000+01:00
diff --git a/src/torchcodec/_core/AVIOBytesContext.cpp b/src/torchcodec/_core/AVIOBytesContext.cpp
@@ -69,7 +69,7 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
 
 AVIOToTensorContext::AVIOToTensorContext()
     : dataContext_{torch::empty({OUTPUT_TENSOR_SIZE}, {torch::kUInt8}), 0} {
-  createAVIOContext(nullptr, &write, nullptr, &dataContext_);
+  createAVIOContext(nullptr, &write, &seek, &dataContext_);
 }
 
 // The signature of this function is defined by FFMPEG.
@@ -84,6 +84,26 @@ int AVIOToTensorContext::write(void* opaque, uint8_t* buf, int buf_size) {
   return buf_size;
 }
 
+// Seek callback for the tensor-backed output. The signature of this function is defined by FFMPEG.
+int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
+  auto dataContext = static_cast<DataContext*>(opaque); // opaque is the DataContext given at AVIOContext creation
+  int64_t ret = -1; // returned unchanged for any whence not handled below
+
+  switch (whence) {
+    case AVSEEK_SIZE: // size query: does not modify dataContext->current
+      ret = dataContext->outputTensor.numel(); // total element count of the output tensor, not the current position
+      break;
+    case SEEK_SET: // absolute positioning
+      dataContext->current = offset; // stored as-is; offset is not range-checked here
+      ret = offset; // report the new position back to the caller
+      break;
+    default: // other whence values are not supported; ret stays -1
+      break;
+  }
+
+  return ret;
+}
+
 torch::Tensor AVIOToTensorContext::getOutputTensor() {
   return dataContext_.outputTensor.narrow(
       /*dim=*/0, /*start=*/0, /*length=*/dataContext_.current);
diff --git a/src/torchcodec/_core/AVIOBytesContext.h b/src/torchcodec/_core/AVIOBytesContext.h
@@ -46,6 +46,7 @@ class AVIOToTensorContext : public AVIOContextHolder {
 
   static const int OUTPUT_TENSOR_SIZE = 5'000'000; // TODO-ENCODING handle this
   static int write(void* opaque, uint8_t* buf, int buf_size);
+  static int64_t seek(void* opaque, int64_t offset, int whence);
 
   DataContext dataContext_;
 };
diff --git a/src/torchcodec/_core/AVIOContextHolder.h b/src/torchcodec/_core/AVIOContextHolder.h
@@ -18,11 +18,11 @@ namespace facebook::torchcodec {
 //      UniqueAVIOContext, as the AVIOContext points to a buffer which must be
 //      freed.
 //   2. It is a base class for AVIOContext specializations. When specializing a
-//      AVIOContext, we need to provide:
-//        1. - For decoding: A read callback function and a seek callback
-//             function.
-//           - For encoding: A write callback function.
-//        2. A pointer to some context object that has the same lifetime as the
+//      AVIOContext, we need to provide four things:
+//        1. A read callback function, for decoding.
+//        2. A seek callback function, for decoding and encoding.
+//        3. A write callback function, for encoding.
+//        4. A pointer to some context object that has the same lifetime as the
 //           AVIOContext itself. This context object holds the custom state that
 //           tracks the custom behavior of reading, seeking and writing. It is
 //           provided upon AVIOContext creation and to the read, seek and
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -57,8 +57,6 @@ AudioEncoder::AudioEncoder(
   TORCH_CHECK(
       wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());
 
-  avioContextHolder_ = std::make_unique<AVIOToTensorContext>();
-
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = AVSUCCESS;
@@ -84,6 +82,7 @@ AudioEncoder::AudioEncoder(
         "avio_open failed: ",
         getFFMPEGErrorStringFromErrorCode(status));
   } else {
+    avioContextHolder_ = std::make_unique<AVIOToTensorContext>();
     avFormatContext->pb = avioContextHolder_->getAVIOContext();
   }
 
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -1162,12 +1162,9 @@ def test_round_trip(self, encode_method, output_format, tmp_path):
 
     @pytest.mark.skipif(in_fbcode(), reason="TODO: enable ffmpeg CLI")
     @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
-    @pytest.mark.parametrize(
-        "encode_method", (encode_audio_to_file, encode_audio_to_tensor)
-    )
     @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
     @pytest.mark.parametrize("output_format", ("mp3", "wav", "flac"))
-    def test_against_cli(self, asset, encode_method, bit_rate, output_format, tmp_path):
+    def test_against_cli(self, asset, bit_rate, output_format, tmp_path):
         # Encodes samples with our encoder and with the FFmpeg CLI, and checks
         # that both decoded outputs are equal
 
@@ -1186,24 +1183,14 @@ def test_against_cli(self, asset, encode_method, bit_rate, output_format, tmp_pa
             check=True,
         )
 
-        if encode_method is encode_audio_to_file:
-            encoded_by_us = tmp_path / f"our_output.{output_format}"
-            encode_audio_to_file(
-                wf=self.decode(asset),
-                sample_rate=asset.sample_rate,
-                filename=str(encoded_by_us),
-                bit_rate=bit_rate,
-            )
-        else:
-            encoded_by_us = encode_audio_to_tensor(
-                wf=self.decode(asset),
-                sample_rate=asset.sample_rate,
-                format=output_format,
-                bit_rate=bit_rate,
-            )
+        encoded_by_us = tmp_path / f"our_output.{output_format}"
+        encode_audio_to_file(
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            filename=str(encoded_by_us),
+            bit_rate=bit_rate,
+        )
 
-        if output_format == "mp3" and encode_method is encode_audio_to_tensor:
-            pytest.skip("TODO-ENCODING investigate, decoded lengths are slightly different")
         rtol, atol = (0, 1e-4) if output_format == "wav" else (None, None)
         torch.testing.assert_close(
             self.decode(encoded_by_ffmpeg),
@@ -1212,6 +1199,32 @@ def test_against_cli(self, asset, encode_method, bit_rate, output_format, tmp_pa
             atol=atol,
         )
 
+    @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
+    @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
+    @pytest.mark.parametrize("output_format", ("mp3", "wav", "flac"))
+    def test_tensor_against_file(self, asset, bit_rate, output_format, tmp_path):  # file and tensor encodings must decode alike
+        if get_ffmpeg_major_version() == 4 and output_format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        encoded_file = tmp_path / f"our_output.{output_format}"  # the file suffix carries the output format
+        encode_audio_to_file(  # first encoding: written to a file under tmp_path
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            filename=str(encoded_file),
+            bit_rate=bit_rate,
+        )
+
+        encoded_tensor = encode_audio_to_tensor(  # second encoding: same waveform and parameters, returned in memory
+            wf=self.decode(asset),
+            sample_rate=asset.sample_rate,
+            format=output_format,
+            bit_rate=bit_rate,
+        )
+
+        torch.testing.assert_close(  # no rtol/atol passed, so assert_close uses its defaults
+            self.decode(encoded_file), self.decode(encoded_tensor)
+        )
+
 
 if __name__ == "__main__":
     pytest.main()