Properly set frames' pts

NicolasHug · NicolasHug · commit 17340a6c96f4 · 2025-07-04T17:26:26.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -318,7 +318,6 @@ void AudioEncoder::encode() {
     encodeFrameThroughFifo(autoAVPacket, convertedAVFrame);
 
     numEncodedSamples += numSamplesToEncode;
-    avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
   }
   TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
 
@@ -405,6 +404,11 @@ void AudioEncoder::encodeFrameThroughFifo(
 void AudioEncoder::encodeFrame(
     AutoAVPacket& autoAVPacket,
     const UniqueAVFrame& avFrame) {
+  if (avFrame != nullptr) {
+    avFrame->pts = lastEncodedAVFramePts_;
+    lastEncodedAVFramePts_ += avFrame->nb_samples;
+  }
+
   auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
   TORCH_CHECK(
       status == AVSUCCESS,
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -58,5 +58,6 @@ class AudioEncoder {
   std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
 
   bool encodeWasCalled_ = false;
+  int64_t lastEncodedAVFramePts_ = 0;
 };
 } // namespace facebook::torchcodec
diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -229,7 +229,7 @@ def test_against_cli(
             ["ffmpeg", "-i", str(asset.path)]
             + (["-b:a", f"{bit_rate}"] if bit_rate is not None else [])
             + (["-ac", f"{num_channels}"] if num_channels is not None else [])
-            + (["-ar", f"{sample_rate}"] if sample_rate is not None else [])
+            + ["-ar", f"{sample_rate}"]
             + [
                 str(encoded_by_ffmpeg),
             ],
@@ -247,17 +247,19 @@ def test_against_cli(
         else:
             encoded_by_us = encoder.to_tensor(format=format, **params)
 
-        # captured = capfd.readouterr()
-        # if format == "wav":
-        #     assert "Timestamps are unset in a packet" not in captured.err
-        # if format == "mp3":
-        #     assert "Queue input is backward in time" not in captured.err
-        # if format in ("flac", "wav"):
-        #     assert "Encoder did not produce proper pts" not in captured.err
-        # if format in ("flac", "mp3"):
-        #     assert "Application provided invalid" not in captured.err
-
+        captured = capfd.readouterr()
         if format == "wav":
+            assert "Timestamps are unset in a packet" not in captured.err
+        if format == "mp3":
+            assert "Queue input is backward in time" not in captured.err
+        if format in ("flac", "wav"):
+            assert "Encoder did not produce proper pts" not in captured.err
+        if format in ("flac", "mp3"):
+            assert "Application provided invalid" not in captured.err
+
+        if sample_rate != asset.sample_rate:
+            rtol, atol = 0, 1e-3
+        elif format == "wav":
             rtol, atol = 0, 1e-4
         elif format == "mp3" and asset is SINE_MONO_S32 and num_channels == 2:
             # Not sure why, this one needs slightly higher tol. With default
@@ -268,7 +270,6 @@ def test_against_cli(
         else:
             rtol, atol = None, None
 
-        rtol, atol = 0, 1e-3
         samples_by_us = self.decode(encoded_by_us)
         samples_by_ffmpeg = self.decode(encoded_by_ffmpeg)
         torch.testing.assert_close(