Address some comments

NicolasHug · NicolasHug · commit 061c60f5848b · 2025-04-03T10:56:49.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -1,18 +1,13 @@
 #include "src/torchcodec/_core/Encoder.h"
 #include "torch/types.h"
 
-extern "C" {
-#include <libavcodec/avcodec.h>
-#include <libavformat/avformat.h>
-}
-
 namespace facebook::torchcodec {
 
-Encoder::~Encoder() {}
+AudioEncoder::~AudioEncoder() {}
 
 // TODO-ENCODING: disable ffmpeg logs by default
 
-Encoder::Encoder(
+AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
     std::string_view fileName)
@@ -24,21 +19,21 @@ Encoder::Encoder(
   TORCH_CHECK(
       wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());
   AVFormatContext* avFormatContext = nullptr;
-  avformat_alloc_output_context2(
+  auto status = avformat_alloc_output_context2(
       &avFormatContext, nullptr, nullptr, fileName.data());
   TORCH_CHECK(
       avFormatContext != nullptr,
       "Couldn't allocate AVFormatContext. ",
-      "Check the desired extension?");
+      "Check the desired extension? ",
+      getFFMPEGErrorStringFromErrorCode(status));
   avFormatContext_.reset(avFormatContext);
 
   // TODO-ENCODING: Should also support encoding into bytes (use
   // AVIOBytesContext)
   TORCH_CHECK(
       !(avFormatContext->oformat->flags & AVFMT_NOFILE),
       "AVFMT_NOFILE is set. We only support writing to a file.");
-  auto status =
-      avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
+  status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
   TORCH_CHECK(
       status >= 0,
       "avio_open failed: ",
@@ -85,7 +80,10 @@ Encoder::Encoder(
   setDefaultChannelLayout(avCodecContext_, numChannels);
 
   status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
-  TORCH_CHECK(status == AVSUCCESS, getFFMPEGErrorStringFromErrorCode(status));
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "avcodec_open2 failed: ",
+      getFFMPEGErrorStringFromErrorCode(status));
 
   TORCH_CHECK(
       avCodecContext_->frame_size > 0,
@@ -96,12 +94,18 @@ Encoder::Encoder(
   // We're allocating the stream here. Streams are meant to be freed by
   // avformat_free_context(avFormatContext), which we call in the
   // avFormatContext_'s destructor.
-  avStream_ = avformat_new_stream(avFormatContext_.get(), nullptr);
-  TORCH_CHECK(avStream_ != nullptr, "Couldn't create new stream.");
-  avcodec_parameters_from_context(avStream_->codecpar, avCodecContext_.get());
+  AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
+  TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
+  status = avcodec_parameters_from_context(
+      avStream->codecpar, avCodecContext_.get());
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "avcodec_parameters_from_context failed: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  streamIndex_ = avStream->index;
 }
 
-void Encoder::encode() {
+void AudioEncoder::encode() {
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
   avFrame->nb_samples = avCodecContext_->frame_size;
@@ -119,12 +123,11 @@ void Encoder::encode() {
   AutoAVPacket autoAVPacket;
 
   uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
-  auto numSamples = wf_.sizes()[1]; // per channel
-  auto numEncodedSamples = 0; // per channel
-  auto numSamplesPerFrame =
-      static_cast<long>(avCodecContext_->frame_size); // per channel
-  auto numBytesPerSample = wf_.element_size();
-  auto numBytesPerChannel = numSamples * numBytesPerSample;
+  int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
+  int numEncodedSamples = 0; // per channel
+  int numSamplesPerFrame = avCodecContext_->frame_size; // per channel
+  int numBytesPerSample = wf_.element_size();
+  int numBytesPerChannel = numSamples * numBytesPerSample;
 
   status = avformat_write_header(avFormatContext_.get(), nullptr);
   TORCH_CHECK(
@@ -139,12 +142,12 @@ void Encoder::encode() {
         "Couldn't make AVFrame writable: ",
         getFFMPEGErrorStringFromErrorCode(status));
 
-    auto numSamplesToEncode = std::min(
-        numSamplesPerFrame, static_cast<long>(numSamples - numEncodedSamples));
-    auto numBytesToEncode = numSamplesToEncode * numBytesPerSample;
+    int numSamplesToEncode =
+        std::min(numSamplesPerFrame, numSamples - numEncodedSamples);
+    int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
 
     for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
-      memcpy(
+      std::memcpy(
           avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
     }
     pwf += numBytesToEncode;
@@ -155,14 +158,14 @@ void Encoder::encode() {
     // encoded frame would contain more samples than necessary and our results
     // wouldn't match the ffmpeg CLI.
     avFrame->nb_samples = numSamplesToEncode;
-    encode_inner_loop(autoAVPacket, avFrame);
+    encodeInnerLoop(autoAVPacket, avFrame);
 
-    avFrame->pts += numSamplesToEncode;
+    avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
     numEncodedSamples += numSamplesToEncode;
   }
   TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
 
-  encode_inner_loop(autoAVPacket, UniqueAVFrame(nullptr)); // flush
+  flushBuffers();
 
   status = av_write_trailer(avFormatContext_.get());
   TORCH_CHECK(
@@ -171,7 +174,7 @@ void Encoder::encode() {
       getFFMPEGErrorStringFromErrorCode(status));
 }
 
-void Encoder::encode_inner_loop(
+void AudioEncoder::encodeInnerLoop(
     AutoAVPacket& autoAVPacket,
     const UniqueAVFrame& avFrame) {
   auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
@@ -199,10 +202,7 @@ void Encoder::encode_inner_loop(
         "Error receiving packet: ",
         getFFMPEGErrorStringFromErrorCode(status));
 
-    // TODO-ENCODING why are these 2 lines needed??
-    av_packet_rescale_ts(
-        packet.get(), avCodecContext_->time_base, avStream_->time_base);
-    packet->stream_index = avStream_->index;
+    packet->stream_index = streamIndex_;
 
     status = av_interleaved_write_frame(avFormatContext_.get(), packet.get());
     TORCH_CHECK(
@@ -211,4 +211,9 @@ void Encoder::encode_inner_loop(
         getFFMPEGErrorStringFromErrorCode(status));
   }
 }
+
+void AudioEncoder::flushBuffers() {
+  AutoAVPacket autoAVPacket;
+  encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
+}
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -3,21 +3,25 @@
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 
 namespace facebook::torchcodec {
-class Encoder {
+class AudioEncoder {
  public:
-  ~Encoder();
+  ~AudioEncoder();
 
-  Encoder(const torch::Tensor wf, int sampleRate, std::string_view fileName);
+  AudioEncoder(
+      const torch::Tensor wf,
+      int sampleRate,
+      std::string_view fileName);
   void encode();
 
  private:
-  void encode_inner_loop(
+  void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
       const UniqueAVFrame& avFrame);
+  void flushBuffers();
 
-  UniqueAVFormatContextForEncoding avFormatContext_;
+  UniqueEncodingAVFormatContext avFormatContext_;
   UniqueAVCodecContext avCodecContext_;
-  AVStream* avStream_;
+  int streamIndex_;
 
   const torch::Tensor wf_;
   // The *output* sample rate. We can't really decide for the user what it
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -50,10 +50,10 @@ struct Deleter {
 };
 
 // Unique pointers for FFMPEG structures.
-using UniqueAVFormatContextForDecoding = std::unique_ptr<
+using UniqueDecodingAVFormatContext = std::unique_ptr<
     AVFormatContext,
     Deleterp<AVFormatContext, void, avformat_close_input>>;
-using UniqueAVFormatContextForEncoding = std::unique_ptr<
+using UniqueEncodingAVFormatContext = std::unique_ptr<
     AVFormatContext,
     Deleter<AVFormatContext, void, avformat_free_context>>;
 using UniqueAVCodecContext = std::unique_ptr<
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1443,7 +1443,7 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
     auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
     for (auto channel = 0; channel < numChannels;
          ++channel, outputChannelData += numBytesPerChannel) {
-      memcpy(
+      std::memcpy(
           outputChannelData,
           avFrame->extended_data[channel],
           numBytesPerChannel);
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -492,7 +492,7 @@ class SingleStreamDecoder {
 
   SeekMode seekMode_;
   ContainerMetadata containerMetadata_;
-  UniqueAVFormatContextForDecoding formatContext_;
+  UniqueDecodingAVFormatContext formatContext_;
   std::map<int, StreamInfo> streamInfos_;
   const int NO_ACTIVE_STREAM = -2;
   int activeStreamIndex_ = NO_ACTIVE_STREAM;
diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py
@@ -18,12 +18,12 @@
     _test_frame_pts_equality,
     add_audio_stream,
     add_video_stream,
-    create_encoder,
+    create_audio_encoder,
     create_from_bytes,
     create_from_file,
     create_from_file_like,
     create_from_tensor,
-    encode,
+    encode_audio,
     get_ffmpeg_library_versions,
     get_frame_at_index,
     get_frame_at_pts,
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -28,8 +28,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.impl_abstract_pystub(
       "torchcodec._core.ops", "//pytorch/torchcodec:torchcodec");
   m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
-  m.def("create_encoder(Tensor wf, int sample_rate, str filename) -> Tensor");
-  m.def("encode(Tensor(a!) encoder) -> ()");
+  m.def(
+      "create_audio_encoder(Tensor wf, int sample_rate, str filename) -> Tensor");
+  m.def("encode_audio(Tensor(a!) encoder) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
@@ -384,35 +385,42 @@ OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
   return makeOpsAudioFramesOutput(result);
 }
 
-at::Tensor wrapEncoderPointerToTensor(std::unique_ptr<Encoder> uniqueEncoder) {
-  Encoder* encoder = uniqueEncoder.release();
+at::Tensor wrapAudioEncoderPointerToTensor(
+    std::unique_ptr<AudioEncoder> uniqueAudioEncoder) {
+  AudioEncoder* encoder = uniqueAudioEncoder.release();
 
   auto deleter = [encoder](void*) { delete encoder; };
   at::Tensor tensor =
-      at::from_blob(encoder, {sizeof(Encoder)}, deleter, {at::kLong});
-  auto encoder_ = static_cast<Encoder*>(tensor.mutable_data_ptr());
-  TORCH_CHECK_EQ(encoder_, encoder) << "Encoder=" << encoder_;
+      at::from_blob(encoder, {sizeof(AudioEncoder*)}, deleter, {at::kLong});
+  auto encoder_ = static_cast<AudioEncoder*>(tensor.mutable_data_ptr());
+  TORCH_CHECK_EQ(encoder_, encoder) << "AudioEncoder=" << encoder_;
   return tensor;
 }
 
-Encoder* unwrapTensorToGetEncoder(at::Tensor& tensor) {
+AudioEncoder* unwrapTensorToGetAudioEncoder(at::Tensor& tensor) {
   TORCH_INTERNAL_ASSERT(tensor.is_contiguous());
   void* buffer = tensor.mutable_data_ptr();
-  Encoder* encoder = static_cast<Encoder*>(buffer);
+  AudioEncoder* encoder = static_cast<AudioEncoder*>(buffer);
   return encoder;
 }
 
-at::Tensor create_encoder(
+at::Tensor create_audio_encoder(
     const at::Tensor wf,
     int64_t sample_rate,
     std::string_view file_name) {
-  std::unique_ptr<Encoder> uniqueEncoder =
-      std::make_unique<Encoder>(wf, static_cast<int>(sample_rate), file_name);
-  return wrapEncoderPointerToTensor(std::move(uniqueEncoder));
-}
-
-void encode(at::Tensor& encoder) {
-  auto encoder_ = unwrapTensorToGetEncoder(encoder);
+  TORCH_CHECK(
+      sample_rate <= std::numeric_limits<int>::max(),
+      "sample_rate=",
+      sample_rate,
+      " is too large to be cast to an int.");
+  std::unique_ptr<AudioEncoder> uniqueAudioEncoder =
+      std::make_unique<AudioEncoder>(
+          wf, static_cast<int>(sample_rate), file_name);
+  return wrapAudioEncoderPointerToTensor(std::move(uniqueAudioEncoder));
+}
+
+void encode_audio(at::Tensor& encoder) {
+  auto encoder_ = unwrapTensorToGetAudioEncoder(encoder);
   encoder_->encode();
 }
 
@@ -650,15 +658,15 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
 
 TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
   m.impl("create_from_file", &create_from_file);
-  m.impl("create_encoder", &create_encoder);
+  m.impl("create_audio_encoder", &create_audio_encoder);
   m.impl("create_from_tensor", &create_from_tensor);
   m.impl("_convert_to_tensor", &_convert_to_tensor);
   m.impl(
       "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
 }
 
 TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
-  m.impl("encode", &encode);
+  m.impl("encode_audio", &encode_audio);
   m.impl("seek_to_pts", &seek_to_pts);
   m.impl("add_video_stream", &add_video_stream);
   m.impl("_add_video_stream", &_add_video_stream);
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -91,10 +91,12 @@ def load_torchcodec_shared_libraries():
 create_from_file = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.create_from_file.default
 )
-create_encoder = torch._dynamo.disallow_in_graph(
-    torch.ops.torchcodec_ns.create_encoder.default
+create_audio_encoder = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns.create_audio_encoder.default
+)
+encode_audio = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns.encode_audio.default
 )
-encode = torch._dynamo.disallow_in_graph(torch.ops.torchcodec_ns.encode.default)
 create_from_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.create_from_tensor.default
 )
@@ -159,15 +161,15 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
     return torch.empty([], dtype=torch.long)
 
 
-@register_fake("torchcodec_ns::create_encoder")
-def create_encoder_abstract(
+@register_fake("torchcodec_ns::create_audio_encoder")
+def create_audio_encoder_abstract(
     wf: torch.Tensor, sample_rate: int, filename: str
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
 
-@register_fake("torchcodec_ns::encode")
-def encode_abstract(encoder: torch.Tensor) -> torch.Tensor:
+@register_fake("torchcodec_ns::encode_audio")
+def encode_audio_abstract(encoder: torch.Tensor) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
 
diff --git a/test/test_ops.py b/test/test_ops.py