From 52098c9f416e275b53cafce24030afac676911b4 Mon Sep 17 00:00:00 2001
From: danielflores3 <danielflores3@fb.com>
Date: Thu, 22 May 2025 10:30:54 -0700
Subject: [PATCH 1/7] Rename 'wf' to 'samples' in  AudioEncoder

---
 src/torchcodec/_core/Encoder.cpp    | 46 +++++++++++++++--------------
 src/torchcodec/_core/Encoder.h      |  8 ++---
 src/torchcodec/_core/custom_ops.cpp | 12 ++++----
 3 files changed, 34 insertions(+), 32 deletions(-)
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
index a5303b0fa..dfceb2bb6 100644
--- a/src/torchcodec/_core/Encoder.cpp
+++ b/src/torchcodec/_core/Encoder.cpp
@@ -8,16 +8,16 @@ namespace facebook::torchcodec {
 
 namespace {
 
-torch::Tensor validateWf(torch::Tensor wf) {
+torch::Tensor validateSamples(torch::Tensor samples) {
   TORCH_CHECK(
-      wf.dtype() == torch::kFloat32,
-      "waveform must have float32 dtype, got ",
-      wf.dtype());
-  TORCH_CHECK(wf.dim() == 2, "waveform must have 2 dimensions, got ", wf.dim());
+      samples.dtype() == torch::kFloat32,
+      "samples must have float32 dtype, got ",
+      samples.dtype());
+  TORCH_CHECK(samples.dim() == 2, "samples must have 2 dimensions, got ", samples.dim());
 
   // We enforce this, but if we get user reports we should investigate whether
   // that's actually needed.
-  int numChannels = static_cast<int>(wf.sizes()[0]);
+  int numChannels = static_cast<int>(samples.sizes()[0]);
   TORCH_CHECK(
       numChannels <= AV_NUM_DATA_POINTERS,
       "Trying to encode ",
@@ -26,7 +26,7 @@ torch::Tensor validateWf(torch::Tensor wf) {
       AV_NUM_DATA_POINTERS,
       " channels per frame.");
 
-  return wf.contiguous();
+  return samples.contiguous();
 }
 
 void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
@@ -71,7 +71,7 @@ static const std::vector<AVSampleFormat> preferredFormatsOrder = {
 
 AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
   // Find a sample format that the encoder supports. We prefer using FLT[P],
-  // since this is the format of the input waveform. If FLTP isn't supported
+  // since this is the format of the input samples. If FLTP isn't supported
   // then we'll need to convert the AVFrame's format. Our heuristic is to encode
   // into the format with the highest resolution.
   if (avCodec.sample_fmts == nullptr) {
@@ -98,12 +98,12 @@ AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
 AudioEncoder::~AudioEncoder() {}
 
 AudioEncoder::AudioEncoder(
-    const torch::Tensor wf,
+    const torch::Tensor samples,
     int sampleRate,
     std::string_view fileName,
     std::optional<int64_t> bitRate,
     std::optional<int64_t> numChannels)
-    : wf_(validateWf(wf)) {
+    : samples_(validateSamples(samples)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -130,13 +130,13 @@ AudioEncoder::AudioEncoder(
 }
 
 AudioEncoder::AudioEncoder(
-    const torch::Tensor wf,
+    const torch::Tensor samples,
     int sampleRate,
     std::string_view formatName,
     std::unique_ptr<AVIOToTensorContext> avioContextHolder,
     std::optional<int64_t> bitRate,
     std::optional<int64_t> numChannels)
-    : wf_(validateWf(wf)), avioContextHolder_(std::move(avioContextHolder)) {
+    : samples_(validateSamples(samples)), avioContextHolder_(std::move(avioContextHolder)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -177,7 +177,7 @@ void AudioEncoder::initializeEncoder(
   // well when "-b:a" isn't specified.
   avCodecContext_->bit_rate = bitRate.value_or(0);
 
-  desiredNumChannels_ = static_cast<int>(numChannels.value_or(wf_.sizes()[0]));
+  desiredNumChannels_ = static_cast<int>(numChannels.value_or(samples_.sizes()[0]));
   validateNumChannels(*avCodec, desiredNumChannels_);
   // The avCodecContext layout defines the layout of the encoded output, it's
   // not related to the input sampes.
@@ -186,11 +186,13 @@ void AudioEncoder::initializeEncoder(
   validateSampleRate(*avCodec, sampleRate);
   avCodecContext_->sample_rate = sampleRate;
 
-  // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
-  // may need to convert the wf into a supported output sample format, which is
+  // Input samples are expected to be FLTP. Not all encoders support FLTP, so we
+  // may need to convert the samples into a supported output sample format, which is
   // what the `.sample_fmt` defines.
   avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
 
+  setDefaultChannelLayout(avCodecContext_, static_cast<int>(samples_.sizes()[0]));
+
   int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -237,7 +239,7 @@ void AudioEncoder::encode() {
   avFrame->pts = 0;
   // We set the channel layout of the frame to the default layout corresponding
   // to the input samples' number of channels
-  setDefaultChannelLayout(avFrame, static_cast<int>(wf_.sizes()[0]));
+  setDefaultChannelLayout(avFrame, static_cast<int>(samples_.sizes()[0]));
 
   auto status = av_frame_get_buffer(avFrame.get(), 0);
   TORCH_CHECK(
@@ -247,10 +249,10 @@ void AudioEncoder::encode() {
 
   AutoAVPacket autoAVPacket;
 
-  uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
-  int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
+  uint8_t* psamples = static_cast<uint8_t*>(samples_.data_ptr());
+  int numSamples = static_cast<int>(samples_.sizes()[1]); // per channel
   int numEncodedSamples = 0; // per channel
-  int numBytesPerSample = static_cast<int>(wf_.element_size());
+  int numBytesPerSample = static_cast<int>(samples_.element_size());
   int numBytesPerChannel = numSamples * numBytesPerSample;
 
   status = avformat_write_header(avFormatContext_.get(), nullptr);
@@ -270,11 +272,11 @@ void AudioEncoder::encode() {
         std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
     int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
 
-    for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
+    for (int ch = 0; ch < samples_.sizes()[0]; ch++) {
       std::memcpy(
-          avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
+          avFrame->data[ch], psamples + ch * numBytesPerChannel, numBytesToEncode);
     }
-    pwf += numBytesToEncode;
+    psamples += numBytesToEncode;
 
     // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
     // that the frame buffers are allocated to a big enough size. Here, we reset
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
index afbc1d3fa..231a73ec8 100644
--- a/src/torchcodec/_core/Encoder.h
+++ b/src/torchcodec/_core/Encoder.h
@@ -17,9 +17,9 @@ class AudioEncoder {
   // TODO-ENCODING: bundle the optional params like bitRate, numChannels, etc.
   // into an AudioStreamOptions struct, or similar.
   AudioEncoder(
-      const torch::Tensor wf,
+      const torch::Tensor samples,
       // The *output* sample rate. We can't really decide for the user what it
-      // should be. Particularly, the sample rate of the input waveform should
+      // should be. Particularly, the sample rate of the input samples should
       // match this, and that's up to the user. If sample rates don't match,
       // encoding will still work but audio will be distorted.
       int sampleRate,
@@ -27,7 +27,7 @@ class AudioEncoder {
       std::optional<int64_t> bitRate = std::nullopt,
       std::optional<int64_t> numChannels = std::nullopt);
   AudioEncoder(
-      const torch::Tensor wf,
+      const torch::Tensor samples,
       int sampleRate,
       std::string_view formatName,
       std::unique_ptr<AVIOToTensorContext> avioContextHolder,
@@ -54,7 +54,7 @@ class AudioEncoder {
   // see other TODO above.
   int desiredNumChannels_ = -1;
 
-  const torch::Tensor wf_;
+  const torch::Tensor samples_;
 
   // Stores the AVIOContext for the output tensor buffer.
   std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index c6e43d09c..2d04092ae 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -29,9 +29,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "torchcodec._core.ops", "//pytorch/torchcodec:torchcodec");
   m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
   m.def(
-      "encode_audio_to_file(Tensor wf, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None) -> ()");
+      "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None) -> ()");
   m.def(
-      "encode_audio_to_tensor(Tensor wf, int sample_rate, str format, int? bit_rate=None, int? num_channels=None) -> Tensor");
+      "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None) -> Tensor");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
@@ -388,25 +388,25 @@ OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
 }
 
 void encode_audio_to_file(
-    const at::Tensor wf,
+    const at::Tensor samples,
     int64_t sample_rate,
     std::string_view file_name,
     std::optional<int64_t> bit_rate = std::nullopt,
     std::optional<int64_t> num_channels = std::nullopt) {
   AudioEncoder(
-      wf, validateSampleRate(sample_rate), file_name, bit_rate, num_channels)
+      samples, validateSampleRate(sample_rate), file_name, bit_rate, num_channels)
       .encode();
 }
 
 at::Tensor encode_audio_to_tensor(
-    const at::Tensor wf,
+    const at::Tensor samples,
     int64_t sample_rate,
     std::string_view format,
     std::optional<int64_t> bit_rate = std::nullopt,
     std::optional<int64_t> num_channels = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   return AudioEncoder(
-             wf,
+             samples,
              validateSampleRate(sample_rate),
              format,
              std::move(avioContextHolder),

From 597fef359bba316a5620c47f6e871e133b0ccf77 Mon Sep 17 00:00:00 2001
From: danielflores3 <danielflores3@fb.com>
Date: Tue, 27 May 2025 07:44:11 -0700
Subject: [PATCH 2/7] Rename 'wf' to 'samples' in  AudioEncoder

---
 src/torchcodec/_core/Encoder.cpp | 26 +++++++++++++++++---------
 src/torchcodec/_core/Encoder.h   |  6 ++++++
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
index b20251173..a7d6b0772 100644
--- a/src/torchcodec/_core/Encoder.cpp
+++ b/src/torchcodec/_core/Encoder.cpp
@@ -13,7 +13,10 @@ torch::Tensor validateSamples(torch::Tensor samples) {
       samples.dtype() == torch::kFloat32,
       "samples must have float32 dtype, got ",
       samples.dtype());
-  TORCH_CHECK(samples.dim() == 2, "samples must have 2 dimensions, got ", samples.dim());
+  TORCH_CHECK(
+      samples.dim() == 2,
+      "samples must have 2 dimensions, got ",
+      samples.dim());
 
   // We enforce this, but if we get user reports we should investigate whether
   // that's actually needed.
@@ -102,7 +105,7 @@ AudioEncoder::AudioEncoder(
     int sampleRate,
     std::string_view fileName,
     const AudioStreamOptions& audioStreamOptions)
-: samples_(validateSamples(samples)) {
+    : samples_(validateSamples(samples)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -134,7 +137,8 @@ AudioEncoder::AudioEncoder(
     std::string_view formatName,
     std::unique_ptr<AVIOToTensorContext> avioContextHolder,
     const AudioStreamOptions& audioStreamOptions)
-    : samples_(validateSamples(samples)), avioContextHolder_(std::move(avioContextHolder)) {
+    : samples_(validateSamples(samples)),
+      avioContextHolder_(std::move(avioContextHolder)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -175,8 +179,9 @@ void AudioEncoder::initializeEncoder(
   // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
   // well when "-b:a" isn't specified.
   avCodecContext_->bit_rate = desiredBitRate.value_or(0);
-  outNumChannels_ =
-      static_cast<int>(audioStreamOptions.numChannels.value_or(samples_.sizes()[0]));
+
+  outNumChannels_ = static_cast<int>(
+      audioStreamOptions.numChannels.value_or(samples_.sizes()[0]));
   validateNumChannels(*avCodec, outNumChannels_);
   // The avCodecContext layout defines the layout of the encoded output, it's
   // not related to the input sampes.
@@ -186,11 +191,12 @@ void AudioEncoder::initializeEncoder(
   avCodecContext_->sample_rate = sampleRate;
 
   // Input samples are expected to be FLTP. Not all encoders support FLTP, so we
-  // may need to convert the samples into a supported output sample format, which is
-  // what the `.sample_fmt` defines.
+  // may need to convert the samples into a supported output sample format,
+  // which is what the `.sample_fmt` defines.
   avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
 
-  setDefaultChannelLayout(avCodecContext_, static_cast<int>(samples_.sizes()[0]));
+  setDefaultChannelLayout(
+      avCodecContext_, static_cast<int>(samples_.sizes()[0]));
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
   TORCH_CHECK(
@@ -273,7 +279,9 @@ void AudioEncoder::encode() {
 
     for (int ch = 0; ch < samples_.sizes()[0]; ch++) {
       std::memcpy(
-          avFrame->data[ch], psamples + ch * numBytesPerChannel, numBytesToEncode);
+          avFrame->data[ch],
+          psamples + ch * numBytesPerChannel,
+          numBytesToEncode);
     }
     psamples += numBytesToEncode;
 
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
index bb746d040..a1aeb72bd 100644
--- a/src/torchcodec/_core/Encoder.h
+++ b/src/torchcodec/_core/Encoder.h
@@ -47,10 +47,16 @@ class AudioEncoder {
   UniqueAVCodecContext avCodecContext_;
   int streamIndex_;
   UniqueSwrContext swrContext_;
+
   AudioStreamOptions audioStreamOptions;
 
   int outNumChannels_ = -1;
 
+  // TODO-ENCODING: desiredNumChannels should just be part of an options struct,
+  // see other TODO above.
+  int desiredNumChannels_ = -1;
+  int outNumChannels_ = -1;
+
   const torch::Tensor samples_;
 
   // Stores the AVIOContext for the output tensor buffer.

From 09b3b9832aca05b1e4e07643dd3581f1b90bd165 Mon Sep 17 00:00:00 2001
From: danielflores3 <danielflores3@fb.com>
Date: Tue, 27 May 2025 07:44:11 -0700
Subject: [PATCH 3/7] Rename 'wf' to 'samples' in  AudioEncoder


From e041abcdfdcf55f5cf15e9a87f7f3f067abcd7b9 Mon Sep 17 00:00:00 2001
From: Daniel Flores <danielflores3@meta.com>
Date: Tue, 27 May 2025 09:17:11 -0700
Subject: [PATCH 4/7] updated variables in python files

---
 src/torchcodec/_core/ops.py               |  5 ++---
 src/torchcodec/encoders/_audio_encoder.py |  4 ++--
 test/test_ops.py                          | 10 ++++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index 11751e32b..a68b51e22 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -161,10 +161,9 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
     return torch.empty([], dtype=torch.long)
 
 
-# TODO-ENCODING: rename wf to samples
 @register_fake("torchcodec_ns::encode_audio_to_file")
 def encode_audio_to_file_abstract(
-    wf: torch.Tensor,
+    samples: torch.Tensor,
     sample_rate: int,
     filename: str,
     bit_rate: Optional[int] = None,
@@ -175,7 +174,7 @@ def encode_audio_to_file_abstract(
 
 @register_fake("torchcodec_ns::encode_audio_to_tensor")
 def encode_audio_to_tensor_abstract(
-    wf: torch.Tensor,
+    samples: torch.Tensor,
     sample_rate: int,
     format: str,
     bit_rate: Optional[int] = None,
diff --git a/src/torchcodec/encoders/_audio_encoder.py b/src/torchcodec/encoders/_audio_encoder.py
index 469fbefbc..3ad039121 100644
--- a/src/torchcodec/encoders/_audio_encoder.py
+++ b/src/torchcodec/encoders/_audio_encoder.py
@@ -34,7 +34,7 @@ def to_file(
         num_channels: Optional[int] = None,
     ) -> None:
         _core.encode_audio_to_file(
-            wf=self._samples,
+            samples=self._samples,
             sample_rate=self._sample_rate,
             filename=dest,
             bit_rate=bit_rate,
@@ -49,7 +49,7 @@ def to_tensor(
         num_channels: Optional[int] = None,
     ) -> Tensor:
         return _core.encode_audio_to_tensor(
-            wf=self._samples,
+            samples=self._samples,
             sample_rate=self._sample_rate,
             format=format,
             bit_rate=bit_rate,
diff --git a/test/test_ops.py b/test/test_ops.py
index 5a5fe6755..77be702b6 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1101,22 +1101,24 @@ def test_bad_input(self, tmp_path):
 
         with pytest.raises(RuntimeError, match="must have float32 dtype, got int"):
             encode_audio_to_file(
-                wf=torch.arange(10, dtype=torch.int),
+                samples=torch.arange(10, dtype=torch.int),
                 sample_rate=10,
                 filename=valid_output_file,
             )
         with pytest.raises(RuntimeError, match="must have 2 dimensions, got 1"):
             encode_audio_to_file(
-                wf=torch.rand(3), sample_rate=10, filename=valid_output_file
+                samples=torch.rand(3), sample_rate=10, filename=valid_output_file
             )
 
         with pytest.raises(RuntimeError, match="No such file or directory"):
             encode_audio_to_file(
-                wf=torch.rand(2, 10), sample_rate=10, filename="./bad/path.mp3"
+                samples=torch.rand(2, 10), sample_rate=10, filename="./bad/path.mp3"
             )
         with pytest.raises(RuntimeError, match="check the desired extension"):
             encode_audio_to_file(
-                wf=torch.rand(2, 10), sample_rate=10, filename="./file.bad_extension"
+                samples=torch.rand(2, 10),
+                sample_rate=10,
+                filename="./file.bad_extension",
             )
 
 

From 968b30fc8e58c27033d55739dae4b18ebd5a409e Mon Sep 17 00:00:00 2001
From: Daniel Flores <danielflores3@meta.com>
Date: Tue, 27 May 2025 09:17:11 -0700
Subject: [PATCH 5/7] updated variables in python files

---
 src/torchcodec/_core/Encoder.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
index a1aeb72bd..69fc6ec0a 100644
--- a/src/torchcodec/_core/Encoder.h
+++ b/src/torchcodec/_core/Encoder.h
@@ -47,11 +47,8 @@ class AudioEncoder {
   UniqueAVCodecContext avCodecContext_;
   int streamIndex_;
   UniqueSwrContext swrContext_;
-
   AudioStreamOptions audioStreamOptions;
 
-  int outNumChannels_ = -1;
-
   // TODO-ENCODING: desiredNumChannels should just be part of an options struct,
   // see other TODO above.
   int desiredNumChannels_ = -1;

From cbfba11811f49dbe82c98566ffde767f6b951388 Mon Sep 17 00:00:00 2001
From: Daniel Flores <danielflores3@meta.com>
Date: Tue, 27 May 2025 17:26:10 -0700
Subject: [PATCH 6/7] updated variables in python files

---
 src/torchcodec/_core/Encoder.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
index a7d6b0772..453ae0e05 100644
--- a/src/torchcodec/_core/Encoder.cpp
+++ b/src/torchcodec/_core/Encoder.cpp
@@ -195,9 +195,6 @@ void AudioEncoder::initializeEncoder(
   // which is what the `.sample_fmt` defines.
   avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
 
-  setDefaultChannelLayout(
-      avCodecContext_, static_cast<int>(samples_.sizes()[0]));
-
   int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
   TORCH_CHECK(
       status == AVSUCCESS,

From ab49a59cf02f15a108a3268db32dbf2b9ebcbda9 Mon Sep 17 00:00:00 2001
From: Daniel Flores <danielflores3@meta.com>
Date: Tue, 27 May 2025 17:26:10 -0700
Subject: [PATCH 7/7] updated variables in python files

---
 src/torchcodec/_core/Encoder.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
index 69fc6ec0a..bb746d040 100644
--- a/src/torchcodec/_core/Encoder.h
+++ b/src/torchcodec/_core/Encoder.h
@@ -49,9 +49,6 @@ class AudioEncoder {
   UniqueSwrContext swrContext_;
   AudioStreamOptions audioStreamOptions;
 
-  // TODO-ENCODING: desiredNumChannels should just be part of an options struct,
-  // see other TODO above.
-  int desiredNumChannels_ = -1;
   int outNumChannels_ = -1;
 
   const torch::Tensor samples_;