From 52098c9f416e275b53cafce24030afac676911b4 Mon Sep 17 00:00:00 2001 From: danielflores3 Date: Thu, 22 May 2025 10:30:54 -0700 Subject: [PATCH 1/7] Rename 'wf' to 'samples' in AudioEncoder --- src/torchcodec/_core/Encoder.cpp | 46 +++++++++++++++-------------- src/torchcodec/_core/Encoder.h | 8 ++--- src/torchcodec/_core/custom_ops.cpp | 12 ++++---- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index a5303b0fa..dfceb2bb6 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -8,16 +8,16 @@ namespace facebook::torchcodec { namespace { -torch::Tensor validateWf(torch::Tensor wf) { +torch::Tensor validateSamples(torch::Tensor samples) { TORCH_CHECK( - wf.dtype() == torch::kFloat32, - "waveform must have float32 dtype, got ", - wf.dtype()); - TORCH_CHECK(wf.dim() == 2, "waveform must have 2 dimensions, got ", wf.dim()); + samples.dtype() == torch::kFloat32, + "samples must have float32 dtype, got ", + samples.dtype()); + TORCH_CHECK(samples.dim() == 2, "samples must have 2 dimensions, got ", samples.dim()); // We enforce this, but if we get user reports we should investigate whether // that's actually needed. - int numChannels = static_cast(wf.sizes()[0]); + int numChannels = static_cast(samples.sizes()[0]); TORCH_CHECK( numChannels <= AV_NUM_DATA_POINTERS, "Trying to encode ", @@ -26,7 +26,7 @@ torch::Tensor validateWf(torch::Tensor wf) { AV_NUM_DATA_POINTERS, " channels per frame."); - return wf.contiguous(); + return samples.contiguous(); } void validateSampleRate(const AVCodec& avCodec, int sampleRate) { @@ -71,7 +71,7 @@ static const std::vector preferredFormatsOrder = { AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) { // Find a sample format that the encoder supports. We prefer using FLT[P], - // since this is the format of the input waveform. If FLTP isn't supported + // since this is the format of the input samples. If FLTP isn't supported // then we'll need to convert the AVFrame's format. Our heuristic is to encode // into the format with the highest resolution. if (avCodec.sample_fmts == nullptr) { @@ -98,12 +98,12 @@ AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) { AudioEncoder::~AudioEncoder() {} AudioEncoder::AudioEncoder( - const torch::Tensor wf, + const torch::Tensor samples, int sampleRate, std::string_view fileName, std::optional bitRate, std::optional numChannels) - : wf_(validateWf(wf)) { + : samples_(validateSamples(samples)) { setFFmpegLogLevel(); AVFormatContext* avFormatContext = nullptr; int status = avformat_alloc_output_context2( @@ -130,13 +130,13 @@ AudioEncoder::AudioEncoder( } AudioEncoder::AudioEncoder( - const torch::Tensor wf, + const torch::Tensor samples, int sampleRate, std::string_view formatName, std::unique_ptr avioContextHolder, std::optional bitRate, std::optional numChannels) - : wf_(validateWf(wf)), avioContextHolder_(std::move(avioContextHolder)) { + : samples_(validateSamples(samples)), avioContextHolder_(std::move(avioContextHolder)) { setFFmpegLogLevel(); AVFormatContext* avFormatContext = nullptr; int status = avformat_alloc_output_context2( @@ -177,7 +177,7 @@ void AudioEncoder::initializeEncoder( // well when "-b:a" isn't specified. avCodecContext_->bit_rate = bitRate.value_or(0); - desiredNumChannels_ = static_cast(numChannels.value_or(wf_.sizes()[0])); + desiredNumChannels_ = static_cast(numChannels.value_or(samples_.sizes()[0])); validateNumChannels(*avCodec, desiredNumChannels_); // The avCodecContext layout defines the layout of the encoded output, it's // not related to the input sampes. @@ -186,11 +186,13 @@ void AudioEncoder::initializeEncoder( validateSampleRate(*avCodec, sampleRate); avCodecContext_->sample_rate = sampleRate; - // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we - // may need to convert the wf into a supported output sample format, which is + // Input samples are expected to be FLTP. Not all encoders support FLTP, so we + // may need to convert the samples into a supported output sample format, which is // what the `.sample_fmt` defines. avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec); + setDefaultChannelLayout(avCodecContext_, static_cast(samples_.sizes()[0])); + int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr); TORCH_CHECK( status == AVSUCCESS, @@ -237,7 +239,7 @@ void AudioEncoder::encode() { avFrame->pts = 0; // We set the channel layout of the frame to the default layout corresponding // to the input samples' number of channels - setDefaultChannelLayout(avFrame, static_cast(wf_.sizes()[0])); + setDefaultChannelLayout(avFrame, static_cast(samples_.sizes()[0])); auto status = av_frame_get_buffer(avFrame.get(), 0); TORCH_CHECK( @@ -247,10 +249,10 @@ void AudioEncoder::encode() { AutoAVPacket autoAVPacket; - uint8_t* pwf = static_cast(wf_.data_ptr()); - int numSamples = static_cast(wf_.sizes()[1]); // per channel + uint8_t* psamples = static_cast(samples_.data_ptr()); + int numSamples = static_cast(samples_.sizes()[1]); // per channel int numEncodedSamples = 0; // per channel - int numBytesPerSample = static_cast(wf_.element_size()); + int numBytesPerSample = static_cast(samples_.element_size()); int numBytesPerChannel = numSamples * numBytesPerSample; status = avformat_write_header(avFormatContext_.get(), nullptr); @@ -270,11 +272,11 @@ void AudioEncoder::encode() { std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples); int numBytesToEncode = numSamplesToEncode * numBytesPerSample; - for (int ch = 0; ch < wf_.sizes()[0]; ch++) { + for (int ch = 0; ch < samples_.sizes()[0]; ch++) { std::memcpy( - avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode); + avFrame->data[ch], psamples + ch * numBytesPerChannel, numBytesToEncode); } - pwf += numBytesToEncode; + psamples += numBytesToEncode; // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so // that the frame buffers are allocated to a big enough size. Here, we reset diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h index afbc1d3fa..231a73ec8 100644 --- a/src/torchcodec/_core/Encoder.h +++ b/src/torchcodec/_core/Encoder.h @@ -17,9 +17,9 @@ class AudioEncoder { // TODO-ENCODING: bundle the optional params like bitRate, numChannels, etc. // into an AudioStreamOptions struct, or similar. AudioEncoder( - const torch::Tensor wf, + const torch::Tensor samples, // The *output* sample rate. We can't really decide for the user what it - // should be. Particularly, the sample rate of the input waveform should + // should be. Particularly, the sample rate of the input samples should // match this, and that's up to the user. If sample rates don't match, // encoding will still work but audio will be distorted. int sampleRate, @@ -27,7 +27,7 @@ class AudioEncoder { std::optional bitRate = std::nullopt, std::optional numChannels = std::nullopt); AudioEncoder( - const torch::Tensor wf, + const torch::Tensor samples, int sampleRate, std::string_view formatName, std::unique_ptr avioContextHolder, @@ -54,7 +54,7 @@ class AudioEncoder { // see other TODO above. int desiredNumChannels_ = -1; - const torch::Tensor wf_; + const torch::Tensor samples_; // Stores the AVIOContext for the output tensor buffer. std::unique_ptr avioContextHolder_; diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index c6e43d09c..2d04092ae 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -29,9 +29,9 @@ TORCH_LIBRARY(torchcodec_ns, m) { "torchcodec._core.ops", "//pytorch/torchcodec:torchcodec"); m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor"); m.def( - "encode_audio_to_file(Tensor wf, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None) -> ()"); + "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None) -> ()"); m.def( - "encode_audio_to_tensor(Tensor wf, int sample_rate, str format, int? bit_rate=None, int? num_channels=None) -> Tensor"); + "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None) -> Tensor"); m.def( "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor"); m.def("_convert_to_tensor(int decoder_ptr) -> Tensor"); @@ -388,25 +388,25 @@ OpsAudioFramesOutput get_frames_by_pts_in_range_audio( } void encode_audio_to_file( - const at::Tensor wf, + const at::Tensor samples, int64_t sample_rate, std::string_view file_name, std::optional bit_rate = std::nullopt, std::optional num_channels = std::nullopt) { AudioEncoder( - wf, validateSampleRate(sample_rate), file_name, bit_rate, num_channels) + samples, validateSampleRate(sample_rate), file_name, bit_rate, num_channels) .encode(); } at::Tensor encode_audio_to_tensor( - const at::Tensor wf, + const at::Tensor samples, int64_t sample_rate, std::string_view format, std::optional bit_rate = std::nullopt, std::optional num_channels = std::nullopt) { auto avioContextHolder = std::make_unique(); return AudioEncoder( - wf, + samples, validateSampleRate(sample_rate), format, std::move(avioContextHolder), From 597fef359bba316a5620c47f6e871e133b0ccf77 Mon Sep 17 00:00:00 2001 From: danielflores3 Date: Tue, 27 May 2025 07:44:11 -0700 Subject: [PATCH 2/7] Rename 'wf' to 'samples' in AudioEncoder --- src/torchcodec/_core/Encoder.cpp | 26 +++++++++++++++++--------- src/torchcodec/_core/Encoder.h | 6 ++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index b20251173..a7d6b0772 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -13,7 +13,10 @@ torch::Tensor validateSamples(torch::Tensor samples) { samples.dtype() == torch::kFloat32, "samples must have float32 dtype, got ", samples.dtype()); - TORCH_CHECK(samples.dim() == 2, "samples must have 2 dimensions, got ", samples.dim()); + TORCH_CHECK( + samples.dim() == 2, + "samples must have 2 dimensions, got ", + samples.dim()); // We enforce this, but if we get user reports we should investigate whether // that's actually needed. @@ -102,7 +105,7 @@ AudioEncoder::AudioEncoder( int sampleRate, std::string_view fileName, const AudioStreamOptions& audioStreamOptions) -: samples_(validateSamples(samples)) { + : samples_(validateSamples(samples)) { setFFmpegLogLevel(); AVFormatContext* avFormatContext = nullptr; int status = avformat_alloc_output_context2( @@ -134,7 +137,8 @@ AudioEncoder::AudioEncoder( std::string_view formatName, std::unique_ptr avioContextHolder, const AudioStreamOptions& audioStreamOptions) - : samples_(validateSamples(samples)), avioContextHolder_(std::move(avioContextHolder)) { + : samples_(validateSamples(samples)), + avioContextHolder_(std::move(avioContextHolder)) { setFFmpegLogLevel(); AVFormatContext* avFormatContext = nullptr; int status = avformat_alloc_output_context2( @@ -175,8 +179,9 @@ void AudioEncoder::initializeEncoder( // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as // well when "-b:a" isn't specified. avCodecContext_->bit_rate = desiredBitRate.value_or(0); - outNumChannels_ = - static_cast(audioStreamOptions.numChannels.value_or(samples_.sizes()[0])); + + outNumChannels_ = static_cast( + audioStreamOptions.numChannels.value_or(samples_.sizes()[0])); validateNumChannels(*avCodec, outNumChannels_); // The avCodecContext layout defines the layout of the encoded output, it's // not related to the input sampes. @@ -186,11 +191,12 @@ void AudioEncoder::initializeEncoder( avCodecContext_->sample_rate = sampleRate; // Input samples are expected to be FLTP. Not all encoders support FLTP, so we - // may need to convert the samples into a supported output sample format, which is - // what the `.sample_fmt` defines. + // may need to convert the samples into a supported output sample format, + // which is what the `.sample_fmt` defines. avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec); - setDefaultChannelLayout(avCodecContext_, static_cast(samples_.sizes()[0])); + setDefaultChannelLayout( + avCodecContext_, static_cast(samples_.sizes()[0])); int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr); TORCH_CHECK( @@ -273,7 +279,9 @@ void AudioEncoder::encode() { for (int ch = 0; ch < samples_.sizes()[0]; ch++) { std::memcpy( - avFrame->data[ch], psamples + ch * numBytesPerChannel, numBytesToEncode); + avFrame->data[ch], + psamples + ch * numBytesPerChannel, + numBytesToEncode); } psamples += numBytesToEncode; diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h index bb746d040..a1aeb72bd 100644 --- a/src/torchcodec/_core/Encoder.h +++ b/src/torchcodec/_core/Encoder.h @@ -47,10 +47,16 @@ class AudioEncoder { UniqueAVCodecContext avCodecContext_; int streamIndex_; UniqueSwrContext swrContext_; + AudioStreamOptions audioStreamOptions; int outNumChannels_ = -1; + // TODO-ENCODING: desiredNumChannels should just be part of an options struct, + // see other TODO above. + int desiredNumChannels_ = -1; + int outNumChannels_ = -1; + const torch::Tensor samples_; // Stores the AVIOContext for the output tensor buffer. From 09b3b9832aca05b1e4e07643dd3581f1b90bd165 Mon Sep 17 00:00:00 2001 From: danielflores3 Date: Tue, 27 May 2025 07:44:11 -0700 Subject: [PATCH 3/7] Rename 'wf' to 'samples' in AudioEncoder From e041abcdfdcf55f5cf15e9a87f7f3f067abcd7b9 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Tue, 27 May 2025 09:17:11 -0700 Subject: [PATCH 4/7] updated variables in python files --- src/torchcodec/_core/ops.py | 5 ++--- src/torchcodec/encoders/_audio_encoder.py | 4 ++-- test/test_ops.py | 10 ++++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index 11751e32b..a68b51e22 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -161,10 +161,9 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch. return torch.empty([], dtype=torch.long) -# TODO-ENCODING: rename wf to samples @register_fake("torchcodec_ns::encode_audio_to_file") def encode_audio_to_file_abstract( - wf: torch.Tensor, + samples: torch.Tensor, sample_rate: int, filename: str, bit_rate: Optional[int] = None, @@ -175,7 +174,7 @@ def encode_audio_to_file_abstract( @register_fake("torchcodec_ns::encode_audio_to_tensor") def encode_audio_to_tensor_abstract( - wf: torch.Tensor, + samples: torch.Tensor, sample_rate: int, format: str, bit_rate: Optional[int] = None, diff --git a/src/torchcodec/encoders/_audio_encoder.py b/src/torchcodec/encoders/_audio_encoder.py index 469fbefbc..3ad039121 100644 --- a/src/torchcodec/encoders/_audio_encoder.py +++ b/src/torchcodec/encoders/_audio_encoder.py @@ -34,7 +34,7 @@ def to_file( num_channels: Optional[int] = None, ) -> None: _core.encode_audio_to_file( - wf=self._samples, + samples=self._samples, sample_rate=self._sample_rate, filename=dest, bit_rate=bit_rate, @@ -49,7 +49,7 @@ def to_tensor( num_channels: Optional[int] = None, ) -> Tensor: return _core.encode_audio_to_tensor( - wf=self._samples, + samples=self._samples, sample_rate=self._sample_rate, format=format, bit_rate=bit_rate, diff --git a/test/test_ops.py b/test/test_ops.py index 5a5fe6755..77be702b6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1101,22 +1101,24 @@ def test_bad_input(self, tmp_path): with pytest.raises(RuntimeError, match="must have float32 dtype, got int"): encode_audio_to_file( - wf=torch.arange(10, dtype=torch.int), + samples=torch.arange(10, dtype=torch.int), sample_rate=10, filename=valid_output_file, ) with pytest.raises(RuntimeError, match="must have 2 dimensions, got 1"): encode_audio_to_file( - wf=torch.rand(3), sample_rate=10, filename=valid_output_file + samples=torch.rand(3), sample_rate=10, filename=valid_output_file ) with pytest.raises(RuntimeError, match="No such file or directory"): encode_audio_to_file( - wf=torch.rand(2, 10), sample_rate=10, filename="./bad/path.mp3" + samples=torch.rand(2, 10), sample_rate=10, filename="./bad/path.mp3" ) with pytest.raises(RuntimeError, match="check the desired extension"): encode_audio_to_file( - wf=torch.rand(2, 10), sample_rate=10, filename="./file.bad_extension" + samples=torch.rand(2, 10), + sample_rate=10, + filename="./file.bad_extension", ) From 968b30fc8e58c27033d55739dae4b18ebd5a409e Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Tue, 27 May 2025 09:17:11 -0700 Subject: [PATCH 5/7] updated variables in python files --- src/torchcodec/_core/Encoder.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h index a1aeb72bd..69fc6ec0a 100644 --- a/src/torchcodec/_core/Encoder.h +++ b/src/torchcodec/_core/Encoder.h @@ -47,11 +47,8 @@ class AudioEncoder { UniqueAVCodecContext avCodecContext_; int streamIndex_; UniqueSwrContext swrContext_; - AudioStreamOptions audioStreamOptions; - int outNumChannels_ = -1; - // TODO-ENCODING: desiredNumChannels should just be part of an options struct, // see other TODO above. int desiredNumChannels_ = -1; From cbfba11811f49dbe82c98566ffde767f6b951388 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Tue, 27 May 2025 17:26:10 -0700 Subject: [PATCH 6/7] updated variables in python files --- src/torchcodec/_core/Encoder.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index a7d6b0772..453ae0e05 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -195,9 +195,6 @@ void AudioEncoder::initializeEncoder( // which is what the `.sample_fmt` defines. avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec); - setDefaultChannelLayout( - avCodecContext_, static_cast(samples_.sizes()[0])); - int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr); TORCH_CHECK( status == AVSUCCESS, From ab49a59cf02f15a108a3268db32dbf2b9ebcbda9 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Tue, 27 May 2025 17:26:10 -0700 Subject: [PATCH 7/7] updated variables in python files --- src/torchcodec/_core/Encoder.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h index 69fc6ec0a..bb746d040 100644 --- a/src/torchcodec/_core/Encoder.h +++ b/src/torchcodec/_core/Encoder.h @@ -49,9 +49,6 @@ class AudioEncoder { UniqueSwrContext swrContext_; AudioStreamOptions audioStreamOptions; - // TODO-ENCODING: desiredNumChannels should just be part of an options struct, - // see other TODO above. - int desiredNumChannels_ = -1; int outNumChannels_ = -1; const torch::Tensor samples_;