@@ -8,16 +8,19 @@ namespace facebook::torchcodec {
 
 namespace {
 
-torch::Tensor validateWf(torch::Tensor wf) {
+torch::Tensor validateSamples(torch::Tensor samples) {
   TORCH_CHECK(
-      wf.dtype() == torch::kFloat32,
-      "waveform must have float32 dtype, got ",
-      wf.dtype());
-  TORCH_CHECK(wf.dim() == 2, "waveform must have 2 dimensions, got ", wf.dim());
+      samples.dtype() == torch::kFloat32,
+      "samples must have float32 dtype, got ",
+      samples.dtype());
+  TORCH_CHECK(
+      samples.dim() == 2,
+      "samples must have 2 dimensions, got ",
+      samples.dim());
 
   // We enforce this, but if we get user reports we should investigate whether
   // that's actually needed.
-  int numChannels = static_cast<int>(wf.sizes()[0]);
+  int numChannels = static_cast<int>(samples.sizes()[0]);
   TORCH_CHECK(
       numChannels <= AV_NUM_DATA_POINTERS,
       "Trying to encode ",
@@ -26,7 +29,7 @@ torch::Tensor validateWf(torch::Tensor wf) {
       AV_NUM_DATA_POINTERS,
       " channels per frame.");
 
-  return wf.contiguous();
+  return samples.contiguous();
 }
 
 void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
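For context, the validated input is a 2D (num_channels, num_samples) float32 tensor, and validateSamples() returns a contiguous tensor so that each channel occupies one flat, back-to-back block of memory, which the per-channel copy in encode() relies on. A minimal caller-side sketch, not taken from the PR, with a hypothetical tensor shape:

#include <torch/torch.h>

int main() {
  // Hypothetical input: 2 channels x 48000 samples of float32 audio in [-1, 1].
  torch::Tensor samples = torch::rand({2, 48000}, torch::kFloat32) * 2 - 1;

  // These mirror the checks in validateSamples(): dtype, rank.
  TORCH_CHECK(samples.dtype() == torch::kFloat32);
  TORCH_CHECK(samples.dim() == 2);

  // .contiguous() guarantees channel c starts at byte offset
  // c * numSamples * sizeof(float) in the underlying buffer.
  samples = samples.contiguous();
  return 0;
}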
@@ -71,7 +74,7 @@ static const std::vector<AVSampleFormat> preferredFormatsOrder = {
 
 AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
   // Find a sample format that the encoder supports. We prefer using FLT[P],
-  // since this is the format of the input waveform. If FLTP isn't supported
+  // since this is the format of the input samples. If FLTP isn't supported
   // then we'll need to convert the AVFrame's format. Our heuristic is to encode
   // into the format with the highest resolution.
   if (avCodec.sample_fmts == nullptr) {
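The comment above describes the selection heuristic: prefer FLT[P] since that matches the input samples, otherwise fall back to a supported format with the highest resolution. The hunk cuts off before the implementation, so here is a hedged sketch of how such a heuristic can look; the kPreferredFormats list and its ordering are assumptions (the file's own preferredFormatsOrder may differ), and this is not necessarily the PR's exact code:

extern "C" {
#include <libavcodec/avcodec.h>
}
#include <vector>

// Assumed ordering: FLTP/FLT first, then remaining formats by resolution.
static const std::vector<AVSampleFormat> kPreferredFormats = {
    AV_SAMPLE_FMT_FLTP,
    AV_SAMPLE_FMT_FLT,
    AV_SAMPLE_FMT_DBLP,
    AV_SAMPLE_FMT_DBL,
    AV_SAMPLE_FMT_S32P,
    AV_SAMPLE_FMT_S32,
    AV_SAMPLE_FMT_S16P,
    AV_SAMPLE_FMT_S16};

AVSampleFormat pickOutputSampleFormat(const AVCodec& avCodec) {
  if (avCodec.sample_fmts == nullptr) {
    // Encoder doesn't advertise its supported formats; keep the input format.
    return AV_SAMPLE_FMT_FLTP;
  }
  for (AVSampleFormat preferred : kPreferredFormats) {
    // sample_fmts is terminated by AV_SAMPLE_FMT_NONE (-1).
    for (const AVSampleFormat* fmt = avCodec.sample_fmts;
         *fmt != AV_SAMPLE_FMT_NONE;
         ++fmt) {
      if (*fmt == preferred) {
        return preferred;
      }
    }
  }
  // Nothing from the preferred list matched; take the encoder's first choice.
  return avCodec.sample_fmts[0];
}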
@@ -115,11 +118,11 @@ UniqueAVFrame allocateAVFrame(int numSamples, int sampleRate, int numChannels) {
 AudioEncoder::~AudioEncoder() {}
 
 AudioEncoder::AudioEncoder(
-    const torch::Tensor wf,
+    const torch::Tensor samples,
     int sampleRate,
     std::string_view fileName,
     const AudioStreamOptions& audioStreamOptions)
-    : wf_(validateWf(wf)), sampleRateInput_(sampleRate) {
+    : samples_(validateSamples(samples)), sampleRateInput_(sampleRate) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -146,12 +149,12 @@ AudioEncoder::AudioEncoder(
 }
 
 AudioEncoder::AudioEncoder(
-    const torch::Tensor wf,
+    const torch::Tensor samples,
     int sampleRate,
     std::string_view formatName,
     std::unique_ptr<AVIOToTensorContext> avioContextHolder,
     const AudioStreamOptions& audioStreamOptions)
-    : wf_(validateWf(wf)),
+    : samples_(validateSamples(samples)),
       sampleRateInput_(sampleRate),
       avioContextHolder_(std::move(avioContextHolder)) {
   setFFmpegLogLevel();
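Both constructors validate the samples, record the input sample rate, and then allocate an output AVFormatContext; the file-name overload lets FFmpeg infer the muxer from the path, while the in-memory overload passes an explicit format name and a custom AVIO context. A minimal, standalone sketch of that allocation step (hypothetical helper, not the PR's code):

extern "C" {
#include <libavformat/avformat.h>
}
#include <stdexcept>

// Either formatName or fileName may be nullptr; FFmpeg picks the muxer from
// whichever is provided (e.g. formatName "mp3", or fileName "out.wav").
AVFormatContext* allocOutputFormatContext(
    const char* formatName,
    const char* fileName) {
  AVFormatContext* ctx = nullptr;
  int status =
      avformat_alloc_output_context2(&ctx, nullptr, formatName, fileName);
  if (status < 0 || ctx == nullptr) {
    throw std::runtime_error("Couldn't allocate AVFormatContext for output");
  }
  return ctx; // caller must eventually call avformat_free_context(ctx)
}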
@@ -194,8 +197,8 @@ void AudioEncoder::initializeEncoder(
   // well when "-b:a" isn't specified.
   avCodecContext_->bit_rate = desiredBitRate.value_or(0);
 
-  outNumChannels_ =
-      static_cast<int>(audioStreamOptions.numChannels.value_or(wf_.sizes()[0]));
+  outNumChannels_ = static_cast<int>(
+      audioStreamOptions.numChannels.value_or(samples_.sizes()[0]));
   validateNumChannels(*avCodec, outNumChannels_);
   // The avCodecContext layout defines the layout of the encoded output, it's
   // not related to the input sampes.
@@ -205,9 +208,9 @@ void AudioEncoder::initializeEncoder(
   validateSampleRate(*avCodec, outSampleRate_);
   avCodecContext_->sample_rate = outSampleRate_;
 
-  // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
-  // may need to convert the wf into a supported output sample format, which is
-  // what the `.sample_fmt` defines.
+  // Input samples are expected to be FLTP. Not all encoders support FLTP, so we
+  // may need to convert the samples into a supported output sample format,
+  // which is what the `.sample_fmt` defines.
   avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
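As the rewritten comment says, the input samples arrive as FLTP, but avcodec_open2() is called with whatever sample_fmt the encoder supports, so a format conversion may be needed before encoding. A hedged sketch of that decision using libswresample; it assumes the FFmpeg >= 5.1 channel-layout API and is not necessarily how this encoder implements the conversion:

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/channel_layout.h>
#include <libswresample/swresample.h>
}

// Returns a converter when the encoder's format differs from FLTP, otherwise
// nullptr. numChannels and sampleRate describe the input samples.
SwrContext* maybeCreateConverter(
    AVSampleFormat encoderSampleFormat,
    int numChannels,
    int sampleRate) {
  if (encoderSampleFormat == AV_SAMPLE_FMT_FLTP) {
    return nullptr; // input already matches what the encoder wants
  }
  AVChannelLayout layout;
  av_channel_layout_default(&layout, numChannels);

  SwrContext* swrContext = nullptr;
  int status = swr_alloc_set_opts2(
      &swrContext,
      &layout, // output channel layout (same channels, different format)
      encoderSampleFormat, // output sample format, from `.sample_fmt`
      sampleRate, // output sample rate (no resampling in this sketch)
      &layout, // input channel layout
      AV_SAMPLE_FMT_FLTP, // input sample format: planar float32
      sampleRate, // input sample rate
      0,
      nullptr);
  if (status < 0) {
    return nullptr;
  }
  if (swr_init(swrContext) < 0) {
    swr_free(&swrContext);
    return nullptr;
  }
  return swrContext;
}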
@@ -265,15 +268,15 @@ void AudioEncoder::encode() {
   UniqueAVFrame avFrame = allocateAVFrame(
       numSamplesAllocatedPerFrame,
       sampleRateInput_,
-      static_cast<int>(wf_.sizes()[0]));
+      static_cast<int>(samples_.sizes()[0]));
   avFrame->pts = 0;
 
   AutoAVPacket autoAVPacket;
 
-  uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
-  int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
+  uint8_t* psamples = static_cast<uint8_t*>(samples_.data_ptr());
+  int numSamples = static_cast<int>(samples_.sizes()[1]); // per channel
   int numEncodedSamples = 0; // per channel
-  int numBytesPerSample = static_cast<int>(wf_.element_size());
+  int numBytesPerSample = static_cast<int>(samples_.element_size());
   int numBytesPerChannel = numSamples * numBytesPerSample;
 
   auto status = avformat_write_header(avFormatContext_.get(), nullptr);
@@ -293,11 +296,13 @@ void AudioEncoder::encode() {
         std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
     int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
 
-    for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
+    for (int ch = 0; ch < samples_.sizes()[0]; ch++) {
       std::memcpy(
-          avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
+          avFrame->data[ch],
+          psamples + ch * numBytesPerChannel,
+          numBytesToEncode);
     }
-    pwf += numBytesToEncode;
+    psamples += numBytesToEncode;
 
     // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
     // that the frame buffers are allocated to a big enough size. Here, we reset
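The loop above works because samples_ is contiguous with shape (numChannels, numSamples): channel ch starts at byte offset ch * numBytesPerChannel, psamples points at the current chunk within channel 0, and advancing psamples by numBytesToEncode after each frame moves every channel's read position forward in lockstep. A standalone sketch of that copy pattern on plain buffers (hypothetical names, not the PR's code):

#include <cstdint>
#include <cstring>
#include <vector>

// Copy one frame's worth of audio from a contiguous (numChannels, numSamples)
// planar buffer into per-channel destination planes (stand-ins for
// avFrame->data[ch]).
void copyChunkToPlanes(
    const uint8_t* chunkStart, // current position within channel 0
    int numChannels,
    int numBytesPerChannel, // numSamples * bytesPerSample, whole buffer
    int numBytesToEncode, // size of this chunk, per channel
    std::vector<std::vector<uint8_t>>& planes) {
  for (int ch = 0; ch < numChannels; ch++) {
    std::memcpy(
        planes[ch].data(),
        chunkStart + ch * numBytesPerChannel,
        numBytesToEncode);
  }
}

For example, with 2 channels of 48000 float32 samples, numBytesPerChannel is 48000 * 4 = 192000 bytes, and a frame_size of 1024 samples gives numBytesToEncode = 4096 bytes per channel on every iteration except the final, shorter chunk.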