@@ -8,16 +8,16 @@ namespace facebook::torchcodec {
88
99namespace {
1010
11- torch::Tensor validateWf (torch::Tensor wf ) {
11+ torch::Tensor validateSamples (torch::Tensor samples ) {
1212 TORCH_CHECK (
13- wf .dtype () == torch::kFloat32 ,
14- " waveform must have float32 dtype, got " ,
15- wf .dtype ());
16- TORCH_CHECK (wf .dim () == 2 , " waveform must have 2 dimensions, got " , wf .dim ());
13+ samples .dtype () == torch::kFloat32 ,
14+ " samples must have float32 dtype, got " ,
15+ samples .dtype ());
16+ TORCH_CHECK (samples .dim () == 2 , " samples must have 2 dimensions, got " , samples .dim ());
1717
1818 // We enforce this, but if we get user reports we should investigate whether
1919 // that's actually needed.
20- int numChannels = static_cast <int >(wf .sizes ()[0 ]);
20+ int numChannels = static_cast <int >(samples .sizes ()[0 ]);
2121 TORCH_CHECK (
2222 numChannels <= AV_NUM_DATA_POINTERS,
2323 " Trying to encode " ,
@@ -26,7 +26,7 @@ torch::Tensor validateWf(torch::Tensor wf) {
2626 AV_NUM_DATA_POINTERS,
2727 " channels per frame." );
2828
29- return wf .contiguous ();
29+ return samples .contiguous ();
3030}
3131
3232void validateSampleRate (const AVCodec& avCodec, int sampleRate) {
@@ -71,7 +71,7 @@ static const std::vector<AVSampleFormat> preferredFormatsOrder = {
7171
7272AVSampleFormat findBestOutputSampleFormat (const AVCodec& avCodec) {
7373 // Find a sample format that the encoder supports. We prefer using FLT[P],
74- // since this is the format of the input waveform . If FLTP isn't supported
74+ // since this is the format of the input samples . If FLTP isn't supported
7575 // then we'll need to convert the AVFrame's format. Our heuristic is to encode
7676 // into the format with the highest resolution.
7777 if (avCodec.sample_fmts == nullptr ) {
@@ -98,12 +98,12 @@ AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
9898AudioEncoder::~AudioEncoder () {}
9999
100100AudioEncoder::AudioEncoder (
101- const torch::Tensor wf ,
101+ const torch::Tensor samples ,
102102 int sampleRate,
103103 std::string_view fileName,
104104 std::optional<int64_t > bitRate,
105105 std::optional<int64_t > numChannels)
106- : wf_(validateWf(wf )) {
106+ : samples_(validateSamples(samples )) {
107107 setFFmpegLogLevel ();
108108 AVFormatContext* avFormatContext = nullptr ;
109109 int status = avformat_alloc_output_context2 (
@@ -130,13 +130,13 @@ AudioEncoder::AudioEncoder(
130130}
131131
132132AudioEncoder::AudioEncoder (
133- const torch::Tensor wf ,
133+ const torch::Tensor samples ,
134134 int sampleRate,
135135 std::string_view formatName,
136136 std::unique_ptr<AVIOToTensorContext> avioContextHolder,
137137 std::optional<int64_t > bitRate,
138138 std::optional<int64_t > numChannels)
139- : wf_(validateWf(wf )), avioContextHolder_(std::move(avioContextHolder)) {
139+ : samples_(validateSamples(samples )), avioContextHolder_(std::move(avioContextHolder)) {
140140 setFFmpegLogLevel ();
141141 AVFormatContext* avFormatContext = nullptr ;
142142 int status = avformat_alloc_output_context2 (
@@ -177,7 +177,7 @@ void AudioEncoder::initializeEncoder(
177177 // well when "-b:a" isn't specified.
178178 avCodecContext_->bit_rate = bitRate.value_or (0 );
179179
180- desiredNumChannels_ = static_cast <int >(numChannels.value_or (wf_ .sizes ()[0 ]));
180+ desiredNumChannels_ = static_cast <int >(numChannels.value_or (samples_ .sizes ()[0 ]));
181181 validateNumChannels (*avCodec, desiredNumChannels_);
182182 // The avCodecContext layout defines the layout of the encoded output, it's
183183 // not related to the input samples.
@@ -186,11 +186,13 @@ void AudioEncoder::initializeEncoder(
186186 validateSampleRate (*avCodec, sampleRate);
187187 avCodecContext_->sample_rate = sampleRate;
188188
189- // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
190- // may need to convert the wf into a supported output sample format, which is
189+ // Input samples are expected to be FLTP. Not all encoders support FLTP, so we
190+ // may need to convert the samples into a supported output sample format, which is
191191 // what the `.sample_fmt` defines.
192192 avCodecContext_->sample_fmt = findBestOutputSampleFormat (*avCodec);
193193
194+ setDefaultChannelLayout (avCodecContext_, static_cast <int >(samples_.sizes ()[0 ]));
195+
194196 int status = avcodec_open2 (avCodecContext_.get (), avCodec, nullptr );
195197 TORCH_CHECK (
196198 status == AVSUCCESS,
@@ -237,7 +239,7 @@ void AudioEncoder::encode() {
237239 avFrame->pts = 0 ;
238240 // We set the channel layout of the frame to the default layout corresponding
239241 // to the input samples' number of channels
240- setDefaultChannelLayout (avFrame, static_cast <int >(wf_ .sizes ()[0 ]));
242+ setDefaultChannelLayout (avFrame, static_cast <int >(samples_ .sizes ()[0 ]));
241243
242244 auto status = av_frame_get_buffer (avFrame.get (), 0 );
243245 TORCH_CHECK (
@@ -247,10 +249,10 @@ void AudioEncoder::encode() {
247249
248250 AutoAVPacket autoAVPacket;
249251
250- uint8_t * pwf = static_cast <uint8_t *>(wf_ .data_ptr ());
251- int numSamples = static_cast <int >(wf_ .sizes ()[1 ]); // per channel
252+ uint8_t * psamples = static_cast <uint8_t *>(samples_ .data_ptr ());
253+ int numSamples = static_cast <int >(samples_ .sizes ()[1 ]); // per channel
252254 int numEncodedSamples = 0 ; // per channel
253- int numBytesPerSample = static_cast <int >(wf_ .element_size ());
255+ int numBytesPerSample = static_cast <int >(samples_ .element_size ());
254256 int numBytesPerChannel = numSamples * numBytesPerSample;
255257
256258 status = avformat_write_header (avFormatContext_.get (), nullptr );
@@ -270,11 +272,11 @@ void AudioEncoder::encode() {
270272 std::min (numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
271273 int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
272274
273- for (int ch = 0 ; ch < wf_ .sizes ()[0 ]; ch++) {
275+ for (int ch = 0 ; ch < samples_ .sizes ()[0 ]; ch++) {
274276 std::memcpy (
275- avFrame->data [ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
277+ avFrame->data [ch], psamples + ch * numBytesPerChannel, numBytesToEncode);
276278 }
277- pwf += numBytesToEncode;
279+ psamples += numBytesToEncode;
278280
279281 // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
280282 // that the frame buffers are allocated to a big enough size. Here, we reset
0 commit comments