@@ -215,9 +215,6 @@ void AudioEncoder::initializeEncoder(
215215 status == AVSUCCESS,
216216 " avcodec_open2 failed: " ,
217217 getFFMPEGErrorStringFromErrorCode (status));
218-
219- bool supportsVariableFrameSize = avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE;
220- printf (" supportsVariableFrameSize = %d\n " , supportsVariableFrameSize);
221218
222219 // We're allocating the stream here. Streams are meant to be freed by
223220 // avformat_free_context(avFormatContext), which we call in the
@@ -232,11 +229,19 @@ void AudioEncoder::initializeEncoder(
232229 getFFMPEGErrorStringFromErrorCode (status));
233230 streamIndex_ = avStream->index ;
234231
235- // frame_size * 2 is a decent default size. FFmpeg automatically re-allocates
236- // the fifo if more space is needed.
237- auto avAudioFifo = av_audio_fifo_alloc (avCodecContext_->sample_fmt , outNumChannels_, avCodecContext_->frame_size * 2 );
238- TORCH_CHECK (avAudioFifo!= nullptr , " Couldn't create AVAudioFifo." );
239- avAudioFifo_.reset (avAudioFifo);
232+ // bool supportsVariableFrameSize =
233+ // avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE;
234+ // printf("supportsVariableFrameSize = %d\n", supportsVariableFrameSize);
235+
236+ // // frame_size * 2 is a decent default size. FFmpeg automatically
237+ // re-allocates
238+ // // the fifo if more space is needed.
239+ // auto avAudioFifo = av_audio_fifo_alloc(
240+ // avCodecContext_->sample_fmt,
241+ // outNumChannels_,
242+ // avCodecContext_->frame_size * 2);
243+ // TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo.");
244+ // avAudioFifo_.reset(avAudioFifo);
240245}
241246
242247torch::Tensor AudioEncoder::encodeToTensor () {
@@ -300,10 +305,13 @@ void AudioEncoder::encode() {
300305 // encoded frame would contain more samples than necessary and our results
301306 // wouldn't match the ffmpeg CLI.
302307 avFrame->nb_samples = numSamplesToEncode;
303- encodeInnerLoop (autoAVPacket, avFrame);
304308
305- avFrame->pts += static_cast <int64_t >(numSamplesToEncode);
309+ UniqueAVFrame convertedAVFrame = maybeConvertAVFrame (avFrame);
310+ encodeInnerLoop (autoAVPacket, convertedAVFrame);
311+
306312 numEncodedSamples += numSamplesToEncode;
313+ // TODO-ENCODING set frame pts correctly, and test against it.
314+ // avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
307315 }
308316 TORCH_CHECK (numEncodedSamples == numSamples, " Hmmmmmm something went wrong." );
309317
@@ -316,67 +324,69 @@ void AudioEncoder::encode() {
316324 getFFMPEGErrorStringFromErrorCode (status));
317325}
318326
319- void AudioEncoder::encodeInnerLoop (
320- AutoAVPacket& autoAVPacket,
321- UniqueAVFrame& srcAVFrame,
322- bool allowConvert) {
323- // TODO: Probably makes more sense to move the conversion away? It shouldn't
324- // be in inner loop in any case. We should also remove allowConvert.
325- bool mustConvert =
326- (allowConvert && srcAVFrame != nullptr &&
327- (static_cast <AVSampleFormat>(srcAVFrame->format ) !=
328- avCodecContext_->sample_fmt ||
329- getNumChannels (srcAVFrame) != outNumChannels_ ||
330- srcAVFrame->sample_rate != outSampleRate_));
331-
332- UniqueAVFrame convertedAVFrame;
333- if (mustConvert) {
334- if (!swrContext_) {
335- swrContext_.reset (createSwrContext (
336- AV_SAMPLE_FMT_FLTP,
337- avCodecContext_->sample_fmt ,
338- srcAVFrame->sample_rate ,
339- outSampleRate_,
340- srcAVFrame,
341- outNumChannels_));
342- }
343- convertedAVFrame = convertAudioAVFrameSamples (
344- swrContext_,
345- srcAVFrame,
327+ UniqueAVFrame AudioEncoder::maybeConvertAVFrame (const UniqueAVFrame& avFrame) {
328+ if (static_cast <AVSampleFormat>(avFrame->format ) ==
329+ avCodecContext_->sample_fmt &&
330+ getNumChannels (avFrame) == outNumChannels_ &&
331+ avFrame->sample_rate == outSampleRate_) {
332+ // Note: the clone references the same underlying data, it's a cheap copy.
333+ return UniqueAVFrame (av_frame_clone (avFrame.get ()));
334+ }
335+
336+ if (!swrContext_) {
337+ swrContext_.reset (createSwrContext (
338+ static_cast <AVSampleFormat>(avFrame->format ),
346339 avCodecContext_->sample_fmt ,
340+ avFrame->sample_rate ,
347341 outSampleRate_,
348- outNumChannels_);
349- if (outSampleRate_ == sampleRateInput_) {
350- TORCH_CHECK (
351- convertedAVFrame->nb_samples == srcAVFrame->nb_samples ,
352- " convertedAVFrame->nb_samples=" ,
353- convertedAVFrame->nb_samples ,
354- " differs from " ,
355- " srcAVFrame->nb_samples=" ,
356- srcAVFrame->nb_samples ,
357- " This is unexpected, please report on the TorchCodec bug tracker." );
358- }
342+ avFrame,
343+ outNumChannels_));
359344 }
360- UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
361-
362- if (avFrame != nullptr ) {
363- // TODO static cast
364- int numSamplesWritten = av_audio_fifo_write (avAudioFifo_.get (), (void **)avFrame->data , avFrame->nb_samples );
365- TORCH_CHECK (numSamplesWritten == avFrame->nb_samples , " Tried to write TODO" );
366- printf (" Writing %d samples to fifo (size = %d)\n " , avFrame->nb_samples , av_audio_fifo_size (avAudioFifo_.get ()));
367-
368- avFrame = allocateAVFrame (avCodecContext_->frame_size , outSampleRate_, outNumChannels_);
369- // TODO cast
370- int numSamplesRead = av_audio_fifo_read (avAudioFifo_.get (), (void **)avFrame->data , avFrame->nb_samples );
371- printf (" Read %d from fifo\n " , numSamplesRead);
372- TORCH_CHECK (numSamplesRead > 0 , " Tried to read TODO" );
345+ UniqueAVFrame convertedAVFrame = convertAudioAVFrameSamples (
346+ swrContext_,
347+ avFrame,
348+ avCodecContext_->sample_fmt ,
349+ outSampleRate_,
350+ outNumChannels_);
351+
352+ if (avFrame->sample_rate == outSampleRate_) {
353+ TORCH_CHECK (
354+ convertedAVFrame->nb_samples == avFrame->nb_samples ,
355+ " convertedAVFrame->nb_samples=" ,
356+ convertedAVFrame->nb_samples ,
357+ " differs from " ,
358+ " avFrame->nb_samples=" ,
359+ avFrame->nb_samples ,
360+ " This is unexpected, please report on the TorchCodec bug tracker." );
373361 }
362+ return convertedAVFrame;
363+ }
374364
375- if (avFrame != nullptr ) {
376- printf (" Sending frame with %d samples\n " , avFrame->nb_samples );
377- } else {
378- printf (" AVFrame is empty\n " );
379- }
365+ void AudioEncoder::encodeInnerLoop (
366+ AutoAVPacket& autoAVPacket,
367+ const UniqueAVFrame& avFrame) {
368+ // if (avFrame != nullptr) {
369+ // // TODO static cast
370+ // int numSamplesWritten = av_audio_fifo_write(avAudioFifo_.get(),
371+ // (void**)avFrame->data, avFrame->nb_samples);
372+ // TORCH_CHECK(numSamplesWritten == avFrame->nb_samples, "Tried to write
373+ // TODO"); printf("Writing %d samples to fifo (size = %d)\n",
374+ // avFrame->nb_samples, av_audio_fifo_size(avAudioFifo_.get()));
375+
376+ // avFrame = allocateAVFrame(avCodecContext_->frame_size, outSampleRate_,
377+ // outNumChannels_);
378+ // // TODO cast
379+ // int numSamplesRead = av_audio_fifo_read(avAudioFifo_.get(),
380+ // (void**)avFrame->data, avFrame->nb_samples); printf("Read %d from
381+ // fifo\n", numSamplesRead); TORCH_CHECK(numSamplesRead > 0, "Tried to
382+ // read TODO");
383+ // }
384+
385+ // if (avFrame != nullptr) {
386+ // printf("Sending frame with %d samples\n", avFrame->nb_samples);
387+ // } else{
388+ // printf("AVFrame is empty\n");
389+ // }
380390 auto status = avcodec_send_frame (avCodecContext_.get (), avFrame.get ());
381391 TORCH_CHECK (
382392 status == AVSUCCESS,
@@ -434,13 +444,12 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
434444 swrContext_.get (), avFrame->data , avFrame->nb_samples , NULL , 0 );
435445 avFrame->nb_samples = actualNumRemainingSamples;
436446
437- encodeInnerLoop (autoAVPacket, avFrame, false );
447+ encodeInnerLoop (autoAVPacket, avFrame);
438448}
439449
440450void AudioEncoder::flushBuffers () {
441451 AutoAVPacket autoAVPacket;
442452 maybeFlushSwrBuffers (autoAVPacket);
443- auto zob = UniqueAVFrame (nullptr );
444- encodeInnerLoop (autoAVPacket, zob);
453+ encodeInnerLoop (autoAVPacket, UniqueAVFrame (nullptr ));
445454}
446455} // namespace facebook::torchcodec
0 commit comments