WIP: hoist SwrContext creation into the caller of convertAudioAVFrameSampleFormatAndSampleRate

NicolasHug · NicolasHug · commit f525848b95d5 · 2025-04-09T11:04:28.000+01:00
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1345,20 +1345,29 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
       static_cast<AVSampleFormat>(srcAVFrame->format);
   AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
 
+  StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
   int sourceSampleRate = srcAVFrame->sample_rate;
   int desiredSampleRate =
-      streamInfos_[activeStreamIndex_].audioStreamOptions.sampleRate.value_or(
-          sourceSampleRate);
+      streamInfo.audioStreamOptions.sampleRate.value_or(sourceSampleRate);
 
   bool mustConvert =
       (sourceSampleFormat != desiredSampleFormat ||
        sourceSampleRate != desiredSampleRate);
 
   UniqueAVFrame convertedAVFrame;
   if (mustConvert) {
+    if (!streamInfo.swrContext) {
+      streamInfo.swrContext.reset(createSwrContext(
+          streamInfo.codecContext,
+          sourceSampleFormat,
+          desiredSampleFormat,
+          sourceSampleRate,
+          desiredSampleRate));
+    }
+
     convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
+        streamInfo.swrContext,
         srcAVFrame,
-        sourceSampleFormat,
         desiredSampleFormat,
         sourceSampleRate,
         desiredSampleRate);
@@ -1394,22 +1403,11 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
 }
 
 UniqueAVFrame SingleStreamDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
+    const UniqueSwrContext& swrContext,
     const UniqueAVFrame& srcAVFrame,
-    AVSampleFormat sourceSampleFormat,
     AVSampleFormat desiredSampleFormat,
     int sourceSampleRate,
     int desiredSampleRate) {
-  auto& streamInfo = streamInfos_[activeStreamIndex_];
-
-  if (!streamInfo.swrContext) {
-    streamInfo.swrContext.reset(createSwrContext(
-        streamInfo.codecContext,
-        sourceSampleFormat,
-        desiredSampleFormat,
-        sourceSampleRate,
-        desiredSampleRate));
-  }
-
   UniqueAVFrame convertedAVFrame(av_frame_alloc());
   TORCH_CHECK(
       convertedAVFrame,
@@ -1428,7 +1426,7 @@ UniqueAVFrame SingleStreamDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
     // output samples, but empirically `av_rescale_rnd()` seems to provide a
     // tighter bound.
     convertedAVFrame->nb_samples = av_rescale_rnd(
-        swr_get_delay(streamInfo.swrContext.get(), sourceSampleRate) +
+        swr_get_delay(swrContext.get(), sourceSampleRate) +
             srcAVFrame->nb_samples,
         desiredSampleRate,
         sourceSampleRate,
@@ -1444,7 +1442,7 @@ UniqueAVFrame SingleStreamDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
       getFFMPEGErrorStringFromErrorCode(status));
 
   auto numConvertedSamples = swr_convert(
-      streamInfo.swrContext.get(),
+      swrContext.get(),
       convertedAVFrame->data,
       convertedAVFrame->nb_samples,
       static_cast<const uint8_t**>(
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -288,8 +288,8 @@ class SingleStreamDecoder {
       torch::Tensor& outputTensor);
 
   UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
+      const UniqueSwrContext& swrContext,
       const UniqueAVFrame& srcAVFrame,
-      AVSampleFormat sourceSampleFormat,
       AVSampleFormat desiredSampleFormat,
       int sourceSampleRate,
       int desiredSampleRate);