@@ -169,6 +169,20 @@ void VideoDecoder::initializeDecoder() {
       }
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
+      // TODO-AUDIO Remove this, we shouldn't need it. We should probably write
+      // a pts-based "getFramesPlayedInRange" from scratch without going back to
+      // indices.
+      int numSamplesPerFrame = avStream->codecpar->frame_size;
+      int sampleRate = avStream->codecpar->sample_rate;
+      if (numSamplesPerFrame > 0 && sampleRate > 0) {
+        // This should allow the approximate mode to do its magic.
+        // fps is numFrames / duration where
+        // - duration = numSamplesTotal / sampleRate and
+        // - numSamplesTotal = numSamplesPerFrame * numFrames
+        // so fps = numFrames * sampleRate / (numSamplesPerFrame * numFrames)
+        streamMetadata.averageFps =
+            static_cast<double>(sampleRate) / numSamplesPerFrame;
+      }
       containerMetadata_.numAudioStreams++;
     }
 
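The comment block in this hunk derives an average "fps" for fixed-frame-size audio, and the whole derivation reduces to a single division. A minimal standalone sketch with illustrative values (the 1024-sample frame size and 44100 Hz rate are assumptions for the example, not taken from this diff):

```cpp
#include <iostream>

int main() {
  // Illustrative values only: many AAC streams use a fixed frame_size of
  // 1024 samples per channel at a 44100 Hz sample rate.
  int numSamplesPerFrame = 1024;
  int sampleRate = 44100;

  // fps = numFrames * sampleRate / (numSamplesPerFrame * numFrames)
  //     = sampleRate / numSamplesPerFrame
  double averageFps = static_cast<double>(sampleRate) / numSamplesPerFrame;

  std::cout << averageFps << "\n"; // ~43.07 audio frames (packets) per second
  return 0;
}
```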
@@ -549,8 +563,20 @@ void VideoDecoder::addAudioStream(int streamIndex) {
   addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
 
   auto& streamInfo = streamInfos_[activeStreamIndex_];
+
+  // TODO-AUDIO
+  TORCH_CHECK(
+      streamInfo.codecContext->frame_size > 0,
+      "No support for audio variable framerate yet.");
+
   auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
+
+  // TODO-AUDIO
+  TORCH_CHECK(
+      streamMetadata.averageFps.has_value(),
+      "frame_size or sample_rate aren't known. Cannot decode.");
+
   streamMetadata.sampleRate =
       static_cast<int64_t>(streamInfo.codecContext->sample_rate);
   streamMetadata.numChannels = getNumChannels(streamInfo.codecContext);
@@ -562,7 +588,7 @@ void VideoDecoder::addAudioStream(int streamIndex) {
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
   auto output = getNextFrameInternal();
-  output.data = maybePermuteHWC2CHW(output.data);
+  output.data = maybePermuteOutputTensor(output.data);
   return output;
 }
 
@@ -576,6 +602,7 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
 }
 
 VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex(int64_t frameIndex) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
   auto frameOutput = getFrameAtIndexInternal(frameIndex);
   frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
   return frameOutput;
@@ -584,7 +611,7 @@ VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex(int64_t frameIndex) {
 VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndexInternal(
     int64_t frameIndex,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
 
   const auto& streamInfo = streamInfos_[activeStreamIndex_];
   const auto& streamMetadata =
@@ -688,6 +715,7 @@ VideoDecoder::getFramesInRange(int64_t start, int64_t stop, int64_t step) {
 }
 
 VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
   double frameStartTime =
       ptsToSeconds(streamInfo.lastDecodedAvFramePts, streamInfo.timeBase);
@@ -759,19 +787,29 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedAt(
 VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
     double startSeconds,
     double stopSeconds) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
+  // Because we currently never seek with audio streams, we prevent users from
+  // calling this method twice. We could allow multiple calls in the future.
+  // Assuming 2 consecutive calls:
+  // ```
+  // getFramesPlayedInRange(startSeconds1, stopSeconds1);
+  // getFramesPlayedInRange(startSeconds2, stopSeconds2);
+  // ```
+  // We would need to seek back to 0 iff startSeconds2 <= stopSeconds1. This
+  // logic is not implemented for now, so we just error.
+
+  TORCH_CHECK(
+      streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO ||
+          !alreadyCalledGetFramesPlayedInRange_,
+      "Can only decode once with audio stream. Re-create a decoder object if needed.");
+  alreadyCalledGetFramesPlayedInRange_ = true;
 
-  const auto& streamMetadata =
-      containerMetadata_.allStreamMetadata[activeStreamIndex_];
   TORCH_CHECK(
       startSeconds <= stopSeconds,
       "Start seconds (" + std::to_string(startSeconds) +
           ") must be less than or equal to stop seconds (" +
           std::to_string(stopSeconds) + ".");
 
-  const auto& streamInfo = streamInfos_[activeStreamIndex_];
-  const auto& videoStreamOptions = streamInfo.videoStreamOptions;
-
   // Special case needed to implement a half-open range. At first glance, this
   // may seem unnecessary, as our search for stopFrame can return the end, and
   // we don't include stopFrameIndex in our output. However, consider the
@@ -790,11 +828,14 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
   // values of the intervals will map to the same frame indices below. Hence, we
   // need this special case below.
   if (startSeconds == stopSeconds) {
-    FrameBatchOutput frameBatchOutput(0, videoStreamOptions, streamMetadata);
-    frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
+    FrameBatchOutput frameBatchOutput = makeFrameBatchOutput(0);
+    frameBatchOutput.data = maybePermuteOutputTensor(frameBatchOutput.data);
     return frameBatchOutput;
   }
 
+  const auto& streamMetadata =
+      containerMetadata_.allStreamMetadata[activeStreamIndex_];
+
   double minSeconds = getMinSeconds(streamMetadata);
   double maxSeconds = getMaxSeconds(streamMetadata);
   TORCH_CHECK(
@@ -825,15 +866,14 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
   int64_t stopFrameIndex = secondsToIndexUpperBound(stopSeconds);
   int64_t numFrames = stopFrameIndex - startFrameIndex;
 
-  FrameBatchOutput frameBatchOutput(
-      numFrames, videoStreamOptions, streamMetadata);
+  FrameBatchOutput frameBatchOutput = makeFrameBatchOutput(numFrames);
   for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) {
     FrameOutput frameOutput =
         getFrameAtIndexInternal(i, frameBatchOutput.data[f]);
     frameBatchOutput.ptsSeconds[f] = frameOutput.ptsSeconds;
     frameBatchOutput.durationSeconds[f] = frameOutput.durationSeconds;
   }
-  frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
+  frameBatchOutput.data = maybePermuteOutputTensor(frameBatchOutput.data);
 
   return frameBatchOutput;
 }
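Taken together, the hunks above make getFramesPlayedInRange() a one-shot operation when the active stream is audio. A hypothetical caller-side sketch (decoder construction and the addAudioStream() call are elided; only the method name comes from this diff):

```cpp
// Sketch only: `decoder` is assumed to be a VideoDecoder whose active
// stream is an audio stream.
auto first = decoder.getFramesPlayedInRange(0.0, 2.0); // OK: first call
// Any second call throws, even for a non-overlapping range, because the
// guard is a single boolean rather than a range-overlap test:
//   "Can only decode once with audio stream. Re-create a decoder object
//    if needed."
auto second = decoder.getFramesPlayedInRange(5.0, 6.0); // throws
```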
@@ -872,8 +912,12 @@ I P P P I P P P I P P I P P I P
 (2) is more efficient than (1) if there is an I frame between x and y.
 */
 bool VideoDecoder::canWeAvoidSeeking(int64_t targetPts) const {
-  int64_t lastDecodedAvFramePts =
-      streamInfos_.at(activeStreamIndex_).lastDecodedAvFramePts;
+  const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
+  if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
+    return true;
+  }
+
+  int64_t lastDecodedAvFramePts = streamInfo.lastDecodedAvFramePts;
   if (targetPts < lastDecodedAvFramePts) {
     // We can never skip a seek if we are seeking backwards.
     return false;
@@ -898,7 +942,7 @@ bool VideoDecoder::canWeAvoidSeeking(int64_t targetPts) const {
 // AVFormatContext if it is needed. We can skip seeking in certain cases. See
 // the comment of canWeAvoidSeeking() for details.
 void VideoDecoder::maybeSeekToBeforeDesiredPts() {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
 
   int64_t desiredPts =
@@ -945,7 +989,7 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
 
 VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
     std::function<bool(AVFrame*)> filterFunction) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
 
   resetDecodeStats();
 
@@ -1075,13 +1119,14 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
   AVFrame* avFrame = avFrameStream.avFrame.get();
   frameOutput.streamIndex = streamIndex;
   auto& streamInfo = streamInfos_[streamIndex];
-  TORCH_CHECK(streamInfo.stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO);
   frameOutput.ptsSeconds = ptsToSeconds(
       avFrame->pts, formatContext_->streams[streamIndex]->time_base);
   frameOutput.durationSeconds = ptsToSeconds(
       getDuration(avFrame), formatContext_->streams[streamIndex]->time_base);
-  // TODO: we should fold preAllocatedOutputTensor into AVFrameStream.
-  if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
+  if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
+    convertAudioAVFrameToFrameOutputOnCPU(
+        avFrameStream, frameOutput, preAllocatedOutputTensor);
+  } else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
     convertAVFrameToFrameOutputOnCPU(
         avFrameStream, frameOutput, preAllocatedOutputTensor);
   } else if (streamInfo.videoStreamOptions.device.type() == torch::kCUDA) {
@@ -1257,6 +1302,48 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
       filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
 }
 
+void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
+    VideoDecoder::AVFrameStream& avFrameStream,
+    FrameOutput& frameOutput,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  const AVFrame* avFrame = avFrameStream.avFrame.get();
+
+  auto numSamples = avFrame->nb_samples; // per channel
+  auto numChannels = getNumChannels(avFrame);
+
+  // TODO-AUDIO: dtype should be format-dependent
+  // TODO-AUDIO rename data to something else
+  torch::Tensor data;
+  if (preAllocatedOutputTensor.has_value()) {
+    data = preAllocatedOutputTensor.value();
+  } else {
+    data = torch::empty({numChannels, numSamples}, torch::kFloat32);
+  }
+
+  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
+  // TODO Implement all formats
+  switch (format) {
+    case AV_SAMPLE_FMT_FLTP: {
+      uint8_t* outputChannelData = static_cast<uint8_t*>(data.data_ptr());
+      auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
+      for (auto channel = 0; channel < numChannels;
+           ++channel, outputChannelData += numBytesPerChannel) {
+        memcpy(
+            outputChannelData,
+            avFrame->extended_data[channel],
+            numBytesPerChannel);
+      }
+      break;
+    }
+    default:
+      TORCH_CHECK(
+          false,
+          "Unsupported audio format (yet!): ",
+          av_get_sample_fmt_name(format));
+  }
+  frameOutput.data = data;
+}
+
 // --------------------------------------------------------------------------
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
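The switch above only handles AV_SAMPLE_FMT_FLTP, where each channel is a contiguous plane in extended_data[channel], so a per-channel memcpy yields the (numChannels, numSamples) layout directly. For comparison, a hypothetical helper for the interleaved float format (AV_SAMPLE_FMT_FLT), which is not part of this diff, would have to gather each channel with a stride:

```cpp
// Hypothetical sketch only: de-interleave AV_SAMPLE_FMT_FLT samples
// (laid out as c0s0 c1s0 c0s1 c1s1 ... in extended_data[0]) into the
// same (numChannels, numSamples) layout the FLTP branch produces.
void deinterleaveFloatSamples(
    const float* interleaved, // would come from avFrame->extended_data[0]
    float* output,            // contiguous (numChannels, numSamples) buffer
    int numChannels,
    int numSamples) {
  for (int channel = 0; channel < numChannels; ++channel) {
    for (int sample = 0; sample < numSamples; ++sample) {
      output[channel * numSamples + sample] =
          interleaved[sample * numChannels + channel];
    }
  }
}
```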
@@ -1275,6 +1362,41 @@ VideoDecoder::FrameBatchOutput::FrameBatchOutput(
       height, width, videoStreamOptions.device, numFrames);
 }
 
+VideoDecoder::FrameBatchOutput::FrameBatchOutput(
+    int64_t numFrames,
+    int64_t numChannels,
+    int64_t numSamples)
+    : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
+      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
+  // TODO handle dtypes other than float
+  auto tensorOptions = torch::TensorOptions()
+                           .dtype(torch::kFloat32)
+                           .layout(torch::kStrided)
+                           .device(torch::kCPU);
+  data = torch::empty({numFrames, numChannels, numSamples}, tensorOptions);
+}
+
+VideoDecoder::FrameBatchOutput VideoDecoder::makeFrameBatchOutput(
+    int64_t numFrames) {
+  const auto& streamInfo = streamInfos_[activeStreamIndex_];
+  if (streamInfo.avMediaType == AVMEDIA_TYPE_VIDEO) {
+    const auto& videoStreamOptions = streamInfo.videoStreamOptions;
+    const auto& streamMetadata =
+        containerMetadata_.allStreamMetadata[activeStreamIndex_];
+    return FrameBatchOutput(numFrames, videoStreamOptions, streamMetadata);
+  } else {
+    // TODO-AUDIO
+    // We asserted that frame_size is non-zero when we added the stream, but it
+    // may not always be the case.
+    // When it's 0, we can't pre-allocate the output tensor as we don't know the
+    // number of samples per channel, and it may be non-constant. We'll have to
+    // find a way to make the batch-APIs work without pre-allocation.
+    int64_t numSamples = streamInfo.codecContext->frame_size;
+    int64_t numChannels = getNumChannels(streamInfo.codecContext);
+    return FrameBatchOutput(numFrames, numChannels, numSamples);
+  }
+}
+
 torch::Tensor allocateEmptyHWCTensor(
     int height,
     int width,
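To make the audio allocation above concrete, here is a small self-contained sketch of the tensor shapes the three-argument constructor produces, with illustrative values (10 frames of a stereo stream whose frame_size is 1024; these numbers are assumptions for the example):

```cpp
#include <torch/torch.h>

int main() {
  int64_t numFrames = 10, numChannels = 2, numSamples = 1024;

  // Mirrors the allocations in the audio FrameBatchOutput constructor.
  torch::Tensor data =
      torch::empty({numFrames, numChannels, numSamples}, torch::kFloat32);
  torch::Tensor ptsSeconds = torch::empty({numFrames}, torch::kFloat64);
  torch::Tensor durationSeconds = torch::empty({numFrames}, torch::kFloat64);

  TORCH_CHECK(data.sizes() == torch::IntArrayRef({10, 2, 1024}));
  TORCH_CHECK(ptsSeconds.sizes() == torch::IntArrayRef({10}));
  return 0;
}
```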
@@ -1296,6 +1418,17 @@ torch::Tensor allocateEmptyHWCTensor(
   }
 }
 
+torch::Tensor VideoDecoder::maybePermuteOutputTensor(
+    torch::Tensor& outputTensor) {
+  if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
+    return maybePermuteHWC2CHW(outputTensor);
+  } else {
+    // No need to do anything for audio. We always return (numChannels,
+    // numSamples) or (numFrames, numChannels, numSamples)
+    return outputTensor;
+  }
+}
+
 // Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so.
 // The [N] leading batch-dimension is optional i.e. the input tensor can be 3D
 // or 4D.