meta-pytorch
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/decoders/_core/FFMPEGCommon.cpp‎
Lines changed: 11 additions & 4 deletions b/‎src/torchcodec/decoders/_core/FFMPEGCommon.cpp‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎src/torchcodec/decoders/_core/FFMPEGCommon.h‎
Lines changed: 2 additions & 1 deletion b/‎src/torchcodec/decoders/_core/FFMPEGCommon.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/torchcodec/decoders/_core/VideoDecoder.cpp‎
Lines changed: 124 additions & 7 deletions b/‎src/torchcodec/decoders/_core/VideoDecoder.cpp‎
Lines changed: 124 additions & 7 deletions
diff --git a/‎src/torchcodec/decoders/_core/VideoDecoder.h‎
Lines changed: 15 additions & 0 deletions b/‎src/torchcodec/decoders/_core/VideoDecoder.h‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/torchcodec/decoders/_core/VideoDecoderOps.cpp‎
Lines changed: 21 additions & 2 deletions b/‎src/torchcodec/decoders/_core/VideoDecoderOps.cpp‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎src/torchcodec/decoders/_core/VideoDecoderOps.h‎
Lines changed: 11 additions & 0 deletions b/‎src/torchcodec/decoders/_core/VideoDecoderOps.h‎
Lines changed: 11 additions & 0 deletions
@@ -152,7 +152,7 @@ format you want. Refer to Nvidia's GPU support matrix for more details
    the CUDA Toolkit.
 
 2. Install or compile FFmpeg with NVDEC support.
-   TorchCodec with CUDA should work with FFmpeg versions in [5, 7].
+   TorchCodec with CUDA should work with FFmpeg versions in [4, 7].
 
    If FFmpeg is not already installed, or you need a more recent version, an
    easy way to install it is to use `conda`:
 
@@ -1,5 +1,5 @@
 [project]
-name = "TorchCodec"
+name = "torchcodec"
 description = "A video decoder for PyTorch"
 readme = "README.md"
 requires-python = ">=3.8"
 
@@ -60,15 +60,22 @@ int64_t getDuration(const AVFrame* frame) {
 #endif
 }
 
-int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext) {
+int getNumChannels(const AVFrame* avFrame) {
 #if LIBAVFILTER_VERSION_MAJOR > 8 || \
     (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
-  int numChannels = avCodecContext->ch_layout.nb_channels;
+  return avFrame->ch_layout.nb_channels;
 #else
-  int numChannels = avCodecContext->channels;
+  return av_get_channel_layout_nb_channels(avFrame->channel_layout);
 #endif
+}
 
-  return static_cast<int64_t>(numChannels);
+int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
+#if LIBAVFILTER_VERSION_MAJOR > 8 || \
+    (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
+  return avCodecContext->ch_layout.nb_channels;
+#else
+  return avCodecContext->channels;
+#endif
 }
 
 AVIOBytesContext::AVIOBytesContext(
 
@@ -139,7 +139,8 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
 int64_t getDuration(const UniqueAVFrame& frame);
 int64_t getDuration(const AVFrame* frame);
 
-int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext);
+int getNumChannels(const AVFrame* avFrame);
+int getNumChannels(const UniqueAVCodecContext& avCodecContext);
 
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();
 
@@ -8,6 +8,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <iostream>
+#include <limits>
 #include <sstream>
 #include <stdexcept>
 #include <string_view>
@@ -552,7 +553,8 @@ void VideoDecoder::addAudioStream(int streamIndex) {
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
   streamMetadata.sampleRate =
       static_cast<int64_t>(streamInfo.codecContext->sample_rate);
-  streamMetadata.numChannels = getNumChannels(streamInfo.codecContext);
+  streamMetadata.numChannels =
+      static_cast<int64_t>(getNumChannels(streamInfo.codecContext));
 }
 
 // --------------------------------------------------------------------------
@@ -567,6 +569,7 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
 
 VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
   AVFrameStream avFrameStream = decodeAVFrame(
       [this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
   return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
@@ -685,6 +688,7 @@ VideoDecoder::getFramesInRange(int64_t start, int64_t stop, int64_t step) {
 }
 
 VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
   double frameStartTime =
       ptsToSeconds(streamInfo.lastDecodedAvFramePts, streamInfo.timeBase);
@@ -757,7 +761,6 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
     double startSeconds,
     double stopSeconds) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
-
   const auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
   TORCH_CHECK(
@@ -835,6 +838,74 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
   return frameBatchOutput;
 }
 
+// Decodes audio from the active stream for the time range that starts at
+// startSeconds and ends at stopSecondsOptional, or at the end of the stream
+// when no stop is given.
+//
+// Frames are returned whole: decoding starts at the first frame whose end is
+// after startSeconds, and stops once the last decoded frame contains
+// stopSeconds or the end of the file is reached. The decoded frames are
+// concatenated along dimension 1 and ptsSeconds is the smallest pts among
+// them, in seconds.
+//
+// Fails a TORCH_CHECK when startSeconds > stopSeconds, or when no frame could
+// be decoded. Returns an empty tensor when startSeconds == stopSeconds.
+VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
+    double startSeconds,
+    std::optional<double> stopSecondsOptional) {
+  validateActiveStream(AVMEDIA_TYPE_AUDIO);
+
+  // A missing stop means "decode until the end of the stream". This value is
+  // only used for the range validation below.
+  const double stopSeconds =
+      stopSecondsOptional.value_or(std::numeric_limits<double>::max());
+
+  TORCH_CHECK(
+      startSeconds <= stopSeconds,
+      "Start seconds (" + std::to_string(startSeconds) +
+          ") must be less than or equal to stop seconds (" +
+          std::to_string(stopSeconds) + ").");
+
+  if (startSeconds == stopSeconds) {
+    // For consistency with video
+    return AudioFramesOutput{torch::empty({0}), 0.0};
+  }
+
+  StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
+
+  const auto startPts = secondsToClosestPts(startSeconds, streamInfo.timeBase);
+  if (startPts < streamInfo.lastDecodedAvFramePts +
+          streamInfo.lastDecodedAvFrameDuration) {
+    // If we need to seek backwards, then we have to seek back to the beginning
+    // of the stream.
+    // TODO-AUDIO: document why this is needed in a big comment.
+    // NOTE(review): INT64_MIN is a pts-style sentinel passed to an API that
+    // is documented as taking seconds - confirm the conversion is
+    // well-defined.
+    setCursorPtsInSeconds(INT64_MIN);
+  }
+
+  // TODO-AUDIO Pre-allocate a long-enough tensor instead of creating a vec +
+  // cat(). This would save a copy. We know the duration of the output and the
+  // sample rate, so in theory we know the number of output samples.
+  std::vector<torch::Tensor> frames;
+
+  double firstFramePtsSeconds = std::numeric_limits<double>::max();
+  // When no stop was requested, use the largest pts directly instead of
+  // converting the "max double" sentinel: that many seconds cannot be
+  // represented as an integer pts.
+  const int64_t stopPts = stopSecondsOptional.has_value()
+      ? secondsToClosestPts(*stopSecondsOptional, streamInfo.timeBase)
+      : std::numeric_limits<int64_t>::max();
+  bool finished = false;
+  while (!finished) {
+    try {
+      // Accept the first frame that ends after startPts; every later frame
+      // satisfies the same condition.
+      AVFrameStream avFrameStream = decodeAVFrame([startPts](AVFrame* avFrame) {
+        return startPts < avFrame->pts + getDuration(avFrame);
+      });
+      // TODO: it's not great that we are getting a FrameOutput, which is
+      // intended for videos. We should consider bypassing
+      // convertAVFrameToFrameOutput and directly call
+      // convertAudioAVFrameToFrameOutputOnCPU.
+      auto frameOutput = convertAVFrameToFrameOutput(avFrameStream);
+      firstFramePtsSeconds =
+          std::min(firstFramePtsSeconds, frameOutput.ptsSeconds);
+      frames.push_back(frameOutput.data);
+    } catch (const EndOfFileException&) {
+      // Reaching the end of the file is the normal way to stop when the
+      // requested range extends past the last frame.
+      finished = true;
+    }
+
+    // If stopSeconds is in [begin, end] of the last decoded frame, we should
+    // stop decoding more frames. Note that if we were to use [begin, end),
+    // which may seem more natural, then we would decode the frame starting at
+    // stopSeconds, which isn't what we want!
+    auto lastDecodedAvFrameEnd = streamInfo.lastDecodedAvFramePts +
+        streamInfo.lastDecodedAvFrameDuration;
+    finished |= (streamInfo.lastDecodedAvFramePts) <= stopPts &&
+        (stopPts <= lastDecodedAvFrameEnd);
+  }
+
+  // torch::cat() rejects an empty list with an unhelpful message, so report
+  // the likely cause explicitly.
+  TORCH_CHECK(
+      !frames.empty(),
+      "No audio frames were decoded. ",
+      "This is probably because start_seconds is too high? ",
+      "Current value is ",
+      startSeconds);
+
+  return AudioFramesOutput{torch::cat(frames, 1), firstFramePtsSeconds};
+}
+
+
 // --------------------------------------------------------------------------
 // SEEKING APIs
 // --------------------------------------------------------------------------
@@ -871,6 +942,12 @@ I    P     P    P    I    P    P    P    I    P    P    I    P    P    I    P
 (2) is more efficient than (1) if there is an I frame between x and y.
 */
 bool VideoDecoder::canWeAvoidSeeking() const {
+  const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
+  if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
+    // For audio, we only need to seek if a backwards seek was requested within
+    // getFramesPlayedInRangeAudio(), when setCursorPtsInSeconds() was called.
+    return !cursorWasJustSet_;
+  }
   int64_t lastDecodedAvFramePts =
       streamInfos_.at(activeStreamIndex_).lastDecodedAvFramePts;
   if (cursor_ < lastDecodedAvFramePts) {
@@ -897,7 +974,7 @@ bool VideoDecoder::canWeAvoidSeeking() const {
 // AVFormatContext if it is needed. We can skip seeking in certain cases. See
 // the comment of canWeAvoidSeeking() for details.
 void VideoDecoder::maybeSeekToBeforeDesiredPts() {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
 
   decodeStats_.numSeeksAttempted++;
@@ -942,7 +1019,7 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
 
 VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
     std::function<bool(AVFrame*)> filterFunction) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
 
   resetDecodeStats();
 
@@ -1071,13 +1148,14 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
   AVFrame* avFrame = avFrameStream.avFrame.get();
   frameOutput.streamIndex = streamIndex;
   auto& streamInfo = streamInfos_[streamIndex];
-  TORCH_CHECK(streamInfo.stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO);
   frameOutput.ptsSeconds = ptsToSeconds(
       avFrame->pts, formatContext_->streams[streamIndex]->time_base);
   frameOutput.durationSeconds = ptsToSeconds(
       getDuration(avFrame), formatContext_->streams[streamIndex]->time_base);
-  // TODO: we should fold preAllocatedOutputTensor into AVFrameStream.
-  if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
+  if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
+    convertAudioAVFrameToFrameOutputOnCPU(
+        avFrameStream, frameOutput, preAllocatedOutputTensor);
+  } else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
     convertAVFrameToFrameOutputOnCPU(
         avFrameStream, frameOutput, preAllocatedOutputTensor);
   } else if (streamInfo.videoStreamOptions.device.type() == torch::kCUDA) {
@@ -1253,6 +1331,45 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
       filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
 }
 
+// Copies the samples of a decoded audio AVFrame into a newly allocated
+// float32 tensor of shape (numChannels, numSamples) and stores it in
+// frameOutput.data.
+//
+// Only AV_SAMPLE_FMT_FLTP is handled for now; any other sample format fails a
+// TORCH_CHECK. Passing a pre-allocated output tensor is not supported yet and
+// also fails a TORCH_CHECK.
+void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
+    VideoDecoder::AVFrameStream& avFrameStream,
+    FrameOutput& frameOutput,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  TORCH_CHECK(
+      !preAllocatedOutputTensor.has_value(),
+      "pre-allocated audio tensor not supported yet.");
+
+  const AVFrame* avFrame = avFrameStream.avFrame.get();
+
+  auto numSamples = avFrame->nb_samples; // per channel
+  auto numChannels = getNumChannels(avFrame);
+  torch::Tensor outputData =
+      torch::empty({numChannels, numSamples}, torch::kFloat32);
+
+  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
+  // TODO-AUDIO Implement all formats.
+  switch (format) {
+    case AV_SAMPLE_FMT_FLTP: {
+      // One source buffer per channel (extended_data[channel]); each one is
+      // copied into the matching row of the output tensor.
+      uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
+      auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
+      for (int channel = 0; channel < numChannels;
+           ++channel, outputChannelData += numBytesPerChannel) {
+        memcpy(
+            outputChannelData,
+            avFrame->extended_data[channel],
+            numBytesPerChannel);
+      }
+      break;
+    }
+    default: {
+      // av_get_sample_fmt_name() returns NULL for unrecognized formats; never
+      // stream a null C string into the error message.
+      const char* formatName = av_get_sample_fmt_name(format);
+      TORCH_CHECK(
+          false,
+          "Unsupported audio format (yet!): ",
+          formatName != nullptr ? formatName : "unknown");
+    }
+  }
+  frameOutput.data = outputData;
+}
+
+
 // --------------------------------------------------------------------------
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
 
@@ -170,6 +170,11 @@ class VideoDecoder {
         const StreamMetadata& streamMetadata);
   };
 
+  struct AudioFramesOutput { // result of getFramesPlayedInRangeAudio()
+    torch::Tensor data; // concatenated frames; shape is (numChannels, numSamples)
+    double ptsSeconds; // pts of the first returned frame, in seconds
+  };
+
   // Places the cursor at the first frame on or after the position in seconds.
   // Calling getNextFrame() will return the first frame at
   // or after this position.
@@ -221,6 +226,11 @@ class VideoDecoder {
       double startSeconds,
       double stopSeconds);
 
+  // TODO-AUDIO: Should accept sampleRate
+  AudioFramesOutput getFramesPlayedInRangeAudio(
+      double startSeconds,
+      std::optional<double> stopSecondsOptional = std::nullopt);
+
   class EndOfFileException : public std::runtime_error {
    public:
     explicit EndOfFileException(const std::string& msg)
@@ -379,6 +389,11 @@ class VideoDecoder {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
+  void convertAudioAVFrameToFrameOutputOnCPU(
+      AVFrameStream& avFrameStream,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
   torch::Tensor convertAVFrameToTensorUsingFilterGraph(const AVFrame* avFrame);
 
   int convertAVFrameToTensorUsingSwsScale(
 
@@ -25,8 +25,7 @@ namespace facebook::torchcodec {
 //   https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native#readme
 TORCH_LIBRARY(torchcodec_ns, m) {
   m.impl_abstract_pystub(
-      "torchcodec.decoders._core.video_decoder_ops",
-      "//pytorch/torchcodec:torchcodec");
+      "torchcodec.decoders._core.ops", "//pytorch/torchcodec:torchcodec");
   m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
@@ -48,6 +47,8 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "get_frames_in_range(Tensor(a!) decoder, *, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_by_pts_in_range(Tensor(a!) decoder, *, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
+  m.def(
+      "get_frames_by_pts_in_range_audio(Tensor(a!) decoder, *, float start_seconds, float? stop_seconds) -> (Tensor, Tensor)");
   m.def(
       "get_frames_by_pts(Tensor(a!) decoder, *, float[] timestamps) -> (Tensor, Tensor, Tensor)");
   m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
@@ -93,6 +94,13 @@ OpsFrameBatchOutput makeOpsFrameBatchOutput(
   return std::make_tuple(batch.data, batch.ptsSeconds, batch.durationSeconds);
 }
 
+// Converts the decoder-level audio output into the tuple returned by the ops:
+// the concatenated frames data first, then the pts of the first frame (in
+// seconds) wrapped in a float64 tensor.
+OpsAudioFramesOutput makeOpsAudioFramesOutput(
+    VideoDecoder::AudioFramesOutput& audioFrames) {
+  const torch::Tensor firstPtsSeconds =
+      torch::tensor(audioFrames.ptsSeconds, torch::dtype(torch::kFloat64));
+  return std::make_tuple(audioFrames.data, firstPtsSeconds);
+}
+
 VideoDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   if (seekMode == "exact") {
     return VideoDecoder::SeekMode::exact;
@@ -289,6 +297,16 @@ OpsFrameBatchOutput get_frames_by_pts_in_range(
   return makeOpsFrameBatchOutput(result);
 }
 
+// Op entry point: decodes the audio played between start_seconds and
+// stop_seconds (no stop when stop_seconds is empty) using the decoder wrapped
+// in `decoder`, and returns the (data, pts_seconds) tensor pair.
+OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
+    at::Tensor& decoder,
+    double start_seconds,
+    std::optional<double> stop_seconds) {
+  auto unwrappedDecoder = unwrapTensorToGetDecoder(decoder);
+  // Kept as a named local: makeOpsAudioFramesOutput() takes a non-const
+  // reference.
+  auto decodedAudio = unwrappedDecoder->getFramesPlayedInRangeAudio(
+      start_seconds, stop_seconds);
+  return makeOpsAudioFramesOutput(decodedAudio);
+}
+
 std::string quoteValue(const std::string& value) {
   // Wraps `value` in double quotes; the content itself is not escaped.
   std::string quoted;
   quoted.reserve(value.size() + 2);
   quoted.append("\"").append(value).append("\"");
   return quoted;
 }
@@ -540,6 +558,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("get_frames_at_indices", &get_frames_at_indices);
   m.impl("get_frames_in_range", &get_frames_in_range);
   m.impl("get_frames_by_pts_in_range", &get_frames_by_pts_in_range);
+  m.impl("get_frames_by_pts_in_range_audio", &get_frames_by_pts_in_range_audio);
   m.impl("get_frames_by_pts", &get_frames_by_pts);
   m.impl("_test_frame_pts_equality", &_test_frame_pts_equality);
   m.impl(
 
@@ -74,6 +74,12 @@ using OpsFrameOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 //   single float.
 using OpsFrameBatchOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 
+// The elements of this tuple are all tensors that represent the concatenation
+// of multiple audio frames:
+//   1. The frames data (concatenated)
+//   2. A single float value for the pts of the first frame, in seconds.
+using OpsAudioFramesOutput = std::tuple<at::Tensor, at::Tensor>;
+
 // Return the frame that is visible at a given timestamp in seconds. Each frame
 // in FFMPEG has a presentation timestamp and a duration. The frame visible at a
 // given timestamp T has T >= PTS and T < PTS + Duration.
@@ -112,6 +118,11 @@ OpsFrameBatchOutput get_frames_by_pts_in_range(
     double start_seconds,
     double stop_seconds);
 
+OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
+    at::Tensor& decoder,
+    double start_seconds,
+    std::optional<double> stop_seconds = std::nullopt);
+
 // For testing only. We need to implement this operation as a core library
 // function because what we're testing is round-tripping pts values as
 // double-precision floating point numbers from C++ to Python and back to C++.