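MOOOOORE audio metadata: expose num_channels, drop the audio averageFps approximation, and make addStream() take only a device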

NicolasHug · NicolasHug · commit 466ceb443eaf · 2025-03-05T13:58:09.000Z
diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._core import AudioStreamMetadata, VideoStreamMetadata
+from ._core import VideoStreamMetadata
 from ._video_decoder import VideoDecoder  # noqa
 
 SimpleVideoDecoder = VideoDecoder
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -17,7 +17,7 @@
 
 
 class AudioDecoder:
-    """TODO-audio docs"""
+    """TODO-AUDIO docs"""
 
     def __init__(
         self,
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp
@@ -60,22 +60,15 @@ int64_t getDuration(const AVFrame* frame) {
 #endif
 }
 
-int getNumChannels(const AVFrame* avFrame) {
+int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext) {
 #if LIBAVFILTER_VERSION_MAJOR > 8 || \
     (IBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
-  return avFrame->ch_layout.nb_channels;
+  int numChannels = avCodecContext->ch_layout.nb_channels;
 #else
-  return av_get_channel_layout_nb_channels(avFrame->channel_layout);
+  int numChannels = avCodecContext->channels;
 #endif
-}
 
-int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
-#if LIBAVFILTER_VERSION_MAJOR > 8 || \
-    (IBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
-  return avCodecContext->ch_layout.nb_channels;
-#else
-  return avCodecContext->channels;
-#endif
+  return static_cast<int64_t>(numChannels);
 }
 
 AVIOBytesContext::AVIOBytesContext(
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -139,8 +139,7 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
 int64_t getDuration(const UniqueAVFrame& frame);
 int64_t getDuration(const AVFrame* frame);
 
-int getNumChannels(const AVFrame* avFrame);
-int getNumChannels(const UniqueAVCodecContext& avCodecContext);
+int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext);
 
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -169,17 +169,6 @@ void VideoDecoder::initializeDecoder() {
       }
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
-      int numSamplesPerFrame = avStream->codecpar->frame_size;
-      int sampleRate = avStream->codecpar->sample_rate;
-      if (numSamplesPerFrame > 0 && sampleRate > 0) {
-        // This should allow the approximate mode to do its magic.
-        // fps is numFrames / duration where
-        // - duration = numSamplesTotal / sampleRate and
-        // - numSamplesTotal = numSamplesPerFrame * numFrames
-        // so fps = numFrames * sampleRate / (numSamplesPerFrame * numFrames)
-        streamMetadata.averageFps =
-            static_cast<double>(sampleRate) / numSamplesPerFrame;
-      }
       containerMetadata_.numAudioStreams++;
     }
 
@@ -422,7 +411,7 @@ VideoDecoder::VideoStreamOptions::VideoStreamOptions(
 void VideoDecoder::addStream(
     int streamIndex,
     AVMediaType mediaType,
-    const VideoStreamOptions& videoStreamOptions) {
+    const torch::Device& device) {
   TORCH_CHECK(
       activeStreamIndex_ == NO_ACTIVE_STREAM,
       "Can only add one single stream.");
@@ -457,36 +446,25 @@ void VideoDecoder::addStream(
       activeStreamIndex_,
       " which is of the wrong media type.");
 
-  // TODO_CODE_QUALITY this is meh to have that in the middle
-  if (mediaType == AVMEDIA_TYPE_VIDEO &&
-      videoStreamOptions.device.type() == torch::kCUDA) {
+  // TODO_CODE_QUALITY it's pretty meh to have video-specific logic within
+  // addStream(), which is supposed to be generic.
+  if (mediaType == AVMEDIA_TYPE_VIDEO && device.type() == torch::kCUDA) {
     avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-        findCudaCodec(
-            videoStreamOptions.device, streamInfo.stream->codecpar->codec_id)
+        findCudaCodec(device, streamInfo.stream->codecpar->codec_id)
             .value_or(avCodec));
   }
 
   AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(codecContext != nullptr);
-  codecContext->thread_count =
-      videoStreamOptions.ffmpegThreadCount.value_or(0); // TODO VIDEO ONLY?
   streamInfo.codecContext.reset(codecContext);
 
   int retVal = avcodec_parameters_to_context(
       streamInfo.codecContext.get(), streamInfo.stream->codecpar);
   TORCH_CHECK_EQ(retVal, AVSUCCESS);
 
-  // TODO_CODE_QUALITY meh again
-  if (mediaType == AVMEDIA_TYPE_VIDEO) {
-    if (videoStreamOptions.device.type() == torch::kCPU) {
-      // No more initialization needed for CPU.
-    } else if (videoStreamOptions.device.type() == torch::kCUDA) {
-      initializeContextOnCuda(videoStreamOptions.device, codecContext);
-    } else {
-      TORCH_CHECK(
-          false, "Invalid device type: " + videoStreamOptions.device.str());
-    }
-    streamInfo.videoStreamOptions = videoStreamOptions;
+  // TODO_CODE_QUALITY same as above.
+  if (mediaType == AVMEDIA_TYPE_VIDEO && device.type() == torch::kCUDA) {
+    initializeContextOnCuda(device, codecContext);
   }
 
   retVal = avcodec_open2(streamInfo.codecContext.get(), avCodec, nullptr);
@@ -512,9 +490,16 @@ void VideoDecoder::addStream(
 void VideoDecoder::addVideoStream(
     int streamIndex,
     const VideoStreamOptions& videoStreamOptions) {
-  addStream(streamIndex, AVMEDIA_TYPE_VIDEO, videoStreamOptions);
+  TORCH_CHECK(
+      videoStreamOptions.device.type() == torch::kCPU ||
+          videoStreamOptions.device.type() == torch::kCUDA,
+      "Invalid device type: " + videoStreamOptions.device.str());
+  addStream(streamIndex, AVMEDIA_TYPE_VIDEO, videoStreamOptions.device);
 
   auto& streamInfo = streamInfos_[activeStreamIndex_];
+  streamInfo.codecContext->thread_count =
+      videoStreamOptions.ffmpegThreadCount.value_or(0);
+
   containerMetadata_.allStreamMetadata[activeStreamIndex_].width =
       streamInfo.codecContext->width;
   containerMetadata_.allStreamMetadata[activeStreamIndex_].height =
@@ -547,8 +532,12 @@ void VideoDecoder::addAudioStream(int streamIndex) {
 
   addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
 
-  containerMetadata_.allStreamMetadata[activeStreamIndex_].sampleRate =
-      streamInfo.codecContext->sample_rate;
+  auto& streamInfo = streamInfos_[activeStreamIndex_];
+  auto& streamMetadata =
+      containerMetadata_.allStreamMetadata[activeStreamIndex_];
+  streamMetadata.sampleRate =
+      static_cast<int64_t>(streamInfo.codecContext->sample_rate);
+  streamMetadata.numChannels = getNumChannels(streamInfo.codecContext);
 }
 
 // --------------------------------------------------------------------------
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -80,6 +80,7 @@ class VideoDecoder {
 
     // Audio-only fields
     std::optional<int64_t> sampleRate;
+    std::optional<int64_t> numChannels;
   };
 
   struct ContainerMetadata {
@@ -428,7 +429,7 @@ class VideoDecoder {
   void addStream(
       int streamIndex,
       AVMediaType mediaType,
-      const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
+      const torch::Device& device = torch::kCPU);
 
   // Returns the "best" stream index for a given media type. The "best" is
   // determined by various heuristics in FFMPEG.
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -488,6 +488,9 @@ std::string get_stream_json_metadata(
   if (streamMetadata.sampleRate.has_value()) {
     map["sampleRate"] = std::to_string(*streamMetadata.sampleRate);
   }
+  if (streamMetadata.numChannels.has_value()) {
+    map["numChannels"] = std::to_string(*streamMetadata.numChannels);
+  }
   if (streamMetadata.mediaType == AVMEDIA_TYPE_VIDEO) {
     map["mediaType"] = "\"video\"";
   } else if (streamMetadata.mediaType == AVMEDIA_TYPE_AUDIO) {
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -22,7 +22,7 @@
 SPACES = "  "
 
 
-# TODO-audio: docs below are mostly for video streams, we should edit them and /
+# TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
 # or make sure they're OK for audio streams as well. Not sure how to best handle
 # docs for such class hierarchy.
 @dataclass
@@ -161,8 +161,9 @@ def __repr__(self):
 class AudioStreamMetadata(StreamMetadata):
     """Metadata of a single audio stream."""
 
-    # TODO-AUDIO Need sample rate and format and num_channels
+    # TODO-AUDIO Add sample format field
     sample_rate: Optional[int]
+    num_channels: Optional[int]
 
     def __repr__(self):
         return super().__repr__()
@@ -236,6 +237,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
             streams_metadata.append(
                 AudioStreamMetadata(
                     sample_rate=stream_dict.get("sampleRate"),
+                    num_channels=stream_dict.get("numChannels"),
                     **common_meta,
                 )
             )
diff --git a/test/decoders/test_video_decoder.py b/test/decoders/test_video_decoder.py
@@ -11,13 +11,9 @@
 import torch
 from torchcodec import FrameBatch
 
-from torchcodec.decoders import (
-    _core,
-    AudioStreamMetadata,
-    VideoDecoder,
-    VideoStreamMetadata,
-)
+from torchcodec.decoders import _core, VideoDecoder, VideoStreamMetadata
 from torchcodec.decoders._audio_decoder import AudioDecoder
+from torchcodec.decoders._core._metadata import AudioStreamMetadata
 
 from ..utils import (
     assert_frames_equal,
@@ -950,3 +946,4 @@ def test_metadata(self):
         assert decoder.stream_index == decoder.metadata.stream_index == 4
         assert decoder.metadata.duration_seconds == pytest.approx(13.056)
         assert decoder.metadata.sample_rate == 16_000
+        assert decoder.metadata.num_channels == 2