2025-03-22 nightly release (93f5d47)

pytorchbot · pytorchbot · commit bde6f2779207 · 2025-03-22T11:35:33.000Z
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -5,4 +5,5 @@ sphinx_copybutton
 sphinx-tabs
 matplotlib
 torchvision
+ipython
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
diff --git a/docs/source/api_ref_decoders.rst b/docs/source/api_ref_decoders.rst
@@ -7,7 +7,8 @@ torchcodec.decoders
 .. currentmodule:: torchcodec.decoders
 
 
-For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
+For a video decoder tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
+For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_audio_decoding.py`.
 
 
 .. autosummary::
@@ -16,6 +17,7 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
     :template: class.rst
 
     VideoDecoder
+    AudioDecoder
 
 
 .. autosummary::
@@ -24,3 +26,4 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
     :template: dataclass.rst
 
     VideoStreamMetadata
+    AudioStreamMetadata
diff --git a/docs/source/api_ref_torchcodec.rst b/docs/source/api_ref_torchcodec.rst
@@ -14,3 +14,4 @@ torchcodec
 
     Frame
     FrameBatch
+    AudioSamples
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
@@ -4,7 +4,7 @@ Glossary
 .. glossary::
 
     pts
-       Presentation Time Stamp. The time at which a frame should be played.
+       Presentation Time Stamp. The time at which a frame or audio sample should be played.
        In TorchCodec, pts are expressed in seconds.
 
     best stream
diff --git a/examples/audio_decoding.py b/examples/audio_decoding.py
@@ -0,0 +1,111 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+========================================
+Decoding audio streams with AudioDecoder
+========================================
+
+In this example, we'll learn how to decode an audio file using the
+:class:`~torchcodec.decoders.AudioDecoder` class.
+"""
+
+# %%
+# First, a bit of boilerplate: we'll download an audio file from the web and
+# define an audio playing utility.  You can ignore that part and jump right
+# below to :ref:`creating_decoder_audio`.
+import requests
+from IPython.display import Audio
+
+
+def play_audio(samples):
+    return Audio(samples.data, rate=samples.sample_rate)
+
+
+# Audio source is CC0: https://opengameart.org/content/town-theme-rpg
+# Attribution: cynicmusic.com pixelsphere.org
+url = "https://opengameart.org/sites/default/files/TownTheme.mp3"
+response = requests.get(url, headers={"User-Agent": ""})
+if response.status_code != 200:
+    raise RuntimeError(f"Failed to download video. {response.status_code = }.")
+
+raw_audio_bytes = response.content
+
+# %%
+# .. _creating_decoder_audio:
+#
+# Creating a decoder
+# ------------------
+#
+# We can now create a decoder from the raw (encoded) audio bytes. You can of
+# course use a local audio file and pass the path as input. You can also decode
+# audio streams from videos!
+
+from torchcodec.decoders import AudioDecoder
+
+decoder = AudioDecoder(raw_audio_bytes)
+
+# %%
+# The audio has not yet been decoded by the decoder, but we already have access to
+# some metadata via the ``metadata`` attribute which is an
+# :class:`~torchcodec.decoders.AudioStreamMetadata` object.
+print(decoder.metadata)
+
+# %%
+# Decoding samples
+# ----------------
+#
+# To get decoded samples, we just need to call the
+# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method,
+# which returns an :class:`~torchcodec.AudioSamples` object:
+
+samples = decoder.get_samples_played_in_range(start_seconds=0)
+
+print(samples)
+play_audio(samples)
+
+# %%
+# The ``.data`` field is a tensor of shape ``(num_channels, num_samples)`` and
+# of float dtype with values in [-1, 1].
+#
+# The ``.pts_seconds`` field indicates the starting time of the output samples.
+# Here it's 0.025 seconds, even though we asked for samples starting from 0. Not
+# all streams start exactly at 0! This is not a bug in TorchCodec, this is a
+# property of the file that was defined when it was encoded.
+#
+# We only output the *start* of the samples, not the end or the duration. Those can
+# be easily derived from the number of samples and the sample rate:
+
+duration_seconds = samples.data.shape[1] / samples.sample_rate
+print(f"Duration = {int(duration_seconds // 60)}m{int(duration_seconds % 60)}s.")
+
+# %%
+# Specifying a range
+# ------------------
+#
+# By default,
+# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` decodes
+# the entire audio stream, but we can specify a custom range:
+
+samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70)
+
+print(samples)
+play_audio(samples)
+
+# %%
+# Custom sample rate
+# ------------------
+#
+# We can also decode the samples into a desired sample rate using the
+# ``sample_rate`` parameter of :class:`~torchcodec.decoders.AudioDecoder`. The
+# output will sound the same, but note that the number of samples changes
+# with the requested sample rate:
+
+decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000)
+samples = decoder.get_samples_played_in_range(start_seconds=0)
+
+print(samples)
+play_audio(samples)
diff --git a/src/torchcodec/_frame.py b/src/torchcodec/_frame.py
@@ -120,10 +120,12 @@ def __repr__(self):
 class AudioSamples(Iterable):
     """Audio samples with associated metadata."""
 
-    # TODO-AUDIO: docs
     data: Tensor
+    """The sample data (``torch.Tensor`` of float in [-1, 1], shape is ``(num_channels, num_samples)``)."""
     pts_seconds: float
+    """The :term:`pts` of the first sample, in seconds."""
     sample_rate: int
+    """The sample rate of the samples, in Hz."""
 
     def __post_init__(self):
         # This is called after __init__() when a Frame is created. We can run
diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._core import VideoStreamMetadata
+from ._audio_decoder import AudioDecoder  # noqa
+from ._core import AudioStreamMetadata, VideoStreamMetadata
 from ._video_decoder import VideoDecoder  # noqa
 
 SimpleVideoDecoder = VideoDecoder
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -18,7 +18,31 @@
 
 
 class AudioDecoder:
-    """TODO-AUDIO docs"""
+    """A single-stream audio decoder.
+
+    This can be used to decode audio from pure audio files (e.g. mp3, wav,
+    etc.), or from videos that contain audio streams (e.g. mp4 videos).
+
+    Returned samples are float samples normalized in [-1, 1].
+
+    Args:
+        source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the audio:
+
+            - If ``str``: a local path or a URL to a video or audio file.
+            - If ``pathlib.Path``: a path to a local video or audio file.
+            - If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
+        stream_index (int, optional): Specifies which stream in the file to decode samples from.
+            Note that this index is absolute across all media types. If left unspecified, then
+            the :term:`best stream` is used.
+        sample_rate (int, optional): The desired output sample rate of the decoded samples.
+            By default, the samples are returned in their original sample rate.
+
+    Attributes:
+        metadata (AudioStreamMetadata): Metadata of the audio stream.
+        stream_index (int): The stream index that this decoder is retrieving samples from. If a
+            stream index was provided at initialization, this is the same value. If it was left
+            unspecified, this is the :term:`best stream`.
+    """
 
     def __init__(
         self,
@@ -46,10 +70,23 @@ def __init__(
             sample_rate if sample_rate is not None else self.metadata.sample_rate
         )
 
+    # TODO-AUDIO: start_seconds should be 0 by default
     def get_samples_played_in_range(
         self, start_seconds: float, stop_seconds: Optional[float] = None
     ) -> AudioSamples:
-        """TODO-AUDIO docs"""
+        """Returns audio samples in the given range.
+
+        Samples are in the half open range [start_seconds, stop_seconds).
+
+        Args:
+            start_seconds (float): Time, in seconds, of the start of the
+                range.
+            stop_seconds (float, optional): Time, in seconds, of the end of the
+                range. As a half open range, the end is excluded. By default, decodes until the end of the stream.
+
+        Returns:
+            AudioSamples: The samples within the specified range.
+        """
         if stop_seconds is not None and not start_seconds <= stop_seconds:
             raise ValueError(
                 f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -908,7 +908,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
   // sample rate, so in theory we know the number of output samples.
   std::vector<torch::Tensor> frames;
 
-  double firstFramePtsSeconds = std::numeric_limits<double>::max();
+  std::optional<double> firstFramePtsSeconds = std::nullopt;
   auto stopPts = secondsToClosestPts(stopSeconds, streamInfo.timeBase);
   auto finished = false;
   while (!finished) {
@@ -918,8 +918,9 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
             return startPts < avFrame->pts + getDuration(avFrame);
           });
       auto frameOutput = convertAVFrameToFrameOutput(avFrame);
-      firstFramePtsSeconds =
-          std::min(firstFramePtsSeconds, frameOutput.ptsSeconds);
+      if (!firstFramePtsSeconds.has_value()) {
+        firstFramePtsSeconds = frameOutput.ptsSeconds;
+      }
       frames.push_back(frameOutput.data);
     } catch (const EndOfFileException& e) {
       finished = true;
@@ -940,7 +941,13 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
     frames.push_back(*lastSamples);
   }
 
-  return AudioFramesOutput{torch::cat(frames, 1), firstFramePtsSeconds};
+  TORCH_CHECK(
+      frames.size() > 0 && firstFramePtsSeconds.has_value(),
+      "No audio frames were decoded. ",
+      "This should probably not happen. ",
+      "Please report an issue on the TorchCodec repo.");
+
+  return AudioFramesOutput{torch::cat(frames, 1), *firstFramePtsSeconds};
 }
 
 // --------------------------------------------------------------------------
@@ -1481,8 +1488,11 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
       static_cast<const uint8_t**>(
           const_cast<const uint8_t**>(srcAVFrame->data)),
       srcAVFrame->nb_samples);
+  // numConvertedSamples can be 0 if we're downsampling by a great factor and
+  // the first frame doesn't contain a lot of samples. It should be handled
+  // properly by the caller.
   TORCH_CHECK(
-      numConvertedSamples > 0,
+      numConvertedSamples >= 0,
       "Error in swr_convert: ",
       getFFMPEGErrorStringFromErrorCode(numConvertedSamples));
 
@@ -1509,17 +1519,22 @@ std::optional<torch::Tensor> VideoDecoder::maybeFlushSwrBuffers() {
     return std::nullopt;
   }
 
-  torch::Tensor lastSamples = torch::empty(
-      {getNumChannels(streamInfo.codecContext), numRemainingSamples},
-      torch::kFloat32);
-  uint8_t* lastSamplesData = static_cast<uint8_t*>(lastSamples.data_ptr());
+  auto numChannels = getNumChannels(streamInfo.codecContext);
+  torch::Tensor lastSamples =
+      torch::empty({numChannels, numRemainingSamples}, torch::kFloat32);
+
+  std::vector<uint8_t*> outputBuffers(numChannels);
+  for (auto i = 0; i < numChannels; i++) {
+    outputBuffers[i] = static_cast<uint8_t*>(lastSamples[i].data_ptr());
+  }
 
   auto actualNumRemainingSamples = swr_convert(
       streamInfo.swrContext.get(),
-      &lastSamplesData,
+      outputBuffers.data(),
       numRemainingSamples,
       nullptr,
       0);
+
   return lastSamples.narrow(
       /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
 }
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -25,6 +25,8 @@
 # TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
 # or make sure they're OK for audio streams as well. Not sure how to best handle
 # docs for such class hierarchy.
+# TODO very related, none of these common fields in this base class show up in
+# the docs right now.
 @dataclass
 class StreamMetadata:
     duration_seconds_from_header: Optional[float]
@@ -162,8 +164,11 @@ class AudioStreamMetadata(StreamMetadata):
     """Metadata of a single audio stream."""
 
     sample_rate: Optional[int]
+    """The original sample rate."""
     num_channels: Optional[int]
+    """The number of channels (1 for mono, 2 for stereo, etc.)"""
     sample_format: Optional[str]
+    """The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""
 
     def __repr__(self):
         return super().__repr__()
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -24,6 +24,7 @@
     in_fbcode,
     NASA_AUDIO,
     NASA_AUDIO_MP3,
+    NASA_AUDIO_MP3_44100,
     NASA_VIDEO,
     SINE_MONO_S16,
     SINE_MONO_S32,
@@ -1157,6 +1158,43 @@ def test_sample_rate_conversion(self, start_seconds, stop_seconds):
             rtol=rtol,
         )
 
+    def test_sample_rate_conversion_stereo(self):
+        # Non-regression test for https://github.com/pytorch/torchcodec/pull/584
+        asset = NASA_AUDIO_MP3
+        assert asset.sample_rate == 8000
+        assert asset.num_channels == 2
+        decoder = AudioDecoder(asset.path, sample_rate=44_100)
+        decoder.get_samples_played_in_range(start_seconds=0)
+
+    def test_downsample_empty_frame(self):
+        # Non-regression test for
+        # https://github.com/pytorch/torchcodec/pull/586: when downsampling by
+        # a great factor, if an input frame has a small number of samples, the
+        # resampled frame (as output by swresample) may contain zero samples. We
+        # make sure we handle this properly.
+        #
+        # NASA_AUDIO_MP3_44100's first frame has only 47 samples which triggers
+        # the test scenario:
+        # ```
+        # » ffprobe -v error -hide_banner -select_streams a:0 -show_frames -of json test/resources/nasa_13013.mp4.audio_44100.mp3 | grep nb_samples | head -n 3
+        # "nb_samples": 47,
+        # "nb_samples": 1152,
+        # "nb_samples": 1152,
+        # ```
+        asset = NASA_AUDIO_MP3_44100
+        assert asset.sample_rate == 44_100
+        decoder = AudioDecoder(asset.path, sample_rate=8_000)
+        frames_44100_to_8000 = decoder.get_samples_played_in_range(start_seconds=0)
+
+        # Just checking correctness now
+        asset = NASA_AUDIO_MP3
+        assert asset.sample_rate == 8_000
+        decoder = AudioDecoder(asset.path)
+        frames_8000 = decoder.get_samples_played_in_range(start_seconds=0)
+        torch.testing.assert_close(
+            frames_44100_to_8000.data, frames_8000.data, atol=0.03, rtol=0
+        )
+
     def test_s16_ffmpeg4_bug(self):
         # s16 fails on FFmpeg4 but can be decoded on other versions.
         # Debugging logs show that we're hitting:
diff --git a/test/resources/nasa_13013.mp4.audio_44100.mp3 b/test/resources/nasa_13013.mp4.audio_44100.mp3
diff --git a/test/resources/nasa_13013.mp4.audio_44100.mp3.stream0.all_frames_info.json b/test/resources/nasa_13013.mp4.audio_44100.mp3.stream0.all_frames_info.json
diff --git a/test/utils.py b/test/utils.py

Original file line number	Diff line number	Diff line change
`@@ -14,3 +14,4 @@ torchcodec`
`14`	`14`
`15`	`15`	`Frame`
`16`	`16`	`FrameBatch`
	`17`	`+ AudioSamples`