2025-03-25 nightly release (57899ee)

pytorchbot · pytorchbot · commit 7f1b5f108480 · 2025-03-25T11:35:30.000Z
diff --git a/benchmarks/decoders/benchmark_audio_decoders.py b/benchmarks/decoders/benchmark_audio_decoders.py
@@ -71,7 +71,7 @@ def get_duration(path: Path) -> str:
 
 
 def decode_with_torchcodec(path: Path) -> None:
-    AudioDecoder(path).get_samples_played_in_range(start_seconds=0, stop_seconds=None)
+    AudioDecoder(path).get_all_samples()
 
 
 def decode_with_torchaudio_StreamReader(path: Path) -> None:
diff --git a/docs/source/_templates/dataclass.rst b/docs/source/_templates/dataclass.rst
@@ -8,3 +8,4 @@
 .. autoclass:: {{ name }}
     :members:
     :undoc-members: __init__
+    :inherited-members:
diff --git a/examples/audio_decoding.py b/examples/audio_decoding.py
@@ -59,10 +59,10 @@ def play_audio(samples):
 # ----------------
 #
 # To get decoded samples, we just need to call the
-# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method,
+# :meth:`~torchcodec.decoders.AudioDecoder.get_all_samples` method,
 # which returns an :class:`~torchcodec.AudioSamples` object:
 
-samples = decoder.get_samples_played_in_range()
+samples = decoder.get_all_samples()
 
 print(samples)
 play_audio(samples)
@@ -76,13 +76,12 @@ def play_audio(samples):
 # all streams start exactly at 0! This is not a bug in TorchCodec, this is a
 # property of the file that was defined when it was encoded.
 #
-# %%
 # Specifying a range
 # ------------------
 #
-# By default,
-# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range`  decodes
-# the entire audio stream, but we can specify a custom range:
+# If we don't need all the samples, we can use
+# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` to
+# decode the samples within a custom range:
 
 samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70)
 
@@ -99,7 +98,7 @@ def play_audio(samples):
 # increased:
 
 decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000)
-samples = decoder.get_samples_played_in_range(start_seconds=0)
+samples = decoder.get_all_samples()
 
 print(samples)
 play_audio(samples)
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -13,7 +13,7 @@
 from torchcodec.decoders import _core as core
 from torchcodec.decoders._decoder_utils import (
     create_decoder,
-    get_and_validate_stream_metadata,
+    ERROR_REPORTING_INSTRUCTIONS,
 )
 
 
@@ -57,31 +57,54 @@ def __init__(
             self._decoder, stream_index=stream_index, sample_rate=sample_rate
         )
 
-        (
-            self.metadata,
-            self.stream_index,
-            self._begin_stream_seconds,
-            self._end_stream_seconds,
-        ) = get_and_validate_stream_metadata(
-            decoder=self._decoder, stream_index=stream_index, media_type="audio"
+        container_metadata = core.get_container_metadata(self._decoder)
+        self.stream_index = (
+            container_metadata.best_audio_stream_index
+            if stream_index is None
+            else stream_index
         )
+        if self.stream_index is None:
+            raise ValueError(
+                "The best audio stream is unknown and there is no specified stream. "
+                + ERROR_REPORTING_INSTRUCTIONS
+            )
+        self.metadata = container_metadata.streams[self.stream_index]
         assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
+
         self._desired_sample_rate = (
             sample_rate if sample_rate is not None else self.metadata.sample_rate
         )
 
+    def get_all_samples(self) -> AudioSamples:
+        """Returns all the audio samples from the source.
+
+        To decode samples in a specific range, use
+        :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range`.
+
+        Returns:
+            AudioSamples: All the samples from the source.
+        """
+        return self.get_samples_played_in_range()
+
     def get_samples_played_in_range(
         self, start_seconds: float = 0.0, stop_seconds: Optional[float] = None
     ) -> AudioSamples:
         """Returns audio samples in the given range.
 
         Samples are in the half open range [start_seconds, stop_seconds).
 
+        To decode all the samples from beginning to end, you can call this
+        method while leaving ``start_seconds`` and ``stop_seconds`` at their
+        default values, or use
+        :meth:`~torchcodec.decoders.AudioDecoder.get_all_samples` as a more
+        convenient alias.
+
         Args:
             start_seconds (float): Time, in seconds, of the start of the
                 range. Default: 0.
-            stop_seconds (float): Time, in seconds, of the end of the
-                range. As a half open range, the end is excluded.
+            stop_seconds (float or None): Time, in seconds, of the end of the
+                range. As a half open range, the end is excluded. Default: None,
+                which decodes samples until the end.
 
         Returns:
             AudioSamples: The samples within the specified range.
@@ -90,12 +113,6 @@ def get_samples_played_in_range(
             raise ValueError(
                 f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
             )
-        if not self._begin_stream_seconds <= start_seconds < self._end_stream_seconds:
-            raise ValueError(
-                f"Invalid start seconds: {start_seconds}. "
-                f"It must be greater than or equal to {self._begin_stream_seconds} "
-                f"and less than or equal to {self._end_stream_seconds}."
-            )
         frames, first_pts = core.get_frames_by_pts_in_range_audio(
             self._decoder,
             start_seconds=start_seconds,
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -147,6 +147,10 @@ void VideoDecoder::initializeDecoder() {
       streamMetadata.durationSeconds =
           av_q2d(avStream->time_base) * avStream->duration;
     }
+    if (avStream->start_time != AV_NOPTS_VALUE) {
+      streamMetadata.beginStreamFromHeader =
+          av_q2d(avStream->time_base) * avStream->start_time;
+    }
 
     if (avStream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
       double fps = av_q2d(avStream->r_frame_rate);
@@ -157,7 +161,15 @@ void VideoDecoder::initializeDecoder() {
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
       AVSampleFormat format =
           static_cast<AVSampleFormat>(avStream->codecpar->format);
-      streamMetadata.sampleFormat = av_get_sample_fmt_name(format);
+
+      // If the AVSampleFormat is not recognized, we get back nullptr. We have
+      // to make sure we don't initialize a std::string with nullptr. There's
+      // nothing to do on the else branch because we're already using an
+      // optional; it'll just remain empty.
+      const char* rawSampleFormat = av_get_sample_fmt_name(format);
+      if (rawSampleFormat != nullptr) {
+        streamMetadata.sampleFormat = std::string(rawSampleFormat);
+      }
       containerMetadata_.numAudioStreams++;
     }
 
@@ -944,8 +956,9 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
   TORCH_CHECK(
       frames.size() > 0 && firstFramePtsSeconds.has_value(),
       "No audio frames were decoded. ",
-      "This should probably not happen. ",
-      "Please report an issue on the TorchCodec repo.");
+      "This is probably because start_seconds is too high? ",
+      "Current value is ",
+      startSeconds);
 
   return AudioFramesOutput{torch::cat(frames, 1), *firstFramePtsSeconds};
 }
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -59,6 +59,7 @@ class VideoDecoder {
     std::optional<AVCodecID> codecId;
     std::optional<std::string> codecName;
     std::optional<double> durationSeconds;
+    std::optional<double> beginStreamFromHeader;
     std::optional<int64_t> numFrames;
     std::optional<int64_t> numKeyFrames;
     std::optional<double> averageFps;
@@ -238,7 +239,6 @@ class VideoDecoder {
       double startSeconds,
       double stopSeconds);
 
-  // TODO-AUDIO: Should accept sampleRate
   AudioFramesOutput getFramesPlayedInRangeAudio(
       double startSeconds,
       std::optional<double> stopSecondsOptional = std::nullopt);
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -473,6 +473,10 @@ std::string get_stream_json_metadata(
   if (streamMetadata.numFrames.has_value()) {
     map["numFrames"] = std::to_string(*streamMetadata.numFrames);
   }
+  if (streamMetadata.beginStreamFromHeader.has_value()) {
+    map["beginStreamFromHeader"] =
+        std::to_string(*streamMetadata.beginStreamFromHeader);
+  }
   if (streamMetadata.minPtsSecondsFromScan.has_value()) {
     map["minPtsSecondsFromScan"] =
         std::to_string(*streamMetadata.minPtsSecondsFromScan);
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -22,37 +22,64 @@
 SPACES = "  "
 
 
-# TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
-# or make sure they're OK for audio streams as well. Not sure how to best handle
-# docs for such class hierarchy.
-# TODO very related, none of these common fields in this base class show up in
-# the docs right now.
 @dataclass
 class StreamMetadata:
     duration_seconds_from_header: Optional[float]
     """Duration of the stream, in seconds, obtained from the header (float or
     None). This could be inaccurate."""
+    begin_stream_seconds_from_header: Optional[float]
+    """Beginning of the stream, in seconds, obtained from the header (float or
+    None). Usually, this is equal to 0."""
     bit_rate: Optional[float]
     """Bit rate of the stream, in seconds (float or None)."""
+    codec: Optional[str]
+    """Codec (str or None)."""
+    stream_index: int
+    """Index of the stream that this metadata refers to (int)."""
+
+    def __repr__(self):
+        s = self.__class__.__name__ + ":\n"
+        for field in dataclasses.fields(self):
+            s += f"{SPACES}{field.name}: {getattr(self, field.name)}\n"
+        return s
+
+
+@dataclass
+class VideoStreamMetadata(StreamMetadata):
+    """Metadata of a single video stream."""
+
     begin_stream_seconds_from_content: Optional[float]
     """Beginning of the stream, in seconds (float or None).
-    Conceptually, this corresponds to the first frame's :term:`pts`. It is
-    computed as min(frame.pts) across all frames in the stream. Usually, this is
-    equal to 0."""
+    Conceptually, this corresponds to the first frame's :term:`pts`. It is only
+    computed when a :term:`scan` is done, as min(frame.pts) across all frames
+    in the stream. Usually, this is equal to 0."""
     end_stream_seconds_from_content: Optional[float]
     """End of the stream, in seconds (float or None).
     Conceptually, this corresponds to last_frame.pts + last_frame.duration. It
-    is computed as max(frame.pts + frame.duration) across all frames in the
-    stream. Note that no frame is played at this time value, so calling
-    :meth:`~torchcodec.decoders.VideoDecoder.get_frame_played_at` with
-    this value would result in an error. Retrieving the last frame is best done
-    by simply indexing the :class:`~torchcodec.decoders.VideoDecoder`
-    object with ``[-1]``.
+    is only computed when a :term:`scan` is done, as max(frame.pts +
+    frame.duration) across all frames in the stream. Note that no frame is
+    played at this time value, so calling
+    :meth:`~torchcodec.decoders.VideoDecoder.get_frame_played_at` with this
+    value would result in an error. Retrieving the last frame is best done by
+    simply indexing the :class:`~torchcodec.decoders.VideoDecoder` object with
+    ``[-1]``.
     """
-    codec: Optional[str]
-    """Codec (str or None)."""
-    stream_index: int
-    """Index of the stream within the video (int)."""
+    width: Optional[int]
+    """Width of the frames (int or None)."""
+    height: Optional[int]
+    """Height of the frames (int or None)."""
+    num_frames_from_header: Optional[int]
+    """Number of frames, from the stream's metadata. This is potentially
+    inaccurate. We recommend using the ``num_frames`` attribute instead.
+    (int or None)."""
+    num_frames_from_content: Optional[int]
+    """Number of frames computed by TorchCodec by scanning the stream's
+    content (the scan doesn't involve decoding). This is more accurate
+    than ``num_frames_from_header``. We recommend using the
+    ``num_frames`` attribute instead. (int or None)."""
+    average_fps_from_header: Optional[float]
+    """Average fps of the stream, obtained from the header (float or None).
+    We recommend using the ``average_fps`` attribute instead."""
 
     @property
     def duration_seconds(self) -> Optional[float]:
@@ -94,36 +121,6 @@ def end_stream_seconds(self) -> Optional[float]:
         else:
             return self.end_stream_seconds_from_content
 
-    def __repr__(self):
-        # Overridden because properites are not printed by default.
-        s = self.__class__.__name__ + ":\n"
-        s += f"{SPACES}duration_seconds: {self.duration_seconds}\n"
-        for field in dataclasses.fields(self):
-            s += f"{SPACES}{field.name}: {getattr(self, field.name)}\n"
-        return s
-
-
-@dataclass
-class VideoStreamMetadata(StreamMetadata):
-    """Metadata of a single video stream."""
-
-    width: Optional[int]
-    """Width of the frames (int or None)."""
-    height: Optional[int]
-    """Height of the frames (int or None)."""
-    num_frames_from_header: Optional[int]
-    """Number of frames, from the stream's metadata. This is potentially
-    inaccurate. We recommend using the ``num_frames`` attribute instead.
-    (int or None)."""
-    num_frames_from_content: Optional[int]
-    """Number of frames computed by TorchCodec by scanning the stream's
-    content (the scan doesn't involve decoding). This is more accurate
-    than ``num_frames_from_header``. We recommend using the
-    ``num_frames`` attribute instead. (int or None)."""
-    average_fps_from_header: Optional[float]
-    """Averate fps of the stream, obtained from the header (float or None).
-    We recommend using the ``average_fps`` attribute instead."""
-
     @property
     def num_frames(self) -> Optional[int]:
         """Number of frames in the stream. This corresponds to
@@ -154,6 +151,9 @@ def average_fps(self) -> Optional[float]:
 
     def __repr__(self):
         s = super().__repr__()
+        s += f"{SPACES}duration_seconds: {self.duration_seconds}\n"
+        s += f"{SPACES}begin_stream_seconds: {self.begin_stream_seconds}\n"
+        s += f"{SPACES}end_stream_seconds: {self.end_stream_seconds}\n"
         s += f"{SPACES}num_frames: {self.num_frames}\n"
         s += f"{SPACES}average_fps: {self.average_fps}\n"
         return s
@@ -224,14 +224,19 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
         common_meta = dict(
             duration_seconds_from_header=stream_dict.get("durationSeconds"),
             bit_rate=stream_dict.get("bitRate"),
-            begin_stream_seconds_from_content=stream_dict.get("minPtsSecondsFromScan"),
-            end_stream_seconds_from_content=stream_dict.get("maxPtsSecondsFromScan"),
+            begin_stream_seconds_from_header=stream_dict.get("beginStreamFromHeader"),
             codec=stream_dict.get("codec"),
             stream_index=stream_index,
         )
         if stream_dict["mediaType"] == "video":
             streams_metadata.append(
                 VideoStreamMetadata(
+                    begin_stream_seconds_from_content=stream_dict.get(
+                        "minPtsSecondsFromScan"
+                    ),
+                    end_stream_seconds_from_content=stream_dict.get(
+                        "maxPtsSecondsFromScan"
+                    ),
                     width=stream_dict.get("width"),
                     height=stream_dict.get("height"),
                     num_frames_from_header=stream_dict.get("numFrames"),
diff --git a/src/torchcodec/decoders/_decoder_utils.py b/src/torchcodec/decoders/_decoder_utils.py
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
diff --git a/test/decoders/test_metadata.py b/test/decoders/test_metadata.py