meta-pytorch
diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
Lines changed: 2 additions & 6 deletions
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
Lines changed: 4 additions & 29 deletions
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
Lines changed: 62 additions & 54 deletions
diff --git a/src/torchcodec/decoders/_decoder_utils.py b/src/torchcodec/decoders/_decoder_utils.py
Lines changed: 1 addition & 17 deletions
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
Lines changed: 14 additions & 3 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._core import VideoStreamMetadata
+from ._core import AudioStreamMetadata, VideoStreamMetadata
 from ._video_decoder import VideoDecoder  # noqa
 
 SimpleVideoDecoder = VideoDecoder
@@ -5,15 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 
 from pathlib import Path
-from typing import Literal, Optional, Union
+from typing import Optional, Union
 
 from torch import Tensor
 
 from torchcodec.decoders import _core as core
 from torchcodec.decoders._decoder_utils import (
     create_decoder,
     get_and_validate_stream_metadata,
-    validate_seek_mode,
 )
 
 
@@ -25,17 +24,14 @@ def __init__(
         source: Union[str, Path, bytes, Tensor],
         *,
         stream_index: Optional[int] = None,
-        seek_mode: Literal["exact", "approximate"] = "exact",
     ):
-        validate_seek_mode(seek_mode)
-        self._decoder = create_decoder(source=source, seek_mode=seek_mode)
+        self._decoder = create_decoder(source=source, seek_mode="approximate")
 
         core.add_audio_stream(self._decoder, stream_index=stream_index)
 
         (
             self.metadata,
             self.stream_index,
-            self._num_frames,
             self._begin_stream_seconds,
             self._end_stream_seconds,
         ) = get_and_validate_stream_metadata(
 
@@ -466,30 +466,6 @@ void VideoDecoder::addStream(
             .value_or(avCodec));
   }
 
-  // TODO_FRAME_SIZE_APPROXIMATE_MODE
-  // For audio, we raise if seek_mode="approximate" and if the number of
-  // samples per frame is unknown (frame_size field of codec params). But that's
-  // quite limitting. Ultimately, the most common type of call will be to decode
-  // an entire file from start to end (possibly with some offsets for start and
-  // end). And for that, we shouldn't [need to] force the user to scan, because
-  // all this entails is a single call to seek(start) (if at all) and then just
-  // a bunch of consecutive calls to getNextFrame(). Maybe there should be a
-  // third seek mode for audio, e.g. seek_mode="contiguous" where we don't scan,
-  // and only allow calls to getFramesPlayedAt().
-  StreamMetadata& streamMetadata =
-      containerMetadata_.allStreamMetadata[activeStreamIndex_];
-  if (seekMode_ == SeekMode::approximate &&
-      !streamMetadata.averageFps.has_value()) {
-    std::string errMsg = "Seek mode is approximate, but stream " +
-        std::to_string(activeStreamIndex_) + "does not have ";
-    if (mediaType == AVMEDIA_TYPE_VIDEO) {
-      errMsg += "an average fps in its metadata.";
-    } else {
-      errMsg += "a constant number of samples per frame.";
-    }
-    throw std::runtime_error(errMsg);
-  }
-
   AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(codecContext != nullptr);
   codecContext->thread_count =
@@ -565,13 +541,12 @@ void VideoDecoder::addVideoStream(
 }
 
 void VideoDecoder::addAudioStream(int streamIndex) {
+  TORCH_CHECK(
+      seekMode_ == SeekMode::approximate,
+      "seek_mode must be 'approximate' for audio streams.");
+
   addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
 
-  // See TODO_FRAME_SIZE_BATCH_TENSOR_ALLOCATION
-  auto& streamInfo = streamInfos_[activeStreamIndex_];
-  TORCH_CHECK(
-      streamInfo.codecContext->frame_size > 0,
-      "No support for variable framerate yet.");
   containerMetadata_.allStreamMetadata[activeStreamIndex_].sampleRate =
       streamInfo.codecContext->sample_rate;
 }
 
@@ -19,6 +19,9 @@
 )
 
 
+SPACES = "  "
+
+
 # TODO-audio: docs below are mostly for video streams, we should edit them and /
 # or make sure they're OK for audio streams as well. Not sure how to best handle
 # docs for such class hierarchy.
@@ -29,15 +32,6 @@ class StreamMetadata:
     None). This could be inaccurate."""
     bit_rate: Optional[float]
     """Bit rate of the stream, in seconds (float or None)."""
-    num_frames_from_header: Optional[int]
-    """Number of frames, from the stream's metadata. This is potentially
-    inaccurate. We recommend using the ``num_frames`` attribute instead.
-    (int or None)."""
-    num_frames_from_content: Optional[int]
-    """Number of frames computed by TorchCodec by scanning the stream's
-    content (the scan doesn't involve decoding). This is more accurate
-    than ``num_frames_from_header``. We recommend using the
-    ``num_frames`` attribute instead. (int or None)."""
     begin_stream_seconds_from_content: Optional[float]
     """Beginning of the stream, in seconds (float or None).
     Conceptually, this corresponds to the first frame's :term:`pts`. It is
@@ -55,23 +49,9 @@ class StreamMetadata:
     """
     codec: Optional[str]
     """Codec (str or None)."""
-    average_fps_from_header: Optional[float]
-    """Averate fps of the stream, obtained from the header (float or None).
-    We recommend using the ``average_fps`` attribute instead."""
     stream_index: int
     """Index of the stream within the video (int)."""
 
-    @property
-    def num_frames(self) -> Optional[int]:
-        """Number of frames in the stream. This corresponds to
-        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
-        corresponds to ``num_frames_from_header``.
-        """
-        if self.num_frames_from_content is not None:
-            return self.num_frames_from_content
-        else:
-            return self.num_frames_from_header
-
     @property
     def duration_seconds(self) -> Optional[float]:
         """Duration of the stream in seconds. We try to calculate the duration
@@ -88,23 +68,6 @@ def duration_seconds(self) -> Optional[float]:
             - self.begin_stream_seconds_from_content
         )
 
-    @property
-    def average_fps(self) -> Optional[float]:
-        """Average fps of the stream. If a :term:`scan` was perfomed, this is
-        computed from the number of frames and the duration of the stream.
-        Otherwise we fall back to ``average_fps_from_header``.
-        """
-        if (
-            self.end_stream_seconds_from_content is None
-            or self.begin_stream_seconds_from_content is None
-            or self.num_frames is None
-        ):
-            return self.average_fps_from_header
-        return self.num_frames / (
-            self.end_stream_seconds_from_content
-            - self.begin_stream_seconds_from_content
-        )
-
     @property
     def begin_stream_seconds(self) -> float:
         """Beginning of the stream, in seconds (float). Conceptually, this
@@ -132,12 +95,9 @@ def end_stream_seconds(self) -> Optional[float]:
     def __repr__(self):
         # Overridden because properites are not printed by default.
         s = self.__class__.__name__ + ":\n"
-        spaces = "  "
-        s += f"{spaces}num_frames: {self.num_frames}\n"
-        s += f"{spaces}duration_seconds: {self.duration_seconds}\n"
-        s += f"{spaces}average_fps: {self.average_fps}\n"
+        s += f"{SPACES}duration_seconds: {self.duration_seconds}\n"
         for field in dataclasses.fields(self):
-            s += f"{spaces}{field.name}: {getattr(self, field.name)}\n"
+            s += f"{SPACES}{field.name}: {getattr(self, field.name)}\n"
         return s
 
 
@@ -149,17 +109,58 @@ class VideoStreamMetadata(StreamMetadata):
     """Width of the frames (int or None)."""
     height: Optional[int]
     """Height of the frames (int or None)."""
+    num_frames_from_header: Optional[int]
+    """Number of frames, from the stream's metadata. This is potentially
+    inaccurate. We recommend using the ``num_frames`` attribute instead.
+    (int or None)."""
+    num_frames_from_content: Optional[int]
+    """Number of frames computed by TorchCodec by scanning the stream's
+    content (the scan doesn't involve decoding). This is more accurate
+    than ``num_frames_from_header``. We recommend using the
+    ``num_frames`` attribute instead. (int or None)."""
+    average_fps_from_header: Optional[float]
+    """Average fps of the stream, obtained from the header (float or None).
+    We recommend using the ``average_fps`` attribute instead."""
+
+    @property
+    def num_frames(self) -> Optional[int]:
+        """Number of frames in the stream. This corresponds to
+        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
+        corresponds to ``num_frames_from_header``.
+        """
+        if self.num_frames_from_content is not None:
+            return self.num_frames_from_content
+        else:
+            return self.num_frames_from_header
+
+    @property
+    def average_fps(self) -> Optional[float]:
+        """Average fps of the stream. If a :term:`scan` was performed, this is
+        computed from the number of frames and the duration of the stream.
+        Otherwise we fall back to ``average_fps_from_header``.
+        """
+        if (
+            self.end_stream_seconds_from_content is None
+            or self.begin_stream_seconds_from_content is None
+            or self.num_frames is None
+        ):
+            return self.average_fps_from_header
+        return self.num_frames / (
+            self.end_stream_seconds_from_content
+            - self.begin_stream_seconds_from_content
+        )
 
     def __repr__(self):
-        return super().__repr__()
+        s = super().__repr__()
+        s += f"{SPACES}num_frames: {self.num_frames}\n"
+        s += f"{SPACES}average_fps: {self.average_fps}\n"
+        return s
 
 
 @dataclass
 class AudioStreamMetadata(StreamMetadata):
     """Metadata of a single audio stream."""
 
-    # TODO-AUDIO do we expose the notion of frame here, like in fps? It's technically
-    # valid, but potentially is an FFmpeg-specific concept for audio
     # TODO-AUDIO Need sample rate and format and num_channels
     sample_rate: Optional[int]
 
@@ -192,6 +193,14 @@ def best_video_stream(self) -> VideoStreamMetadata:
         assert isinstance(metadata, VideoStreamMetadata)  # mypy <3
         return metadata
 
+    @property
+    def best_audio_stream(self) -> AudioStreamMetadata:
+        if self.best_audio_stream_index is None:
+            raise ValueError("The best audio stream is unknown.")
+        metadata = self.streams[self.best_audio_stream_index]
+        assert isinstance(metadata, AudioStreamMetadata)  # mypy <3
+        return metadata
+
 
 def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
     """Return container metadata from a decoder.
@@ -207,19 +216,19 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
         common_meta = dict(
             duration_seconds_from_header=stream_dict.get("durationSeconds"),
             bit_rate=stream_dict.get("bitRate"),
-            num_frames_from_header=stream_dict.get("numFrames"),
-            num_frames_from_content=stream_dict.get("numFramesFromScan"),
             begin_stream_seconds_from_content=stream_dict.get("minPtsSecondsFromScan"),
             end_stream_seconds_from_content=stream_dict.get("maxPtsSecondsFromScan"),
             codec=stream_dict.get("codec"),
-            average_fps_from_header=stream_dict.get("averageFps"),
             stream_index=stream_index,
         )
         if stream_dict["mediaType"] == "video":
             streams_metadata.append(
                 VideoStreamMetadata(
                     width=stream_dict.get("width"),
                     height=stream_dict.get("height"),
+                    num_frames_from_header=stream_dict.get("numFrames"),
+                    num_frames_from_content=stream_dict.get("numFramesFromScan"),
+                    average_fps_from_header=stream_dict.get("averageFps"),
                     **common_meta,
                 )
             )
@@ -232,9 +241,8 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
             )
         else:
             # This is neither a video nor audio stream. Could be e.g. subtitles.
-            # We still need to add an entry to streams_metadata to keep its
-            # length consistent with the number of streams, so we add a dummy
-            # entry.
+            # We still need to add a dummy entry so that len(streams_metadata)
+            # is consistent with the number of streams.
             streams_metadata.append(StreamMetadata(**common_meta))
 
     return ContainerMetadata(
 
@@ -17,15 +17,6 @@
 """
 
 
-def validate_seek_mode(seek_mode: str) -> None:
-    allowed_seek_modes = ("exact", "approximate")
-    if seek_mode not in allowed_seek_modes:
-        raise ValueError(
-            f"Invalid seek mode ({seek_mode}). "
-            f"Supported values are {', '.join(allowed_seek_modes)}."
-        )
-
-
 def create_decoder(
     *, source: Union[str, Path, bytes, Tensor], seek_mode: str
 ) -> Tensor:
@@ -49,7 +40,7 @@ def get_and_validate_stream_metadata(
     decoder: Tensor,
     stream_index: Optional[int] = None,
     media_type: str,
-) -> Tuple[core.VideoStreamMetadata, int]:
+) -> Tuple[core.VideoStreamMetadata, int, float, float]:
 
     if media_type not in ("video", "audio"):
         raise ValueError(f"Bad {media_type = }, should be audio or video")
@@ -75,12 +66,6 @@ def get_and_validate_stream_metadata(
 
     metadata = container_metadata.streams[stream_index]
 
-    if metadata.num_frames is None:
-        raise ValueError(
-            "The number of frames is unknown. " + ERROR_REPORTING_INSTRUCTIONS
-        )
-    num_frames = metadata.num_frames
-
     if metadata.begin_stream_seconds is None:
         raise ValueError(
             "The minimum pts value in seconds is unknown. "
@@ -97,7 +82,6 @@ def get_and_validate_stream_metadata(
     return (
         metadata,
         stream_index,
-        num_frames,
         begin_stream_seconds,
         end_stream_seconds,
     )
@@ -14,8 +14,8 @@
 from torchcodec.decoders import _core as core
 from torchcodec.decoders._decoder_utils import (
     create_decoder,
+    ERROR_REPORTING_INSTRUCTIONS,
     get_and_validate_stream_metadata,
-    validate_seek_mode,
 )
 
 
@@ -76,7 +76,13 @@ def __init__(
         device: Optional[Union[str, device]] = "cpu",
         seek_mode: Literal["exact", "approximate"] = "exact",
     ):
-        validate_seek_mode(seek_mode)
+        allowed_seek_modes = ("exact", "approximate")
+        if seek_mode not in allowed_seek_modes:
+            raise ValueError(
+                f"Invalid seek mode ({seek_mode}). "
+                f"Supported values are {', '.join(allowed_seek_modes)}."
+            )
+
         self._decoder = create_decoder(source=source, seek_mode=seek_mode)
 
         allowed_dimension_orders = ("NCHW", "NHWC")
@@ -100,13 +106,18 @@ def __init__(
         (
             self.metadata,
             self.stream_index,
-            self._num_frames,
             self._begin_stream_seconds,
             self._end_stream_seconds,
         ) = get_and_validate_stream_metadata(
             decoder=self._decoder, stream_index=stream_index, media_type="video"
         )
 
+        if self.metadata.num_frames is None:
+            raise ValueError(
+                "The number of frames is unknown. " + ERROR_REPORTING_INSTRUCTIONS
+            )
+        self._num_frames = self.metadata.num_frames
+
     def __len__(self) -> int:
         return self._num_frames