meta-pytorch
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
Lines changed: 17 additions & 114 deletions
diff --git a/src/torchcodec/decoders/_core/__init__.py b/src/torchcodec/decoders/_core/__init__.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
Lines changed: 42 additions & 111 deletions
@@ -5,136 +5,39 @@
 # LICENSE file in the root directory of this source tree.
 
 from pathlib import Path
-from typing import Literal, Optional, Tuple, Union
+from typing import Literal, Optional, Union
 
 from torch import Tensor
 
 from torchcodec.decoders import _core as core
-
-_ERROR_REPORTING_INSTRUCTIONS = """
-This should never happen. Please report an issue following the steps in
-https://github.com/pytorch/torchcodec/issues/new?assignees=&labels=&projects=&template=bug-report.yml.
-"""
+from torchcodec.decoders._decoder_utils import (
+    create_decoder,
+    get_and_validate_stream_metadata,
+    validate_seek_mode,
+)
 
 
 class AudioDecoder:
-    """A single-stream audio decoder.
-
-    TODO docs
-    """
+    """TODO-audio docs"""
 
     def __init__(
         self,
         source: Union[str, Path, bytes, Tensor],
         *,
-        sample_rate: Optional[int] = None,
         stream_index: Optional[int] = None,
         seek_mode: Literal["exact", "approximate"] = "exact",
     ):
-        if sample_rate is not None:
-            raise ValueError("TODO implement this")
-
-        # TODO unify validation with VideoDecoder?
-        allowed_seek_modes = ("exact", "approximate")
-        if seek_mode not in allowed_seek_modes:
-            raise ValueError(
-                f"Invalid seek mode ({seek_mode}). "
-                f"Supported values are {', '.join(allowed_seek_modes)}."
-            )
-
-        if isinstance(source, str):
-            self._decoder = core.create_from_file(source, seek_mode)
-        elif isinstance(source, Path):
-            self._decoder = core.create_from_file(str(source), seek_mode)
-        elif isinstance(source, bytes):
-            self._decoder = core.create_from_bytes(source, seek_mode)
-        elif isinstance(source, Tensor):
-            self._decoder = core.create_from_tensor(source, seek_mode)
-        else:
-            raise TypeError(
-                f"Unknown source type: {type(source)}. "
-                "Supported types are str, Path, bytes and Tensor."
-            )
+        validate_seek_mode(seek_mode)
+        self._decoder = create_decoder(source=source, seek_mode=seek_mode)
 
         core.add_audio_stream(self._decoder, stream_index=stream_index)
 
-        self.metadata, self.stream_index = _get_and_validate_stream_metadata(
-            self._decoder, stream_index
-        )
-
-        # if self.metadata.num_frames is None:
-        #     raise ValueError(
-        #         "The number of frames is unknown. " + _ERROR_REPORTING_INSTRUCTIONS
-        #     )
-        # self._num_frames = self.metadata.num_frames
-
-        # if self.metadata.begin_stream_seconds is None:
-        #     raise ValueError(
-        #         "The minimum pts value in seconds is unknown. "
-        #         + _ERROR_REPORTING_INSTRUCTIONS
-        #     )
-        # self._begin_stream_seconds = self.metadata.begin_stream_seconds
-
-        # if self.metadata.end_stream_seconds is None:
-        #     raise ValueError(
-        #         "The maximum pts value in seconds is unknown. "
-        #         + _ERROR_REPORTING_INSTRUCTIONS
-        #     )
-        # self._end_stream_seconds = self.metadata.end_stream_seconds
-
-    # TODO we need to have a default for stop_seconds.
-    def get_samples_played_in_range(
-        self, start_seconds: float, stop_seconds: float
-    ) -> Tensor:
-        """
-        TODO DOCS
-        """
-        # if not start_seconds <= stop_seconds:
-        #     raise ValueError(
-        #         f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
-        #     )
-        # if not self._begin_stream_seconds <= start_seconds < self._end_stream_seconds:
-        #     raise ValueError(
-        #         f"Invalid start seconds: {start_seconds}. "
-        #         f"It must be greater than or equal to {self._begin_stream_seconds} "
-        #         f"and less than or equal to {self._end_stream_seconds}."
-        #     )
-        # if not stop_seconds <= self._end_stream_seconds:
-        #     raise ValueError(
-        #         f"Invalid stop seconds: {stop_seconds}. "
-        #         f"It must be less than or equal to {self._end_stream_seconds}."
-        #     )
-
-        frames, *_ = core.get_frames_by_pts_in_range(
-            self._decoder,
-            start_seconds=start_seconds,
-            stop_seconds=stop_seconds,
+        (
+            self.metadata,
+            self.stream_index,
+            self._num_frames,
+            self._begin_stream_seconds,
+            self._end_stream_seconds,
+        ) = get_and_validate_stream_metadata(
+            decoder=self._decoder, stream_index=stream_index, media_type="audio"
         )
-        # TODO need to return view on this to account for samples instead of
-        # frames
-        return frames
-
-
-def _get_and_validate_stream_metadata(
-    decoder: Tensor,
-    stream_index: Optional[int] = None,
-) -> Tuple[core.AudioStreamMetadata, int]:
-
-    # TODO should this still be called `get_video_metadata`?
-    container_metadata = core.get_video_metadata(decoder)
-
-    if stream_index is None:
-        best_stream_index = container_metadata.best_audio_stream_index
-        if best_stream_index is None:
-            raise ValueError(
-                "The best audio stream is unknown and there is no specified stream. "
-                + _ERROR_REPORTING_INSTRUCTIONS
-            )
-        stream_index = best_stream_index
-
-    # This should be logically true because of the above conditions, but type checker
-    # is not clever enough.
-    assert stream_index is not None
-
-    stream_metadata = container_metadata.streams[stream_index]
-    return (stream_metadata, stream_index)
@@ -7,9 +7,9 @@
 
 from ._metadata import (
     AudioStreamMetadata,
-    get_video_metadata,
-    get_video_metadata_from_header,
-    VideoMetadata,
+    ContainerMetadata,
+    get_container_metadata,
+    get_container_metadata_from_header,
     VideoStreamMetadata,
 )
 from .video_decoder_ops import (
 
@@ -19,10 +19,9 @@
 )
 
 
+# TODO-audio: docs below are mostly for video streams.
 @dataclass
-class VideoStreamMetadata:
-    """Metadata of a single video stream."""
-
+class StreamMetadata:
     duration_seconds_from_header: Optional[float]
     """Duration of the stream, in seconds, obtained from the header (float or
     None). This could be inaccurate."""
@@ -54,10 +53,6 @@ class VideoStreamMetadata:
     """
     codec: Optional[str]
     """Codec (str or None)."""
-    width: Optional[int]
-    """Width of the frames (int or None)."""
-    height: Optional[int]
-    """Height of the frames (int or None)."""
     average_fps_from_header: Optional[float]
     """Average fps of the stream, obtained from the header (float or None).
     We recommend using the ``average_fps`` attribute instead."""
@@ -145,109 +140,37 @@ def __repr__(self):
 
 
 @dataclass
-class AudioStreamMetadata:
-    # TODO do we expose the notion of frame here, like in fps? It's technically
-    # valid, but potentially is an FFmpeg-specific concept for audio
-    # TODO Need sample rate and format
-    sample_rate: Optional[int]
-    duration_seconds_from_header: Optional[float]
-    bit_rate: Optional[float]
-    num_frames_from_header: Optional[int]
-    num_frames_from_content: Optional[int]
-    begin_stream_seconds_from_content: Optional[float]
-    end_stream_seconds_from_content: Optional[float]
-    codec: Optional[str]
-    average_fps_from_header: Optional[float]
-    stream_index: int
-
-    @property
-    def num_frames(self) -> Optional[int]:
-        """Number of frames in the stream. This corresponds to
-        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
-        corresponds to ``num_frames_from_header``.
-        """
-        if self.num_frames_from_content is not None:
-            return self.num_frames_from_content
-        else:
-            return self.num_frames_from_header
+class VideoStreamMetadata(StreamMetadata):
+    """Metadata of a single video stream."""
 
-    @property
-    def duration_seconds(self) -> Optional[float]:
-        """Duration of the stream in seconds. We try to calculate the duration
-        from the actual frames if a :term:`scan` was performed. Otherwise we
-        fall back to ``duration_seconds_from_header``.
-        """
-        if (
-            self.end_stream_seconds_from_content is None
-            or self.begin_stream_seconds_from_content is None
-        ):
-            return self.duration_seconds_from_header
-        return (
-            self.end_stream_seconds_from_content
-            - self.begin_stream_seconds_from_content
-        )
+    width: Optional[int]
+    """Width of the frames (int or None)."""
+    height: Optional[int]
+    """Height of the frames (int or None)."""
 
-    @property
-    def average_fps(self) -> Optional[float]:
-        """Average fps of the stream. If a :term:`scan` was perfomed, this is
-        computed from the number of frames and the duration of the stream.
-        Otherwise we fall back to ``average_fps_from_header``.
-        """
-        if (
-            self.end_stream_seconds_from_content is None
-            or self.begin_stream_seconds_from_content is None
-            or self.num_frames is None
-        ):
-            return self.average_fps_from_header
-        return self.num_frames / (
-            self.end_stream_seconds_from_content
-            - self.begin_stream_seconds_from_content
-        )
+    def __repr__(self):
+        return super().__repr__()
 
-    @property
-    def begin_stream_seconds(self) -> float:
-        """Beginning of the stream, in seconds (float). Conceptually, this
-        corresponds to the first frame's :term:`pts`. If
-        ``begin_stream_seconds_from_content`` is not None, then it is returned.
-        Otherwise, this value is 0.
-        """
-        if self.begin_stream_seconds_from_content is None:
-            return 0
-        else:
-            return self.begin_stream_seconds_from_content
 
-    @property
-    def end_stream_seconds(self) -> Optional[float]:
-        """End of the stream, in seconds (float or None).
-        Conceptually, this corresponds to last_frame.pts + last_frame.duration.
-        If ``end_stream_seconds_from_content`` is not None, then that value is
-        returned. Otherwise, returns ``duration_seconds``.
-        """
-        if self.end_stream_seconds_from_content is None:
-            return self.duration_seconds
-        else:
-            return self.end_stream_seconds_from_content
+@dataclass
+class AudioStreamMetadata(StreamMetadata):
+    # TODO-audio: do we expose the notion of frame here, like in fps? It's technically
+    # valid, but it is potentially an FFmpeg-specific concept for audio.
+    # TODO-audio: still need sample format and num_channels (sample_rate is below).
+    sample_rate: Optional[int]
 
     def __repr__(self):
-        # Overridden because properites are not printed by default.
-        s = self.__class__.__name__ + ":\n"
-        spaces = "  "
-        s += f"{spaces}num_frames: {self.num_frames}\n"
-        s += f"{spaces}duration_seconds: {self.duration_seconds}\n"
-        s += f"{spaces}average_fps: {self.average_fps}\n"
-        for field in dataclasses.fields(self):
-            s += f"{spaces}{field.name}: {getattr(self, field.name)}\n"
-        return s
+        return super().__repr__()
 
 
 @dataclass
-class VideoMetadata:
+class ContainerMetadata:
     duration_seconds_from_header: Optional[float]
     bit_rate_from_header: Optional[float]
     best_video_stream_index: Optional[int]
     best_audio_stream_index: Optional[int]
 
-    streams: List[Union[VideoStreamMetadata, AudioStreamMetadata]]
+    streams: List[StreamMetadata]
 
     @property
     def duration_seconds(self) -> Optional[float]:
@@ -266,15 +189,15 @@ def best_video_stream(self) -> VideoStreamMetadata:
         return metadata
 
 
-def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata:
-    """Return video metadata from a video decoder.
+def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
+    """Return container metadata from a decoder.
 
     The accuracy of the metadata and the availability of some returned fields
     depends on whether a full scan was performed by the decoder.
     """
 
     container_dict = json.loads(_get_container_json_metadata(decoder))
-    streams_metadata: List[Union[VideoStreamMetadata, AudioStreamMetadata]] = []
+    streams_metadata: List[StreamMetadata] = []
     for stream_index in range(container_dict["numStreams"]):
         stream_dict = json.loads(_get_stream_json_metadata(decoder, stream_index))
         common_meta = dict(
@@ -288,25 +211,29 @@ def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata:
             average_fps_from_header=stream_dict.get("averageFps"),
             stream_index=stream_index,
         )
-        if stream_dict["mediaType"] == "audio":
+        if stream_dict["mediaType"] == "video":
             streams_metadata.append(
-                AudioStreamMetadata(
-                    sample_rate=stream_dict.get("sampleRate"),
+                VideoStreamMetadata(
+                    width=stream_dict.get("width"),
+                    height=stream_dict.get("height"),
                     **common_meta,
                 )
             )
-        else:
-            # TODO we're adding a VideoStreamMetadata for all non-audio streams,
-            # including streams like subtitles, which makes little sense.
+        elif stream_dict["mediaType"] == "audio":
             streams_metadata.append(
-                VideoStreamMetadata(
-                    width=stream_dict.get("width"),
-                    height=stream_dict.get("height"),
+                AudioStreamMetadata(
+                    sample_rate=stream_dict.get("sampleRate"),
                     **common_meta,
                 )
             )
+        else:
+            # This is neither a video nor audio stream. Could be e.g. subtitles.
+            # We still need to add an entry to streams_metadata to keep its
+            # length consistent with the number of streams, so we add a dummy
+            # entry.
+            streams_metadata.append(StreamMetadata(**common_meta))
 
-    return VideoMetadata(
+    return ContainerMetadata(
         duration_seconds_from_header=container_dict.get("durationSeconds"),
         bit_rate_from_header=container_dict.get("bitRate"),
         best_video_stream_index=container_dict.get("bestVideoStreamIndex"),
@@ -315,5 +242,9 @@ def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata:
     )
 
 
-def get_video_metadata_from_header(filename: Union[str, pathlib.Path]) -> VideoMetadata:
-    return get_video_metadata(create_from_file(str(filename), seek_mode="approximate"))
+def get_container_metadata_from_header(
+    filename: Union[str, pathlib.Path]
+) -> ContainerMetadata:
+    return get_container_metadata(
+        create_from_file(str(filename), seek_mode="approximate")
+    )