meta-pytorch
diff --git a/‎.github/workflows/build_ffmpeg.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/build_ffmpeg.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎packaging/build_ffmpeg.sh‎
Lines changed: 14 additions & 2 deletions b/‎packaging/build_ffmpeg.sh‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/_frame.py‎
Lines changed: 26 additions & 1 deletion b/‎src/torchcodec/_frame.py‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎src/torchcodec/decoders/_audio_decoder.py‎
Lines changed: 68 additions & 0 deletions b/‎src/torchcodec/decoders/_audio_decoder.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎src/torchcodec/decoders/_core/FFMPEGCommon.cpp‎
Lines changed: 11 additions & 4 deletions b/‎src/torchcodec/decoders/_core/FFMPEGCommon.cpp‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎src/torchcodec/decoders/_core/FFMPEGCommon.h‎
Lines changed: 2 additions & 1 deletion b/‎src/torchcodec/decoders/_core/FFMPEGCommon.h‎
Lines changed: 2 additions & 1 deletion
@@ -34,7 +34,7 @@ jobs:
       contents: read
     with:
       job-name: Build
-      upload-artifact: ffmpeg-lgpl
+      upload-artifact: ffmpeg-lgpl-linux_x86_64-${{ matrix.ffmpeg-version }}
       repository: pytorch/torchcodec
       script: |
         export FFMPEG_VERSION="${{ matrix.ffmpeg-version }}"
@@ -56,7 +56,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
       job-name: Build
-      upload-artifact: ffmpeg-lgpl
+      upload-artifact: ffmpeg-lgpl-macos-${{ matrix.ffmpeg-version }}
       repository: pytorch/torchcodec
       runner: macos-14-xlarge
       script: |
 
@@ -152,7 +152,7 @@ format you want. Refer to Nvidia's GPU support matrix for more details
    the CUDA Toolkit.
 
 2. Install or compile FFmpeg with NVDEC support.
-   TorchCodec with CUDA should work with FFmpeg versions in [5, 7].
+   TorchCodec with CUDA should work with FFmpeg versions in [4, 7].
 
    If FFmpeg is not already installed, or you need a more recent version, an
    easy way to install it is to use `conda`:
 
@@ -61,7 +61,8 @@ tar -xf ffmpeg.tar.gz --strip-components 1
     --enable-avfilter \
     --enable-avformat \
     --enable-avutil \
-    --enable-swscale
+    --enable-swscale \
+    --enable-swresample
 
 make -j install
 ls ${prefix}/*
@@ -78,27 +79,31 @@ if [[ "$(uname)" == Darwin ]]; then
         avdevice=libavdevice.58
         avfilter=libavfilter.7
         swscale=libswscale.5
+        swresample=libswresample.3
     elif [[ ${major_ver} == 5 ]]; then
         avutil=libavutil.57
         avcodec=libavcodec.59
         avformat=libavformat.59
         avdevice=libavdevice.59
         avfilter=libavfilter.8
         swscale=libswscale.6
+        swresample=libswresample.4
     elif [[ ${major_ver} == 6 ]]; then
         avutil=libavutil.58
         avcodec=libavcodec.60
         avformat=libavformat.60
         avdevice=libavdevice.60
         avfilter=libavfilter.9
         swscale=libswscale.7
+        swresample=libswresample.4
     elif [[ ${major_ver} == 7 ]]; then
         avutil=libavutil.59
         avcodec=libavcodec.61
         avformat=libavformat.61
         avdevice=libavdevice.61
         avfilter=libavfilter.10
         swscale=libswscale.8
+        swresample=libswresample.5
     else
         printf "Error: unexpected FFmpeg major version: %s\n"  ${major_ver}
         exit 1;
@@ -120,7 +125,7 @@ if [[ "$(uname)" == Darwin ]]; then
     fi
 
     # list up the paths to fix
-    for lib in ${avcodec} ${avdevice} ${avfilter} ${avformat} ${avutil} ${swscale}; do
+    for lib in ${avcodec} ${avdevice} ${avfilter} ${avformat} ${avutil} ${swscale} ${swresample}; do
         ${otool} -l ${prefix}/lib/${lib}.dylib | grep -B2 ${prefix}
     done
 
@@ -155,6 +160,13 @@ if [[ "$(uname)" == Darwin ]]; then
         ${prefix}/lib/${swscale}.dylib
     ${otool} -l ${prefix}/lib/${swscale}.dylib | grep -B2 ${prefix}
 
+    ${install_name_tool} \
+        -change ${prefix}/lib/${avutil}.dylib @rpath/${avutil}.dylib \
+        -delete_rpath ${prefix}/lib \
+        -id @rpath/${swresample}.dylib \
+        ${prefix}/lib/${swresample}.dylib
+    ${otool} -l ${prefix}/lib/${swresample}.dylib | grep -B2 ${prefix}
+
     ${install_name_tool} \
         -change ${prefix}/lib/${avcodec}.dylib @rpath/${avcodec}.dylib \
         -change ${prefix}/lib/${avutil}.dylib @rpath/${avutil}.dylib \
 
@@ -1,5 +1,5 @@
 [project]
-name = "TorchCodec"
+name = "torchcodec"
 description = "A video decoder for PyTorch"
 readme = "README.md"
 requires-python = ">=3.8"
 
@@ -6,7 +6,7 @@
 
 # Note: usort wants to put Frame and FrameBatch after decoders and samplers,
 # but that results in circular import.
-from ._frame import Frame, FrameBatch  # usort:skip # noqa
+from ._frame import AudioSamples, Frame, FrameBatch  # usort:skip # noqa
 from . import decoders, samplers  # noqa
 
 try:
 
@@ -12,7 +12,7 @@
 
 
 def _frame_repr(self):
-    # Utility to replace Frame and FrameBatch __repr__ method. This prints the
+    # Utility to replace __repr__ method of dataclasses below. This prints the
     # shape of the .data tensor rather than printing the (potentially very long)
     # data tensor itself.
     s = self.__class__.__name__ + ":\n"
@@ -114,3 +114,28 @@ def __len__(self):
 
     def __repr__(self):
         return _frame_repr(self)
+
+
+@dataclass
+class AudioSamples(Iterable):
+    """Audio samples with associated metadata.
+
+    Iterating over an instance yields its fields in declaration order
+    (``data``, ``pts_seconds``, ``sample_rate``), so it can be unpacked like a
+    tuple.
+    """
+
+    # TODO-AUDIO: docs
+    # 2-dimensional tensor of samples (enforced in __post_init__).
+    # NOTE(review): presumably laid out as (num_channels, num_samples) - confirm.
+    data: Tensor
+    # Timestamp, in seconds, associated with ``data``.
+    # NOTE(review): presumably the pts of the first sample - confirm.
+    pts_seconds: float
+    # Sample rate of ``data``. NOTE(review): presumably in Hz - confirm.
+    sample_rate: int
+
+    def __post_init__(self):
+        # This is called after __init__() when an AudioSamples is created. We
+        # run input validation checks here and coerce the scalar fields to
+        # plain Python float / int.
+        if not self.data.ndim == 2:
+            raise ValueError(f"data must be 2-dimensional, got {self.data.shape = }")
+        self.pts_seconds = float(self.pts_seconds)
+        self.sample_rate = int(self.sample_rate)
+
+    def __iter__(self) -> Iterator[Union[Tensor, float]]:
+        # Yield the field values in the order the dataclass declares them.
+        for field in dataclasses.fields(self):
+            yield getattr(self, field.name)
+
+    def __repr__(self):
+        # Delegate to the shared helper used by the other dataclasses in this
+        # module.
+        return _frame_repr(self)
@@ -9,6 +9,7 @@
 
 from torch import Tensor
 
+from torchcodec import AudioSamples
 from torchcodec.decoders import _core as core
 from torchcodec.decoders._decoder_utils import (
     create_decoder,
@@ -37,3 +38,70 @@ def __init__(
         ) = get_and_validate_stream_metadata(
             decoder=self._decoder, stream_index=stream_index, media_type="audio"
         )
+        assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
+
+    def get_samples_played_in_range(
+        self, start_seconds: float, stop_seconds: Optional[float] = None
+    ) -> AudioSamples:
+        """TODO-AUDIO docs"""
+        if stop_seconds is not None and not start_seconds <= stop_seconds:
+            raise ValueError(
+                f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
+            )
+        if not self._begin_stream_seconds <= start_seconds < self._end_stream_seconds:
+            raise ValueError(
+                f"Invalid start seconds: {start_seconds}. "
+                f"It must be greater than or equal to {self._begin_stream_seconds} "
+                f"and less than or equal to {self._end_stream_seconds}."
+            )
+        frames, first_pts = core.get_frames_by_pts_in_range_audio(
+            self._decoder,
+            start_seconds=start_seconds,
+            stop_seconds=stop_seconds,
+        )
+        first_pts = first_pts.item()
+
+        # x = frame boundaries
+        #
+        #            first_pts                                    last_pts
+        #                v                                            v
+        # ....x..........x..........x...........x..........x..........x.....
+        #                    ^                                 ^
+        #               start_seconds                      stop_seconds
+        #
+        # We want to return the samples in [start_seconds, stop_seconds). But
+        # because the core API is based on frames, the `frames` tensor contains
+        # the samples in [first_pts, last_pts)
+        # So we do some basic math to figure out the position of the view that
+        # we'll return.
+
+        # TODO: sample_rate is either the original one from metadata, or the
+        # user-specified one (NIY)
+        assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
+        sample_rate = self.metadata.sample_rate
+
+        # TODO: metadata's sample_rate should probably not be Optional
+        assert sample_rate is not None  # mypy.
+
+        if first_pts < start_seconds:
+            offset_beginning = round((start_seconds - first_pts) * sample_rate)
+            output_pts_seconds = start_seconds
+        else:
+            # In normal cases we'll have first_pts <= start_pts, but in some
+            # edge cases it's possible to have first_pts > start_seconds,
+            # typically if the stream's first frame's pts isn't exactly 0.
+            offset_beginning = 0
+            output_pts_seconds = first_pts
+
+        num_samples = frames.shape[1]
+        last_pts = first_pts + num_samples / self.metadata.sample_rate
+        if stop_seconds is not None and stop_seconds < last_pts:
+            offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate)
+        else:
+            offset_end = num_samples
+
+        return AudioSamples(
+            data=frames[:, offset_beginning:offset_end],
+            pts_seconds=output_pts_seconds,
+            sample_rate=sample_rate,
+        )
@@ -60,15 +60,22 @@ int64_t getDuration(const AVFrame* frame) {
 #endif
 }
 
-int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext) {
+// Returns the number of audio channels of `avFrame`.
+int getNumChannels(const AVFrame* avFrame) {
 #if LIBAVFILTER_VERSION_MAJOR > 8 || \
     (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
-  int numChannels = avCodecContext->ch_layout.nb_channels;
+  return avFrame->ch_layout.nb_channels;
 #else
-  int numChannels = avCodecContext->channels;
+  // The channel_layout mask may be 0 (unknown layout) even though the frame
+  // carries audio; in that case the layout-derived count is 0, so fall back to
+  // the frame's own `channels` field, like the codec-context overload does.
+  const int numChannels =
+      av_get_channel_layout_nb_channels(avFrame->channel_layout);
+  return numChannels > 0 ? numChannels : avFrame->channels;
 #endif
+}
 
-  return static_cast<int64_t>(numChannels);
+// Returns the number of audio channels reported by `avCodecContext`.
+int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
+#if LIBAVFILTER_VERSION_MAJOR > 8 || \
+    (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
+  return avCodecContext->ch_layout.nb_channels;
+#else
+  return avCodecContext->channels;
+#endif
 }
 
 AVIOBytesContext::AVIOBytesContext(
 
@@ -139,7 +139,8 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
 int64_t getDuration(const UniqueAVFrame& frame);
 int64_t getDuration(const AVFrame* frame);
 
-int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext);
+int getNumChannels(const AVFrame* avFrame);
+int getNumChannels(const UniqueAVCodecContext& avCodecContext);
 
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();