Merge branch 'main' of github.com:pytorch/torchcodec into refactor_decoder_transforms

scotts · scotts · commit a9664c5f7e6c · 2025-12-03T12:32:38.000-08:00
diff --git a/.github/workflows/build_ffmpeg.yaml b/.github/workflows/build_ffmpeg.yaml
@@ -48,6 +48,33 @@ jobs:
         mkdir -p "${artifact_dir}"
         mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
 
+  LGPL-Linux-aarch64:
+    strategy:
+      fail-fast: false
+      matrix:
+        ffmpeg-version: ["4.4.4", "5.1.4", "6.1.1", "7.0.1", "8.0"]
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      job-name: Build
+      upload-artifact: ffmpeg-lgpl-linux_aarch64-${{ matrix.ffmpeg-version }}
+      repository: meta-pytorch/torchcodec
+      runner: linux.arm64.2xlarge
+      docker-image: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64
+      script: |
+        export FFMPEG_VERSION="${{ matrix.ffmpeg-version }}"
+        export FFMPEG_ROOT="${PWD}/ffmpeg"
+
+        packaging/build_ffmpeg.sh
+
+        tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
+
+        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux_aarch64"
+        mkdir -p "${artifact_dir}"
+        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
+
   LGPL-macOS:
     strategy:
       fail-fast: false
diff --git a/.github/workflows/linux_cuda_aarch64_wheel.yaml b/.github/workflows/linux_cuda_aarch64_wheel.yaml
@@ -0,0 +1,58 @@
+name: Build and test Linux CUDA aarch64 wheels
+
+on:
+  pull_request:
+  push:
+    branches:
+      - nightly
+      - main
+      - release/*
+    tags:
+        - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: write
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  generate-matrix:
+    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    with:
+      package-type: wheel
+      os: linux-aarch64
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      with-cpu: disable
+      with-xpu: disable
+      with-rocm: disable
+      with-cuda: enable
+      build-python-only: "disable"
+  build:
+    needs: generate-matrix
+    strategy:
+      fail-fast: false
+    name: Build and Upload wheel
+    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
+    with:
+      repository: meta-pytorch/torchcodec
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      pre-script: packaging/pre_build_script.sh
+      post-script: packaging/post_build_script.sh
+      smoke-test-script: packaging/fake_smoke_test.py
+      package-name: torchcodec
+      trigger-event: ${{ github.event_name }}
+      architecture: aarch64
+      build-platform: "python-build-package"
+      build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 ENABLE_CUDA=1 python -m build --wheel -vvv --no-isolation"
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -3,6 +3,7 @@ sphinx==5.0.0
 sphinx_design
 sphinx_copybutton
 sphinx-tabs
+sphinx-sitemap
 matplotlib
 torchvision
 ipython
diff --git a/docs/source/api_ref_transforms.rst b/docs/source/api_ref_transforms.rst
@@ -14,4 +14,5 @@ For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
     :template: dataclass.rst
 
     DecoderTransform
+    RandomCrop
     Resize
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -55,6 +55,7 @@
     "sphinx_tabs.tabs",
     "sphinx_design",
     "sphinx_copybutton",
+    "sphinx_sitemap",
 ]
 
 
@@ -216,6 +217,15 @@ def __call__(self, filename):
     "matplotlib": ("https://matplotlib.org/stable/", None),
 }
 
+# sitemap config
+html_baseurl = "https://meta-pytorch.org/torchcodec/stable/"
+sitemap_locales = [None]
+sitemap_excludes = [
+    "search.html",
+    "genindex.html",
+]
+sitemap_url_scheme = "{link}"
+
 
 def inject_minigalleries(app, what, name, obj, options, lines):
     """Inject a minigallery into a docstring.
diff --git a/examples/decoding/approximate_mode.py b/examples/decoding/approximate_mode.py
@@ -66,7 +66,7 @@
 # Performance: ``VideoDecoder`` creation
 # --------------------------------------
 #
-# In terms of performance, the ``seek_mode`` parameter ultimately affects the
+# In terms of performance, the ``seek_mode`` parameter mainly affects the
 # **creation** of a :class:`~torchcodec.decoders.VideoDecoder` object. The
 # longer the video, the higher the performance gain.
 
@@ -104,7 +104,7 @@ def bench(f, average_over=50, warmup=2, **f_kwargs):
 # ---------------------------------------------
 #
 # Strictly speaking the ``seek_mode`` parameter only affects the performance of
-# the :class:`~torchcodec.decoders.VideoDecoder` creation. It does not have a
+# the :class:`~torchcodec.decoders.VideoDecoder` creation. It usually does not have a
 # direct effect on the performance of frame decoding or sampling.  **However**,
 # because frame decoding and sampling patterns typically involve the creation of
 # the :class:`~torchcodec.decoders.VideoDecoder` (one per video), ``seek_mode``
@@ -168,20 +168,21 @@ def sample_clips(seek_mode):
 # duration), and also builds an internal index of frames and key-frames. This
 # internal index is potentially more accurate than the one in the file's
 # headers, which leads to more accurate seeking behavior.
-# Without the scan, TorchCodec relies only on the metadata contained in the
-# file, which may not always be as accurate.
+# Without the scan (in approximate mode), TorchCodec relies only on the metadata
+# contained in the file, which may not always be as accurate. In some rare
+# cases, relying on this less accurate data may also lead to slower frame
+# decoding, because it can involve unnecessary seeks.
 #
 # Which mode should I use?
 # ------------------------
 #
 # The general rule of thumb is as follows:
 #
 # - If you really care about exactness of frame seeking, use "exact".
-# - If you can sacrifice exactness of seeking for speed, which is usually the
-#   case when doing clip sampling, use "approximate".
-# - If your videos don't have variable framerate and their metadata is correct,
-#   then "approximate" mode is a net win: it will be just as accurate as the
-#   "exact" mode while still being significantly faster.
+# - If your videos are short (less than a few minutes) then "exact" will usually
+#   be preferable, as the scan's fixed cost will be negligible.
+# - For long videos, if you can sacrifice exactness of seeking for speed, which
+#   is usually the case when doing clip sampling, consider using "approximate".
 
 # %%
 shutil.rmtree(temp_dir)
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1094,13 +1094,6 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
   // Returns true if we can avoid seeking in the AVFormatContext based on
   // heuristics that rely on the target cursor_ and the last decoded frame.
   // Seeking is expensive, so we try to avoid it when possible.
-  // Note that this function itself isn't always that cheap to call: in
-  // particular the calls to getKeyFrameIndexForPts below in approximate mode
-  // are sometimes slow.
-  // TODO we should understand why (is it because it reads the file?) and
-  // potentially optimize it. E.g. we may not want to ever seek, or even *check*
-  // if we need to seek in some cases, like if we're going to decode 80% of the
-  // frames anyway.
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     // For audio, we only need to seek if a backwards seek was requested
@@ -1147,10 +1140,10 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
   // I    P     P    P    I    P    P    P    I    P    P    I    P
   //                           x              j         y
   // (2) is only more efficient than (1) if there is an I frame between x and y.
-  int lastKeyFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts_);
-  int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_);
-  return lastKeyFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
-      lastKeyFrameIndex == targetKeyFrameIndex;
+  int lastKeyFrame = getKeyFrameIdentifier(lastDecodedAvFramePts_);
+  int targetKeyFrame = getKeyFrameIdentifier(cursor_);
+  return lastKeyFrame >= 0 && targetKeyFrame >= 0 &&
+      lastKeyFrame == targetKeyFrame;
 }
 
 // This method looks at currentPts and desiredPts and seeks in the
@@ -1367,7 +1360,19 @@ torch::Tensor SingleStreamDecoder::maybePermuteHWC2CHW(
 // PTS <-> INDEX CONVERSIONS
 // --------------------------------------------------------------------------
 
-int SingleStreamDecoder::getKeyFrameIndexForPts(int64_t pts) const {
+int SingleStreamDecoder::getKeyFrameIdentifier(int64_t pts) const {
+  // This function "identifies" a key frame for a given pts value.
+  // We use the term "identifier" rather than "index" because the nature of the
+  // index that is returned depends on various factors:
+  // - If seek_mode is exact, we return the index of the key frame in the
+  //   scanned key-frame vector (streamInfo.keyFrames). So the returned value is
+  //   in [0, num_key_frames).
+  // - If seek_mode is approximate, we use av_index_search_timestamp() which
+  //   may return a value in [0, num_key_frames) like for mkv, but also a value
+  //   in [0, num_frames) like for mp4. It really depends on the container.
+  //
+  //  The range of the "identifier" doesn't matter that much, for now we only
+  //  use it to uniquely identify a key frame in canWeAvoidSeeking().
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.keyFrames.empty()) {
     return av_index_search_timestamp(
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -282,7 +282,7 @@ class SingleStreamDecoder {
   // PTS <-> INDEX CONVERSIONS
   // --------------------------------------------------------------------------
 
-  int getKeyFrameIndexForPts(int64_t pts) const;
+  int getKeyFrameIdentifier(int64_t pts) const;
 
   // Returns the key frame index of the presentation timestamp using our index.
   // We build this index by scanning the file in
diff --git a/src/torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake b/src/torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake
@@ -15,31 +15,59 @@ set(
 )
 
 if (LINUX)
-    set(
-        platform_url
-        ${base_url}/linux_x86_64
-    )
+    if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
+        set(
+            platform_url
+            ${base_url}/linux_aarch64
+        )
 
-    set(
-        f4_sha256
-        1a083f1922443bedb5243d04896383b8c606778a7ddb9d886c8303e55339fe0c
-    )
-    set(
-        f5_sha256
-        65d6ad54082d94dcb3f801d73df2265e0e1bb303c7afbce7723e3b77ccd0e207
-    )
-    set(
-        f6_sha256
-        8bd5939c2f4a4b072e837e7870c13fe7d13824e5ff087ab534e4db4e90b7be9c
-    )
-    set(
-        f7_sha256
-        1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
-    )
-    set(
-        f8_sha256
-        c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
-    )
+        set(
+            f4_sha256
+            a310a2ed9ffe555fd3278dae15065541098dd35e124564671dcda6a6620ac842
+        )
+        set(
+            f5_sha256
+            89ca7996bccbc2db49adaa401d20fdbabffe0e1b4e07a0f81d6b143e858b7c8d
+        )
+        set(
+            f6_sha256
+            ae44c67b4587d061b8e9cc8990ca891ee013fe52ad79e5016ba29871562621da
+        )
+        set(
+            f7_sha256
+            948e2cac66ca6f68ff526d5e84138e94bce0f1a7c83f502d15d85d0bd3ddc112
+        )
+        set(
+            f8_sha256
+            b9cfd99ae75a14e58300854967d4dc49de0b3daa551df51ea1f52a3f08d2c8af
+        )
+    else()  # already inside if (LINUX): any non-ARM processor is assumed to be x86_64
+        set(
+            platform_url
+            ${base_url}/linux_x86_64
+        )
+
+        set(
+            f4_sha256
+            1a083f1922443bedb5243d04896383b8c606778a7ddb9d886c8303e55339fe0c
+        )
+        set(
+            f5_sha256
+            65d6ad54082d94dcb3f801d73df2265e0e1bb303c7afbce7723e3b77ccd0e207
+        )
+        set(
+            f6_sha256
+            8bd5939c2f4a4b072e837e7870c13fe7d13824e5ff087ab534e4db4e90b7be9c
+        )
+        set(
+            f7_sha256
+            1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
+        )
+        set(
+            f8_sha256
+            c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
+        )
+    endif()
 elseif (APPLE)
     set(
         platform_url
diff --git a/src/torchcodec/_internally_replaced_utils.py b/src/torchcodec/_internally_replaced_utils.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import importlib
+import importlib.util
 import sys
 from pathlib import Path
 from types import ModuleType
diff --git a/src/torchcodec/transforms/_decoder_transforms.py b/src/torchcodec/transforms/_decoder_transforms.py
@@ -113,6 +113,9 @@ def _make_transform_spec(
     ) -> str:
         return f"resize, {self.size[0]}, {self.size[1]}"
 
+    def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
+        return f"resize, {self.size[0]}, {self.size[1]}"
+
     def _get_output_dims(self) -> Optional[Tuple[Optional[int], Optional[int]]]:
         return (self.size[0], self.size[1])