Skip to content

Commit a9664c5

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into refactor_decoder_transforms
2 parents 5017760 + 38fa96c commit a9664c5

File tree

11 files changed

+181
-46
lines changed

11 files changed

+181
-46
lines changed

.github/workflows/build_ffmpeg.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,33 @@ jobs:
4848
mkdir -p "${artifact_dir}"
4949
mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
5050
51+
LGPL-Linux-aarch64:
52+
strategy:
53+
fail-fast: false
54+
matrix:
55+
ffmpeg-version: ["4.4.4", "5.1.4", "6.1.1", "7.0.1", "8.0"]
56+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
57+
permissions:
58+
id-token: write
59+
contents: read
60+
with:
61+
job-name: Build
62+
upload-artifact: ffmpeg-lgpl-linux_aarch64-${{ matrix.ffmpeg-version }}
63+
repository: meta-pytorch/torchcodec
64+
runner: linux.arm64.2xlarge
65+
docker-image: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64
66+
script: |
67+
export FFMPEG_VERSION="${{ matrix.ffmpeg-version }}"
68+
export FFMPEG_ROOT="${PWD}/ffmpeg"
69+
70+
packaging/build_ffmpeg.sh
71+
72+
tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
73+
74+
artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux_aarch64"
75+
mkdir -p "${artifact_dir}"
76+
mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
77+
5178
LGPL-macOS:
5279
strategy:
5380
fail-fast: false
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
name: Build and test Linux CUDA aarch64 wheels
2+
3+
on:
4+
pull_request:
5+
push:
6+
branches:
7+
- nightly
8+
- main
9+
- release/*
10+
tags:
11+
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
12+
workflow_dispatch:
13+
14+
concurrency:
15+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
16+
cancel-in-progress: true
17+
18+
permissions:
19+
id-token: write
20+
contents: write
21+
22+
defaults:
23+
run:
24+
shell: bash -l -eo pipefail {0}
25+
26+
jobs:
27+
generate-matrix:
28+
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
29+
with:
30+
package-type: wheel
31+
os: linux-aarch64
32+
test-infra-repository: pytorch/test-infra
33+
test-infra-ref: main
34+
with-cpu: disable
35+
with-xpu: disable
36+
with-rocm: disable
37+
with-cuda: enable
38+
build-python-only: "disable"
39+
build:
40+
needs: generate-matrix
41+
strategy:
42+
fail-fast: false
43+
name: Build and Upload wheel
44+
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
45+
with:
46+
repository: meta-pytorch/torchcodec
47+
ref: ""
48+
test-infra-repository: pytorch/test-infra
49+
test-infra-ref: main
50+
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
51+
pre-script: packaging/pre_build_script.sh
52+
post-script: packaging/post_build_script.sh
53+
smoke-test-script: packaging/fake_smoke_test.py
54+
package-name: torchcodec
55+
trigger-event: ${{ github.event_name }}
56+
architecture: aarch64
57+
build-platform: "python-build-package"
58+
build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 ENABLE_CUDA=1 python -m build --wheel -vvv --no-isolation"

docs/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ sphinx==5.0.0
33
sphinx_design
44
sphinx_copybutton
55
sphinx-tabs
6+
sphinx-sitemap
67
matplotlib
78
torchvision
89
ipython

docs/source/api_ref_transforms.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
1414
:template: dataclass.rst
1515

1616
DecoderTransform
17+
RandomCrop
1718
Resize

docs/source/conf.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"sphinx_tabs.tabs",
5656
"sphinx_design",
5757
"sphinx_copybutton",
58+
"sphinx_sitemap",
5859
]
5960

6061

@@ -216,6 +217,15 @@ def __call__(self, filename):
216217
"matplotlib": ("https://matplotlib.org/stable/", None),
217218
}
218219

220+
# sitemap config
221+
html_baseurl = "https://meta-pytorch.org/torchcodec/stable/"
222+
sitemap_locales = [None]
223+
sitemap_excludes = [
224+
"search.html",
225+
"genindex.html",
226+
]
227+
sitemap_url_scheme = "{link}"
228+
219229

220230
def inject_minigalleries(app, what, name, obj, options, lines):
221231
"""Inject a minigallery into a docstring.

examples/decoding/approximate_mode.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666
# Performance: ``VideoDecoder`` creation
6767
# --------------------------------------
6868
#
69-
# In terms of performance, the ``seek_mode`` parameter ultimately affects the
69+
# In terms of performance, the ``seek_mode`` parameter mainly affects the
7070
# **creation** of a :class:`~torchcodec.decoders.VideoDecoder` object. The
7171
# longer the video, the higher the performance gain.
7272

@@ -104,7 +104,7 @@ def bench(f, average_over=50, warmup=2, **f_kwargs):
104104
# ---------------------------------------------
105105
#
106106
# Strictly speaking the ``seek_mode`` parameter only affects the performance of
107-
# the :class:`~torchcodec.decoders.VideoDecoder` creation. It does not have a
107+
# the :class:`~torchcodec.decoders.VideoDecoder` creation. It usually does not have a
108108
# direct effect on the performance of frame decoding or sampling. **However**,
109109
# because frame decoding and sampling patterns typically involve the creation of
110110
# the :class:`~torchcodec.decoders.VideoDecoder` (one per video), ``seek_mode``
@@ -168,20 +168,21 @@ def sample_clips(seek_mode):
168168
# duration), and also builds an internal index of frames and key-frames. This
169169
# internal index is potentially more accurate than the one in the file's
170170
# headers, which leads to more accurate seeking behavior.
171-
# Without the scan, TorchCodec relies only on the metadata contained in the
172-
# file, which may not always be as accurate.
171+
# Without the scan (in approximate mode), TorchCodec relies only on the metadata
172+
# contained in the file, which may not always be as accurate. In some rare
173+
# cases, relying on this less accurate data may also lead to slower frame
174+
# decoding, because it can involve unnecessary seeks.
173175
#
174176
# Which mode should I use?
175177
# ------------------------
176178
#
177179
# The general rule of thumb is as follows:
178180
#
179181
# - If you really care about exactness of frame seeking, use "exact".
180-
# - If you can sacrifice exactness of seeking for speed, which is usually the
181-
# case when doing clip sampling, use "approximate".
182-
# - If your videos don't have variable framerate and their metadata is correct,
183-
# then "approximate" mode is a net win: it will be just as accurate as the
184-
# "exact" mode while still being significantly faster.
182+
# - If your videos are short (less than a few minutes) then "exact" will usually
183+
# be preferable, as the scan's fixed cost will be negligible.
184+
# - For long videos, if you can sacrifice exactness of seeking for speed, which
185+
# is usually the case when doing clip sampling, consider using "approximate".
185186

186187
# %%
187188
shutil.rmtree(temp_dir)

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,13 +1094,6 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
10941094
// Returns true if we can avoid seeking in the AVFormatContext based on
10951095
// heuristics that rely on the target cursor_ and the last decoded frame.
10961096
// Seeking is expensive, so we try to avoid it when possible.
1097-
// Note that this function itself isn't always that cheap to call: in
1098-
// particular the calls to getKeyFrameIndexForPts below in approximate mode
1099-
// are sometimes slow.
1100-
// TODO we should understand why (is it because it reads the file?) and
1101-
// potentially optimize it. E.g. we may not want to ever seek, or even *check*
1102-
// if we need to seek in some cases, like if we're going to decode 80% of the
1103-
// frames anyway.
11041097
const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
11051098
if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
11061099
// For audio, we only need to seek if a backwards seek was requested
@@ -1147,10 +1140,10 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
11471140
// I P P P I P P P I P P I P
11481141
// x j y
11491142
// (2) is only more efficient than (1) if there is an I frame between x and y.
1150-
int lastKeyFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts_);
1151-
int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_);
1152-
return lastKeyFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
1153-
lastKeyFrameIndex == targetKeyFrameIndex;
1143+
int lastKeyFrame = getKeyFrameIdentifier(lastDecodedAvFramePts_);
1144+
int targetKeyFrame = getKeyFrameIdentifier(cursor_);
1145+
return lastKeyFrame >= 0 && targetKeyFrame >= 0 &&
1146+
lastKeyFrame == targetKeyFrame;
11541147
}
11551148

11561149
// This method looks at currentPts and desiredPts and seeks in the
@@ -1367,7 +1360,19 @@ torch::Tensor SingleStreamDecoder::maybePermuteHWC2CHW(
13671360
// PTS <-> INDEX CONVERSIONS
13681361
// --------------------------------------------------------------------------
13691362

1370-
int SingleStreamDecoder::getKeyFrameIndexForPts(int64_t pts) const {
1363+
int SingleStreamDecoder::getKeyFrameIdentifier(int64_t pts) const {
1364+
// This function "identifies" a key frame for a given pts value.
1365+
// We use the term "identifier" rather than "index" because the nature of the
1366+
// index that is returned depends on various factors:
1367+
// - If seek_mode is exact, we return the index of the key frame in the
1368+
// scanned key-frame vector (streamInfo.keyFrames). So the returned value is
1369+
// in [0, num_key_frames).
1370+
// - If seek_mode is approximate, we use av_index_search_timestamp() which
1371+
// may return a value in [0, num_key_frames) like for mkv, but also a value
1372+
// in [0, num_frames) like for mp4. It really depends on the container.
1373+
//
1374+
// The range of the "identifier" doesn't matter that much, for now we only
1375+
// use it to uniquely identify a key frame in canWeAvoidSeeking().
13711376
const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
13721377
if (streamInfo.keyFrames.empty()) {
13731378
return av_index_search_timestamp(

src/torchcodec/_core/SingleStreamDecoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ class SingleStreamDecoder {
282282
// PTS <-> INDEX CONVERSIONS
283283
// --------------------------------------------------------------------------
284284

285-
int getKeyFrameIndexForPts(int64_t pts) const;
285+
int getKeyFrameIdentifier(int64_t pts) const;
286286

287287
// Returns the key frame index of the presentation timestamp using our index.
288288
// We build this index by scanning the file in

src/torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake

Lines changed: 52 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,31 +15,59 @@ set(
1515
)
1616

1717
if (LINUX)
18-
set(
19-
platform_url
20-
${base_url}/linux_x86_64
21-
)
18+
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
19+
set(
20+
platform_url
21+
${base_url}/linux_aarch64
22+
)
2223

23-
set(
24-
f4_sha256
25-
1a083f1922443bedb5243d04896383b8c606778a7ddb9d886c8303e55339fe0c
26-
)
27-
set(
28-
f5_sha256
29-
65d6ad54082d94dcb3f801d73df2265e0e1bb303c7afbce7723e3b77ccd0e207
30-
)
31-
set(
32-
f6_sha256
33-
8bd5939c2f4a4b072e837e7870c13fe7d13824e5ff087ab534e4db4e90b7be9c
34-
)
35-
set(
36-
f7_sha256
37-
1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
38-
)
39-
set(
40-
f8_sha256
41-
c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
42-
)
24+
set(
25+
f4_sha256
26+
a310a2ed9ffe555fd3278dae15065541098dd35e124564671dcda6a6620ac842
27+
)
28+
set(
29+
f5_sha256
30+
89ca7996bccbc2db49adaa401d20fdbabffe0e1b4e07a0f81d6b143e858b7c8d
31+
)
32+
set(
33+
f6_sha256
34+
ae44c67b4587d061b8e9cc8990ca891ee013fe52ad79e5016ba29871562621da
35+
)
36+
set(
37+
f7_sha256
38+
948e2cac66ca6f68ff526d5e84138e94bce0f1a7c83f502d15d85d0bd3ddc112
39+
)
40+
set(
41+
f8_sha256
42+
b9cfd99ae75a14e58300854967d4dc49de0b3daa551df51ea1f52a3f08d2c8af
43+
)
44+
elseif (LINUX) # assume x86_64
45+
set(
46+
platform_url
47+
${base_url}/linux_x86_64
48+
)
49+
50+
set(
51+
f4_sha256
52+
1a083f1922443bedb5243d04896383b8c606778a7ddb9d886c8303e55339fe0c
53+
)
54+
set(
55+
f5_sha256
56+
65d6ad54082d94dcb3f801d73df2265e0e1bb303c7afbce7723e3b77ccd0e207
57+
)
58+
set(
59+
f6_sha256
60+
8bd5939c2f4a4b072e837e7870c13fe7d13824e5ff087ab534e4db4e90b7be9c
61+
)
62+
set(
63+
f7_sha256
64+
1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
65+
)
66+
set(
67+
f8_sha256
68+
c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
69+
)
70+
endif()
4371
elseif (APPLE)
4472
set(
4573
platform_url

src/torchcodec/_internally_replaced_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import importlib
8+
import importlib.util
89
import sys
910
from pathlib import Path
1011
from types import ModuleType

0 commit comments

Comments
 (0)