Skip to content

Commit b639a46

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into mac_wheels_ci
2 parents b4fff9a + c91e33e commit b639a46

File tree

78 files changed

+2972
-567
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+2972
-567
lines changed

.github/ISSUE_TEMPLATE/bug-report.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ body:
1818
# All necessary imports at the beginning
1919
import torch
2020
import torchcodec
21-
from torchcodec.decoders import SimpleVideoDecoder
21+
from torchcodec.decoders import VideoDecoder
2222
2323
# A succinct reproducing example trimmed down to the essential parts:
24-
decoder = SimpleVideoDecoder("path/to/video.mp4") # Help! This fails!
24+
decoder = VideoDecoder("path/to/video.mp4") # Help! This fails!
2525
# ...
2626
```
2727

.github/workflows/cuda_tests.yaml

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Builds FFmpeg with CUDA support from source, builds TorchCodec against it,
# then runs the test suite and the GPU decoding benchmarks on a GPU runner.
name: Test on Linux CUDA

on:
  pull_request:
  push:
    branches:
      - nightly
      - main
      - release/*
  workflow_dispatch:

jobs:
  tests:
    strategy:
      matrix:
        python_version: ["3.9"]
        # TODO: Add more cuda versions.
        cuda_arch_version: ["12.4"]
        # TODO: Get ffmpeg 4 to work. Currently fails to build with nvcc.
        ffmpeg_version: ["origin/release/5.1", "origin/release/6.1", "origin/release/7.1"]
      fail-fast: false
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      repository: pytorch/torchcodec
      gpu-arch-type: cuda
      gpu-arch-version: ${{ matrix.cuda_arch_version }}
      timeout: 120

      script: |
        echo '::group::Install prereqs'
        nvidia-smi
        conda create --yes --name test python=${{ matrix.python_version }}
        conda activate test
        conda install --quiet --yes pip cmake pkg-config nasm
        # Derive the wheel index suffix from the matrix (e.g. "12.4" -> "cu124") so
        # the installed PyTorch build always matches the CUDA version under test.
        cuda_tag="cu$(echo "${{ matrix.cuda_arch_version }}" | tr -d '.')"
        pip install --quiet --pre torch torchvision --index-url "https://download.pytorch.org/whl/nightly/${cuda_tag}"
        conda install --quiet --yes nvidia::libnpp
        echo '::endgroup::'

        echo '::group::Build FFMPEG'
        # Build and install FFMPEG from source with CUDA enabled.
        # The one on conda doesn't have CUDA enabled.
        # Sub-step: install nvidia headers. Reference this link for details:
        # https://docs.nvidia.com/video-technologies/video-codec-sdk/12.1/ffmpeg-with-nvidia-gpu/index.html
        git clone --quiet https://git.videolan.org/git/ffmpeg/nv-codec-headers.git

        pushd nv-codec-headers
        make --silent PREFIX=$CONDA_PREFIX -j install
        popd

        # Now build FFMPEG from source with CUDA enabled.
        git clone --quiet https://git.ffmpeg.org/ffmpeg.git ffmpeg/
        pushd ffmpeg
        git checkout ${{ matrix.ffmpeg_version }}
        which pkg-config
        pkg-config --list-all
        ./configure --prefix=$CONDA_PREFIX --enable-nonfree --enable-cuda-nvcc --disable-static --enable-shared --optflags=-fno-omit-frame-pointer --disable-stripping --enable-cuvid --enable-rpath
        make --silent -j install
        popd
        echo '::endgroup::'

        echo '::group::Build TorchCodec'
        CMAKE_BUILD_PARALLEL_LEVEL=8 CXXFLAGS="" LDFLAGS="-Wl,--allow-shlib-undefined -Wl,-rpath,$CONDA_PREFIX/lib -Wl,-rpath-link,$CONDA_PREFIX/lib -L$CONDA_PREFIX/lib" CMAKE_BUILD_TYPE=Release ENABLE_CUDA=1 ENABLE_NVTX=1 pip install -e ".[dev]" --no-build-isolation -vv --debug
        echo '::endgroup::'

        echo '::group::Test TorchCodec'
        # Ensure our compiled ffmpeg binary is in the path.
        which ffmpeg
        # We skip certain tests because they are not relevant to GPU decoding and they always fail with
        # a custom FFMPEG build.
        pytest -k "not (test_get_metadata or get_ffmpeg_version)" -vvv
        echo '::endgroup::'

        echo '::group::Run GPU benchmarks'
        # Single-threaded run first, then a multi-threaded run over many videos.
        python benchmarks/decoders/gpu_benchmark.py --devices=cuda:0 --resize_devices=none
        python benchmarks/decoders/gpu_benchmark.py --devices=cuda:0 --resize_devices=none --num_threads=5 --num_videos=100
        echo '::endgroup::'
        conda deactivate

.github/workflows/linux_wheel.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,4 +124,4 @@ jobs:
124124
python test/decoders/manual_smoke_test.py
125125
- name: Run Python tests
126126
run: |
127-
pytest test
127+
pytest test -vvv

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ detailed example, [check out our
3131
documentation](https://pytorch.org/torchcodec/stable/generated_examples/)!
3232

3333
```python
34-
from torchcodec.decoders import SimpleVideoDecoder
34+
from torchcodec.decoders import VideoDecoder
3535

36-
decoder = SimpleVideoDecoder("path/to/video.mp4")
36+
decoder = VideoDecoder("path/to/video.mp4")
3737

3838
decoder.metadata
3939
# VideoStreamMetadata:

benchmarks/decoders/BenchmarkDecodersMain.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,8 @@ void runNDecodeIterationsWithCustomOps(
145145
/*height=*/std::nullopt,
146146
/*thread_count=*/std::nullopt,
147147
/*dimension_order=*/std::nullopt,
148-
/*stream_index=*/std::nullopt);
148+
/*stream_index=*/std::nullopt,
149+
/*device=*/std::nullopt);
149150

150151
for (double pts : ptsList) {
151152
seekFrameOp.call(decoderTensor, pts);

benchmarks/decoders/benchmark_decoders.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,11 @@
1010
import json
1111
import os
1212
import timeit
13+
from pathlib import Path
1314

1415
import torch
1516
import torch.utils.benchmark as benchmark
16-
from torchcodec.decoders import SimpleVideoDecoder
17+
from torchcodec.decoders import VideoDecoder
1718

1819
from torchcodec.decoders._core import (
1920
_add_video_stream,
@@ -206,10 +207,10 @@ def get_frames_from_video(self, video_file, pts_list):
206207
metadata = json.loads(get_json_metadata(decoder))
207208
average_fps = metadata["averageFps"]
208209
best_video_stream = metadata["bestVideoStreamIndex"]
209-
indexes_list = [int(pts * average_fps) for pts in pts_list]
210+
indices_list = [int(pts * average_fps) for pts in pts_list]
210211
frames = []
211212
frames = get_frames_at_indices(
212-
decoder, stream_index=best_video_stream, frame_indices=indexes_list
213+
decoder, stream_index=best_video_stream, frame_indices=indices_list
213214
)
214215
return frames
215216

@@ -303,9 +304,8 @@ def get_test_resource_path(filename: str) -> str:
303304
resource = importlib.resources.files(__package__).joinpath(filename)
304305
with importlib.resources.as_file(resource) as path:
305306
return os.fspath(path)
306-
return os.path.join(
307-
os.path.dirname(__file__), "..", "..", "test", "resources", filename
308-
)
307+
308+
return str(Path(__file__).parent / f"../../test/resources/{filename}")
309309

310310

311311
def create_torchcodec_decoder_from_file(video_file):
@@ -404,9 +404,9 @@ def main() -> None:
404404
results = []
405405
for decoder_name, decoder in decoder_dict.items():
406406
for video_path in args.bm_video_paths.split(","):
407-
# We only use the SimpleVideoDecoder to get the metadata and get
407+
# We only use the VideoDecoder to get the metadata and get
408408
# the list of PTS values to seek to.
409-
simple_decoder = SimpleVideoDecoder(video_path)
409+
simple_decoder = VideoDecoder(video_path)
410410
duration = simple_decoder.metadata.duration_seconds
411411
pts_list = [
412412
i * duration / num_uniform_samples for i in range(num_uniform_samples)
@@ -453,7 +453,7 @@ def main() -> None:
453453

454454
first_video_path = args.bm_video_paths.split(",")[0]
455455
if args.bm_video_creation:
456-
simple_decoder = SimpleVideoDecoder(first_video_path)
456+
simple_decoder = VideoDecoder(first_video_path)
457457
metadata = simple_decoder.metadata
458458
metadata_string = f"{metadata.codec} {metadata.width}x{metadata.height}, {metadata.duration_seconds}s {metadata.average_fps}fps"
459459
creation_result = benchmark.Timer(
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import argparse
2+
import time
3+
from concurrent.futures import ThreadPoolExecutor
4+
from pathlib import Path
5+
6+
import torch
7+
8+
import torch.utils.benchmark as benchmark
9+
10+
import torchcodec
11+
import torchvision.transforms.v2.functional as F
12+
13+
RESIZED_WIDTH = 256
14+
RESIZED_HEIGHT = 256
15+
16+
17+
def transfer_and_resize_frame(frame, resize_device_string):
    """Move ``frame`` to ``resize_device_string`` and resize it.

    The frame is resized to (RESIZED_HEIGHT, RESIZED_WIDTH) and the resized
    frame is returned.
    """
    # The transfer should be a no-op if the frame is already on the target device.
    on_device = frame.to(resize_device_string)
    target_size = (RESIZED_HEIGHT, RESIZED_WIDTH)
    return F.resize(on_device, target_size)
22+
23+
24+
def decode_full_video(video_path, decode_device_string, resize_device_string):
    """Decode every frame of a video, optionally resizing each one, and time it.

    Args:
        video_path: Path of the video file to decode.
        decode_device_string: Device handed to the decoder, e.g. "cpu" or
            "cuda:0".
        resize_device_string: Resize mode. A value containing "native" asks
            the decoder itself for RESIZED_WIDTH x RESIZED_HEIGHT frames;
            "none" disables resizing; any other value is treated as the
            device each decoded frame is moved to and resized on.

    Returns:
        A ``(frame_count, elapsed_seconds)`` tuple for the decode loop.
    """
    # We use the core API instead of VideoDecoder because the core API
    # allows us to natively resize as part of the decode step.
    print(f"{decode_device_string=} {resize_device_string=}")
    decoder = torchcodec.decoders._core.create_from_file(video_path)

    # NOTE(review): a single decoder thread is requested for CUDA decoding;
    # the reason is not visible here -- confirm before changing.
    num_threads = 1 if "cuda" in decode_device_string else None

    # Only "native" resizing passes target dimensions to the decoder.
    native_resize = "native" in resize_device_string
    width = RESIZED_WIDTH if native_resize else None
    height = RESIZED_HEIGHT if native_resize else None

    torchcodec.decoders._core._add_video_stream(
        decoder,
        stream_index=-1,
        device=decode_device_string,
        num_threads=num_threads,
        width=width,
        height=height,
    )

    # Loop-invariant: whether each frame needs an explicit transfer + resize.
    resize_after_decode = resize_device_string != "none" and not native_resize

    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    start_time = time.perf_counter()
    frame_count = 0
    while True:
        try:
            frame, *_ = torchcodec.decoders._core.get_next_frame(decoder)
            if resize_after_decode:
                frame = transfer_and_resize_frame(frame, resize_device_string)

            frame_count += 1
        except Exception as e:
            # There is no explicit end-of-stream check: the loop runs until
            # decoding (or resizing) raises, presumably at end of stream.
            # NOTE(review): narrow this once the end-of-stream exception type
            # is confirmed, so real failures are not reported as a short video.
            print("EXCEPTION", e)
            break

    elapsed = time.perf_counter() - start_time
    fps = frame_count / elapsed
    print(
        f"****** DECODED full video {decode_device_string=} {frame_count=} {elapsed=} {fps=}"
    )
    return frame_count, elapsed
66+
67+
68+
def decode_videos_using_threads(
    video_path,
    decode_device_string,
    resize_device_string,
    num_videos,
    num_threads,
    use_multiple_gpus,
):
    """Decode the same video ``num_videos`` times on a pool of worker threads.

    Args:
        video_path: Path of the video file every task decodes.
        decode_device_string: Device to decode on, e.g. "cpu" or "cuda:0".
        resize_device_string: Resize mode forwarded to ``decode_full_video``.
        num_videos: Number of decode tasks to submit.
        num_threads: Maximum number of worker threads in the pool.
        use_multiple_gpus: When decoding on CUDA, assign tasks round-robin
            to every GPU reported by ``torch.cuda.device_count()`` instead of
            only the device named in ``decode_device_string``.

    Raises:
        Exception: Re-raises the first exception raised by a decode task, so
            that failed work is not silently included in benchmark timings.
    """
    spread_over_gpus = "cuda" in decode_device_string and use_multiple_gpus
    # Query the GPU count once rather than for every submitted task.
    gpu_count = torch.cuda.device_count() if spread_over_gpus else 0

    futures = []
    # The context manager waits for all tasks and always releases the threads,
    # even if submitting a task fails part-way through.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for i in range(num_videos):
            actual_decode_device = decode_device_string
            if spread_over_gpus:
                actual_decode_device = f"cuda:{i % gpu_count}"
            futures.append(
                executor.submit(
                    decode_full_video,
                    video_path,
                    actual_decode_device,
                    resize_device_string,
                )
            )

    # Surface worker failures; discarded futures would hide them.
    for future in futures:
        future.result()
85+
86+
87+
def main():
    """Command-line entry point for the GPU decode/resize benchmark.

    Parses the command-line flags, then either:

    * with ``--no-use_torch_benchmark``: decodes the video once per
      (decode device, resize device) pair without warmup, or
    * otherwise: times every (decode device, resize device) pair with
      ``torch.utils.benchmark`` and prints a comparison table. With
      ``--num_threads`` greater than 1 the timed statement decodes
      ``--num_videos`` videos on a thread pool.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--devices",
        default="cuda:0,cpu",
        type=str,
        help="Comma-separated devices to test decoding on.",
    )
    parser.add_argument(
        "--resize_devices",
        default="cuda:0,cpu,native,none",
        type=str,
        help="Comma-separated devices to test preproc (resize) on. Use 'none' to specify no resize.",
    )
    parser.add_argument(
        "--video",
        type=str,
        default=str(Path(__file__).parent / "../../test/resources/nasa_13013.mp4"),
    )
    parser.add_argument(
        "--use_torch_benchmark",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Use pytorch benchmark to measure decode time with warmup and "
            "autorange. Without this we just run one iteration without warmup "
            "to measure the cold start time."
        ),
    )
    parser.add_argument(
        "--num_threads",
        type=int,
        default=1,
        help="Number of threads to use for decoding. Only used when --use_torch_benchmark is set.",
    )
    parser.add_argument(
        "--num_videos",
        type=int,
        default=50,
        help="Number of videos to decode in parallel. Only used when --num_threads is set.",
    )
    parser.add_argument(
        "--use_multiple_gpus",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=("Use multiple GPUs to decode multiple videos in multi-threaded mode."),
    )
    args = parser.parse_args()
    video_path = args.video

    # Drop empty entries (e.g. from a trailing comma); default to no resize.
    resize_devices = [d for d in args.resize_devices.split(",") if d != ""]
    if not resize_devices:
        resize_devices.append("none")

    if not args.use_torch_benchmark:
        # Cold-start measurement: a single un-warmed pass per configuration.
        # decode_full_video requires the resize device as well, so every
        # requested resize mode is exercised here too.
        for device in args.devices.split(","):
            print("Testing on", device)
            for resize_device in resize_devices:
                decode_full_video(video_path, device, resize_device)
        return

    label = "Decode+Resize Time"
    video_name = Path(video_path).name
    threaded = args.num_threads > 1

    results = []
    for decode_device_string in args.devices.split(","):
        for resize_device_string in resize_devices:
            # Shorten e.g. "cuda:0" to "cuda" for the results table.
            decode_label = (
                "cuda" if "cuda" in decode_device_string else decode_device_string
            )
            resize_label = (
                "cuda" if "cuda" in resize_device_string else resize_device_string
            )
            print("decode_device", decode_device_string)
            print("resize_device", resize_device_string)

            # Names the timed statement needs in both modes.
            timer_globals = {
                "decode_device_string": decode_device_string,
                "video_path": video_path,
                "decode_full_video": decode_full_video,
                "resize_device_string": resize_device_string,
            }
            if threaded:
                stmt = "decode_videos_using_threads(video_path, decode_device_string, resize_device_string, num_videos, num_threads, use_multiple_gpus)"
                timer_globals.update(
                    {
                        "decode_videos_using_threads": decode_videos_using_threads,
                        "num_videos": args.num_videos,
                        "num_threads": args.num_threads,
                        "use_multiple_gpus": args.use_multiple_gpus,
                    }
                )
                description = f"threads={args.num_threads} work={args.num_videos} video={video_name}"
                sub_label = f"D={decode_label} R={resize_label} T={args.num_threads} W={args.num_videos}"
            else:
                stmt = "decode_full_video(video_path, decode_device_string, resize_device_string)"
                description = f"video={video_name}"
                sub_label = f"D={decode_label} R={resize_label}"

            t = benchmark.Timer(
                stmt=stmt,
                globals=timer_globals,
                label=label,
                description=description,
                sub_label=sub_label,
            ).blocked_autorange()
            results.append(t)

    compare = benchmark.Compare(results)
    compare.print()
    print("Key: D=Decode, R=Resize T=threads W=work (number of videos to decode)")
    print("Native resize is done as part of the decode step")
    print("none resize means there is no resize step -- native or otherwise")
200+
201+
202+
if __name__ == "__main__":
203+
main()

0 commit comments

Comments
 (0)