
Commit 329e130

Merge branch 'main' of github.com:pytorch/torchcodec into fallback-expose
2 parents: 0c7cafb + 262c457

File tree: 3 files changed, +335 / -161 lines
Lines changed: 164 additions & 0 deletions (new file)
import math
from argparse import ArgumentParser
from pathlib import Path
from time import perf_counter_ns

import torch
from torch import Tensor
from torchcodec._core import add_video_stream, create_from_file, get_frames_by_pts
from torchcodec.decoders import VideoDecoder
from torchvision.transforms import v2

DEFAULT_NUM_EXP = 20


def bench(f, *args, num_exp=DEFAULT_NUM_EXP, warmup=1) -> Tensor:
    # Untimed warmup iterations.
    for _ in range(warmup):
        f(*args)

    # Time each run in nanoseconds.
    times = []
    for _ in range(num_exp):
        start = perf_counter_ns()
        f(*args)
        end = perf_counter_ns()
        times.append(end - start)
    return torch.tensor(times).float()


def report_stats(times: Tensor, unit: str = "ms", prefix: str = "") -> None:
    # Conversion factor from nanoseconds to the requested unit.
    mul = {
        "ns": 1,
        "µs": 1e-3,
        "ms": 1e-6,
        "s": 1e-9,
    }[unit]
    times = times * mul
    std = times.std().item()
    med = times.median().item()
    mean = times.mean().item()
    min = times.min().item()
    max = times.max().item()
    print(
        f"{prefix:<45} {med = :.2f}, {mean = :.2f} +- {std:.2f}, {min = :.2f}, {max = :.2f} - in {unit}"
    )


def torchvision_resize(
    path: Path, pts_seconds: list[float], dims: tuple[int, int]
) -> Tensor:
    # Decode at full resolution, then resize with torchvision.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder)
    raw_frames, *_ = get_frames_by_pts(decoder, timestamps=pts_seconds)
    return v2.functional.resize(raw_frames, size=dims)


def torchvision_crop(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], x: int, y: int
) -> Tensor:
    # Decode at full resolution, then crop with torchvision.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder)
    raw_frames, *_ = get_frames_by_pts(decoder, timestamps=pts_seconds)
    return v2.functional.crop(raw_frames, top=y, left=x, height=dims[0], width=dims[1])


def decoder_native_resize(
    path: Path, pts_seconds: list[float], dims: tuple[int, int]
) -> Tensor:
    # Resize inside the decoder via transform_specs.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder, transform_specs=f"resize, {dims[0]}, {dims[1]}")
    return get_frames_by_pts(decoder, timestamps=pts_seconds)[0]


def decoder_native_crop(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], x: int, y: int
) -> Tensor:
    # Crop inside the decoder via transform_specs.
    decoder = create_from_file(str(path), seek_mode="approximate")
    add_video_stream(decoder, transform_specs=f"crop, {dims[0]}, {dims[1]}, {x}, {y}")
    return get_frames_by_pts(decoder, timestamps=pts_seconds)[0]


def main():
    parser = ArgumentParser()
    parser.add_argument("--path", type=str, help="path to file", required=True)
    parser.add_argument(
        "--num-exp",
        type=int,
        default=DEFAULT_NUM_EXP,
        help="number of runs to average over",
    )

    args = parser.parse_args()
    path = Path(args.path)

    metadata = VideoDecoder(path).metadata
    duration = metadata.duration_seconds

    print(
        f"Benchmarking {path.name}, duration: {duration}, codec: {metadata.codec}, averaging over {args.num_exp} runs:"
    )

    input_height = metadata.height
    input_width = metadata.width
    fraction_of_total_frames_to_sample = [0.005, 0.01, 0.05, 0.1]
    fraction_of_input_dimensions = [0.5, 0.25, 0.125]

    for num_fraction in fraction_of_total_frames_to_sample:
        num_frames_to_sample = math.ceil(metadata.num_frames * num_fraction)
        print(
            f"Sampling {num_fraction * 100}%, {num_frames_to_sample}, of {metadata.num_frames} frames"
        )
        # Timestamps spaced uniformly over the video's duration.
        uniform_timestamps = [
            i * duration / num_frames_to_sample for i in range(num_frames_to_sample)
        ]

        for dims_fraction in fraction_of_input_dimensions:
            dims = (int(input_height * dims_fraction), int(input_width * dims_fraction))

            times = bench(
                torchvision_resize, path, uniform_timestamps, dims, num_exp=args.num_exp
            )
            report_stats(times, prefix=f"torchvision_resize({dims})")

            times = bench(
                decoder_native_resize,
                path,
                uniform_timestamps,
                dims,
                num_exp=args.num_exp,
            )
            report_stats(times, prefix=f"decoder_native_resize({dims})")
            print()

            center_x = (input_height - dims[0]) // 2
            center_y = (input_width - dims[1]) // 2
            times = bench(
                torchvision_crop,
                path,
                uniform_timestamps,
                dims,
                center_x,
                center_y,
                num_exp=args.num_exp,
            )
            report_stats(
                times, prefix=f"torchvision_crop({dims}, {center_x}, {center_y})"
            )

            times = bench(
                decoder_native_crop,
                path,
                uniform_timestamps,
                dims,
                center_x,
                center_y,
                num_exp=args.num_exp,
            )
            report_stats(
                times, prefix=f"decoder_native_crop({dims}, {center_x}, {center_y})"
            )
            print()


if __name__ == "__main__":
    main()
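
The decoder-native variants above push the resize or crop into the decoder itself through the transform_specs string, while the torchvision variants decode full frames first and transform them afterwards. As a minimal sketch of the decoder-native path, assuming the "resize, height, width" spec format used above (the video path, timestamps, and printed shape are placeholder assumptions, not part of this commit):

from torchcodec._core import add_video_stream, create_from_file, get_frames_by_pts

# Decode three frames that come back already resized to 540x960,
# using the same spec format benchmarked above.
decoder = create_from_file("video.mp4", seek_mode="approximate")  # placeholder path
add_video_stream(decoder, transform_specs="resize, 540, 960")
frames, *_ = get_frames_by_pts(decoder, timestamps=[0.0, 1.0, 2.0])
print(frames.shape)  # e.g. torch.Size([3, 3, 540, 960]) for an NCHW layout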

test/generate_reference_resources.py

Lines changed: 47 additions & 46 deletions
@@ -6,23 +6,20 @@
 
 import subprocess
 from pathlib import Path
+from typing import Optional
 
 import numpy as np
 
 import torch
 from PIL import Image
 
-from .utils import sanitize_filtergraph_expression
+from .utils import AV1_VIDEO, H265_VIDEO, NASA_VIDEO, TestVideo
 
 # Run this script to update the resources used in unit tests. The resources are all derived
 # from source media already checked into the repo.
 
-SCRIPT_DIR = Path(__file__).resolve().parent
-TORCHCODEC_PATH = SCRIPT_DIR.parent
-RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
 
-
-def convert_image_to_tensor(image_path):
+def convert_image_to_tensor(image_path: str) -> None:
     image_path = Path(image_path)
     if not image_path.exists():
         return
@@ -37,37 +34,56 @@ def convert_image_to_tensor(image_path):
     image_path.unlink()
 
 
-def get_frame_by_index(video_path, frame, output_path, stream, filters=None):
+def generate_frame_by_index(
+    video: TestVideo,
+    *,
+    frame_index: int,
+    stream_index: int,
+    filters: Optional[str] = None,
+) -> None:
+    # Note that we are using 0-based index naming. As a result, we are
+    # generating files one-by-one, giving the actual file name that we want.
+    # ffmpeg does have an option to generate multiple files for us, but it uses
+    # 1-based indexing. We can't use 1-based indexing because we want to match
+    # the 0-based indexing in our tests.
+    base_path = video.get_base_path_by_index(
+        frame_index, stream_index=stream_index, filters=filters
+    )
+    output_bmp = f"{base_path}.bmp"
+
     # Note that we have an explicit format conversion to rgb24 in our filtergraph specification,
     # which always happens BEFORE any of the filters that we receive as input. We do this to
     # ensure that the color conversion happens BEFORE the filters, matching the behavior of the
     # torchcodec filtergraph implementation.
     #
     # Not doing this would result in the color conversion happening AFTER the filters, which
     # would result in different color values for the same frame.
-    filtergraph = f"select='eq(n\\,{frame})',format=rgb24"
+    filtergraph = f"select='eq(n\\,{frame_index})',format=rgb24"
     if filters is not None:
         filtergraph = filtergraph + f",{filters}"
 
     cmd = [
         "ffmpeg",
         "-y",
         "-i",
-        video_path,
+        video.path,
         "-map",
-        f"0:{stream}",
+        f"0:{stream_index}",
         "-vf",
         filtergraph,
         "-fps_mode",
         "passthrough",
         "-update",
         "1",
-        output_path,
+        output_bmp,
     ]
     subprocess.run(cmd, check=True)
+    convert_image_to_tensor(output_bmp)
 
 
-def get_frame_by_timestamp(video_path, timestamp, output_path):
+def generate_frame_by_timestamp(
+    video_path: str, timestamp: float, output_path: str
+) -> None:
     cmd = [
         "ffmpeg",
         "-y",
@@ -80,40 +96,32 @@ def get_frame_by_timestamp(video_path, timestamp, output_path):
         output_path,
     ]
     subprocess.run(cmd, check=True)
+    convert_image_to_tensor(output_path)
 
 
 def generate_nasa_13013_references():
-    VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4"
-
     # Note: The naming scheme used here must match the naming scheme used to load
     # tensors in ./utils.py.
-    STREAMS = [0, 3]
-    FRAMES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389]
-    for stream in STREAMS:
-        for frame in FRAMES:
-            # Note that we are using 0-based index naming. Asking ffmpeg to number output
-            # frames would result in 1-based index naming. We enforce 0-based index naming
-            # so that the name of reference frames matches the index when accessing that
-            # frame in the Python decoder.
-            output_bmp = f"{VIDEO_PATH}.stream{stream}.frame{frame:06d}.bmp"
-            get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=stream)
-            convert_image_to_tensor(output_bmp)
+    streams = [0, 3]
+    frames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 25, 30, 35, 386, 387, 388, 389]
+    for stream in streams:
+        for frame in frames:
+            generate_frame_by_index(NASA_VIDEO, frame_index=frame, stream_index=stream)
 
     # Extract individual frames at specific timestamps, including the last frame of the video.
     seek_timestamp = [6.0, 6.1, 10.0, 12.979633]
     timestamp_name = [f"{seek_timestamp:06f}" for seek_timestamp in seek_timestamp]
     for timestamp, name in zip(seek_timestamp, timestamp_name):
-        output_bmp = f"{VIDEO_PATH}.time{name}.bmp"
-        get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp)
-        convert_image_to_tensor(output_bmp)
+        output_bmp = f"{NASA_VIDEO.path}.time{name}.bmp"
+        generate_frame_by_timestamp(NASA_VIDEO.path, timestamp, output_bmp)
 
     # Extract frames with specific filters. We have tests that assume these exact filters.
-    FRAMES = [0, 15, 200, 389]
+    frames = [0, 15, 200, 389]
     crop_filter = "crop=300:200:50:35:exact=1"
-    for frame in FRAMES:
-        output_bmp = f"{VIDEO_PATH}.{sanitize_filtergraph_expression(crop_filter)}.stream3.frame{frame:06d}.bmp"
-        get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter)
-        convert_image_to_tensor(output_bmp)
+    for frame in frames:
+        generate_frame_by_index(
+            NASA_VIDEO, frame_index=frame, stream_index=3, filters=crop_filter
+        )
 
 
 def generate_h265_video_references():
@@ -122,25 +130,18 @@
     # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
     # ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y
     # Note that this video only has 1 stream, at index 0.
-    VIDEO_PATH = RESOURCES_DIR / "h265_video.mp4"
-    FRAMES = [5]
-    for frame in FRAMES:
-        output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp"
-        get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
-        convert_image_to_tensor(output_bmp)
+    frames = [5]
+    for frame in frames:
+        generate_frame_by_index(H265_VIDEO, frame_index=frame, stream_index=0)
 
 
 def generate_av1_video_references():
    # This video was generated by running the following:
    # ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv
    # Note that this video only has 1 stream, at index 0.
-    VIDEO_PATH = RESOURCES_DIR / "av1_video.mkv"
-    FRAMES = [10]
-
-    for frame in FRAMES:
-        output_bmp = f"{VIDEO_PATH}.stream0.frame{frame:06d}.bmp"
-        get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
-        convert_image_to_tensor(output_bmp)
+    frames = [10]
+    for frame in frames:
+        generate_frame_by_index(AV1_VIDEO, frame_index=frame, stream_index=0)
 
 
 def main():
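
For illustration, here is a sketch of the argv list that generate_frame_by_index assembles for frame 15 of stream 3 with the crop filter above. The input path follows the removed RESOURCES_DIR convention, and the output name is a placeholder, since the real one now comes from video.get_base_path_by_index:

# Hypothetical expansion for frame_index=15, stream_index=3,
# filters="crop=300:200:50:35:exact=1"; paths are placeholders.
filtergraph = "select='eq(n\\,15)',format=rgb24,crop=300:200:50:35:exact=1"
cmd = [
    "ffmpeg",
    "-y",
    "-i", "test/resources/nasa_13013.mp4",
    "-map", "0:3",  # pick stream index 3 of the first input
    "-vf", filtergraph,  # rgb24 conversion runs before the crop
    "-fps_mode", "passthrough",
    "-update", "1",  # write a single image rather than a numbered sequence
    "output.bmp",  # placeholder for {base_path}.bmp
]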
