Skip to content

Commit a3b271a

Browse files
author
pytorchbot
committed
2024-10-31 nightly release (68939a9)
1 parent 639314d commit a3b271a

File tree

11 files changed

+182
-69
lines changed

11 files changed

+182
-69
lines changed

.github/workflows/linux_cuda_wheel.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ jobs:
9696
${CONDA_RUN} conda info
9797
${CONDA_RUN} nvidia-smi
9898
${CONDA_RUN} conda list
99+
- name: Assert ffmpeg exists
100+
run: |
101+
${CONDA_RUN} ffmpeg -buildconf
99102
- name: Update pip
100103
run: ${CONDA_RUN} python -m pip install --upgrade pip
101104
- name: Install PyTorch
Lines changed: 67 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import argparse
12
from pathlib import Path
23
from time import perf_counter_ns
34

@@ -45,51 +46,76 @@ def report_stats(times, num_frames, unit="ms"):
4546
return med, fps
4647

4748

48-
def sample(sampler, **kwargs):
49-
decoder = VideoDecoder(VIDEO_PATH)
49+
def sample(decoder, sampler, **kwargs):
    """Run ``sampler`` on ``decoder`` with a fixed clip length of 10 frames.

    Args:
        decoder: The decoder object handed to ``sampler`` as its first
            positional argument. It is built by the caller rather than here.
        sampler: Callable invoked as
            ``sampler(decoder, num_frames_per_clip=10, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``sampler``.

    Returns:
        Whatever ``sampler`` returns.
    """
    return sampler(
        decoder,
        num_frames_per_clip=10,
        **kwargs,
    )
5555

5656

57-
VIDEO_PATH = Path(__file__).parent / "../../test/resources/nasa_13013.mp4"
58-
NUM_EXP = 30
59-
60-
for num_clips in (1, 50):
61-
print("-" * 10)
62-
print(f"{num_clips = }")
63-
64-
print("clips_at_random_indices ", end="")
65-
times, num_frames = bench(
66-
sample, clips_at_random_indices, num_clips=num_clips, num_exp=NUM_EXP, warmup=2
67-
)
68-
report_stats(times, num_frames, unit="ms")
69-
70-
print("clips_at_regular_indices ", end="")
71-
times, num_frames = bench(
72-
sample, clips_at_regular_indices, num_clips=num_clips, num_exp=NUM_EXP, warmup=2
73-
)
74-
report_stats(times, num_frames, unit="ms")
75-
76-
print("clips_at_random_timestamps ", end="")
77-
times, num_frames = bench(
78-
sample,
79-
clips_at_random_timestamps,
80-
num_clips=num_clips,
81-
num_exp=NUM_EXP,
82-
warmup=2,
83-
)
84-
report_stats(times, num_frames, unit="ms")
85-
86-
print("clips_at_regular_timestamps ", end="")
87-
seconds_between_clip_starts = 13 / num_clips # approximate. video is 13s long
88-
times, num_frames = bench(
89-
sample,
90-
clips_at_regular_timestamps,
91-
seconds_between_clip_starts=seconds_between_clip_starts,
92-
num_exp=NUM_EXP,
93-
warmup=2,
94-
)
95-
report_stats(times, num_frames, unit="ms")
57+
def run_sampler_benchmarks(device, video):
    """Benchmark the four clip samplers on ``video`` decoded on ``device``.

    For each clip count in ``(1, 50)`` one ``VideoDecoder`` is created and
    shared by all four samplers. Every sampler is run through ``bench`` (30
    experiments, 2 warm-up runs) and summarised with ``report_stats``.

    Args:
        device: Device forwarded to ``VideoDecoder(video, device=device)``.
        video: Video source forwarded to ``VideoDecoder``.

    Raises:
        ValueError: If the decoder's metadata does not expose a duration, which
            is needed to space the regular-timestamp clips.
    """
    NUM_EXP = 30
    WARMUP = 2

    for num_clips in (1, 50):
        print("-" * 10)
        print(f"{num_clips = }")

        # The decoder is created here, outside of ``sample``, and reused by
        # every sampler below. NOTE(review): presumably so that decoder
        # construction is not part of what ``bench`` measures -- confirm.
        decoder = VideoDecoder(video, device=device)

        # Spread the regular-timestamp clips over the whole stream. The
        # duration comes from the decoder instead of a hard-coded 13 seconds
        # because ``video`` is caller-supplied and may have any length.
        duration_seconds = decoder.metadata.duration_seconds
        if duration_seconds is None:
            raise ValueError(
                f"Cannot determine the duration of {video!r}; it is required "
                "to benchmark clips_at_regular_timestamps."
            )

        # (label, sampler, sampler-specific keyword arguments)
        benchmarks = (
            (
                "clips_at_random_indices ",
                clips_at_random_indices,
                {"num_clips": num_clips},
            ),
            (
                "clips_at_regular_indices ",
                clips_at_regular_indices,
                {"num_clips": num_clips},
            ),
            (
                "clips_at_random_timestamps ",
                clips_at_random_timestamps,
                {"num_clips": num_clips},
            ),
            (
                "clips_at_regular_timestamps ",
                clips_at_regular_timestamps,
                {"seconds_between_clip_starts": duration_seconds / num_clips},
            ),
        )

        for label, sampler, sampler_kwargs in benchmarks:
            # No newline: report_stats is called right after and its output is
            # expected to follow the label.
            print(label, end="")
            times, num_frames = bench(
                sample,
                decoder,
                sampler,
                num_exp=NUM_EXP,
                warmup=WARMUP,
                **sampler_kwargs,
            )
            report_stats(times, num_frames, unit="ms")
109+
110+
111+
def main():
    """Parse the command-line options and run the sampler benchmarks.

    Options:
        --device: Device the video is decoded on. Defaults to ``"cpu"``.
        --video: Path of the video to benchmark. Defaults to the
            ``nasa_13013.mp4`` test resource located relative to this file.
    """
    DEFAULT_VIDEO_PATH = Path(__file__).parent / "../../test/resources/nasa_13013.mp4"
    parser = argparse.ArgumentParser(
        description="Benchmark the clip samplers on a video."
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help='Device to decode on, e.g. "cpu" or "cuda" (default: %(default)s).',
    )
    parser.add_argument(
        "--video",
        type=str,
        default=str(DEFAULT_VIDEO_PATH),
        help="Path to the video file to benchmark (default: the bundled NASA test video).",
    )
    args = parser.parse_args()
    run_sampler_benchmarks(args.device, args.video)


if __name__ == "__main__":
    main()

src/torchcodec/decoders/_core/CPUOnlyDevice.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ void convertAVFrameToDecodedOutputOnCuda(
1919
const VideoDecoder::VideoStreamDecoderOptions& options,
2020
AVCodecContext* codecContext,
2121
VideoDecoder::RawDecodedOutput& rawOutput,
22-
VideoDecoder::DecodedOutput& output) {
22+
VideoDecoder::DecodedOutput& output,
23+
std::optional<torch::Tensor> preAllocatedOutputTensor) {
2324
throwUnsupportedDeviceError(device);
2425
}
2526

src/torchcodec/decoders/_core/CudaDevice.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,8 @@ void convertAVFrameToDecodedOutputOnCuda(
201201
const VideoDecoder::VideoStreamDecoderOptions& options,
202202
AVCodecContext* codecContext,
203203
VideoDecoder::RawDecodedOutput& rawOutput,
204-
VideoDecoder::DecodedOutput& output) {
204+
VideoDecoder::DecodedOutput& output,
205+
std::optional<torch::Tensor> preAllocatedOutputTensor) {
205206
AVFrame* src = rawOutput.frame.get();
206207

207208
TORCH_CHECK(
@@ -213,7 +214,21 @@ void convertAVFrameToDecodedOutputOnCuda(
213214
NppiSize oSizeROI = {width, height};
214215
Npp8u* input[2] = {src->data[0], src->data[1]};
215216
torch::Tensor& dst = output.frame;
216-
dst = allocateDeviceTensor({height, width, 3}, options.device);
217+
if (preAllocatedOutputTensor.has_value()) {
218+
dst = preAllocatedOutputTensor.value();
219+
auto shape = dst.sizes();
220+
TORCH_CHECK(
221+
(shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
222+
(shape[2] == 3),
223+
"Expected tensor of shape ",
224+
height,
225+
"x",
226+
width,
227+
"x3, got ",
228+
shape);
229+
} else {
230+
dst = allocateDeviceTensor({height, width, 3}, options.device);
231+
}
217232

218233
// Use the user-requested GPU for running the NPP kernel.
219234
c10::cuda::CUDAGuard deviceGuard(device);

src/torchcodec/decoders/_core/DeviceInterface.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ void convertAVFrameToDecodedOutputOnCuda(
3737
const VideoDecoder::VideoStreamDecoderOptions& options,
3838
AVCodecContext* codecContext,
3939
VideoDecoder::RawDecodedOutput& rawOutput,
40-
VideoDecoder::DecodedOutput& output);
40+
VideoDecoder::DecodedOutput& output,
41+
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
4142

4243
void releaseContextOnCuda(
4344
const torch::Device& device,

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
196196
options.height.value_or(*metadata.height),
197197
options.width.value_or(*metadata.width),
198198
3},
199-
{torch::kUInt8})),
199+
at::TensorOptions(options.device).dtype(torch::kUInt8))),
200200
ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
201201
durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {}
202202

@@ -855,17 +855,18 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
855855
output.duration = getDuration(frame);
856856
output.durationSeconds = ptsToSeconds(
857857
getDuration(frame), formatContext_->streams[streamIndex]->time_base);
858+
// TODO: we should fold preAllocatedOutputTensor into RawDecodedOutput.
858859
if (streamInfo.options.device.type() == torch::kCPU) {
859860
convertAVFrameToDecodedOutputOnCPU(
860861
rawOutput, output, preAllocatedOutputTensor);
861862
} else if (streamInfo.options.device.type() == torch::kCUDA) {
862-
// TODO: handle pre-allocated output tensor
863863
convertAVFrameToDecodedOutputOnCuda(
864864
streamInfo.options.device,
865865
streamInfo.options,
866866
streamInfo.codecContext.get(),
867867
rawOutput,
868-
output);
868+
output,
869+
preAllocatedOutputTensor);
869870
} else {
870871
TORCH_CHECK(
871872
false, "Invalid device type: " + streamInfo.options.device.str());
@@ -1007,10 +1008,8 @@ void VideoDecoder::validateFrameIndex(
10071008

10081009
VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
10091010
int streamIndex,
1010-
int64_t frameIndex,
1011-
std::optional<torch::Tensor> preAllocatedOutputTensor) {
1012-
auto output = getFrameAtIndexInternal(
1013-
streamIndex, frameIndex, preAllocatedOutputTensor);
1011+
int64_t frameIndex) {
1012+
auto output = getFrameAtIndexInternal(streamIndex, frameIndex);
10141013
output.frame = MaybePermuteHWC2CHW(streamIndex, output.frame);
10151014
return output;
10161015
}

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,11 @@ class VideoDecoder {
225225
// seconds=5.999, etc.
226226
DecodedOutput getFramePlayedAtTimestampNoDemux(double seconds);
227227

228-
DecodedOutput getFrameAtIndex(
228+
DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex);
229+
// This is morally private but needs to be exposed for C++ tests. Once
230+
// getFrameAtIndex supports the preAllocatedOutputTensor parameter, we can
231+
// move it back to private.
232+
DecodedOutput getFrameAtIndexInternal(
229233
int streamIndex,
230234
int64_t frameIndex,
231235
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
@@ -387,10 +391,6 @@ class VideoDecoder {
387391
DecodedOutput& output,
388392
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
389393

390-
DecodedOutput getFrameAtIndexInternal(
391-
int streamIndex,
392-
int64_t frameIndex,
393-
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
394394
DecodedOutput getNextFrameOutputNoDemuxInternal(
395395
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
396396

src/torchcodec/decoders/_video_decoder.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import Literal, Optional, Tuple, Union
1010

11-
from torch import Tensor
11+
from torch import device, Tensor
1212

1313
from torchcodec import Frame, FrameBatch
1414
from torchcodec.decoders import _core as core
@@ -36,19 +36,20 @@ class VideoDecoder:
3636
This can be either "NCHW" (default) or "NHWC", where N is the batch
3737
size, C is the number of channels, H is the height, and W is the
3838
width of the frames.
39-
num_ffmpeg_threads (int, optional): The number of threads to use for decoding.
40-
Use 1 for single-threaded decoding which may be best if you are running multiple
41-
instances of ``VideoDecoder`` in parallel. Use a higher number for multi-threaded
42-
decoding which is best if you are running a single instance of ``VideoDecoder``.
43-
Default: 1.
44-
4539
.. note::
4640
4741
Frames are natively decoded in NHWC format by the underlying
4842
FFmpeg implementation. Converting those into NCHW format is a
4943
cheap no-copy operation that allows these frames to be
5044
transformed using the `torchvision transforms
5145
<https://pytorch.org/vision/stable/transforms.html>`_.
46+
num_ffmpeg_threads (int, optional): The number of threads to use for decoding.
47+
Use 1 for single-threaded decoding which may be best if you are running multiple
48+
instances of ``VideoDecoder`` in parallel. Use a higher number for multi-threaded
49+
decoding which is best if you are running a single instance of ``VideoDecoder``.
50+
Default: 1.
51+
device (str or torch.device, optional): The device to use for decoding. Default: "cpu".
52+
5253
5354
Attributes:
5455
metadata (VideoStreamMetadata): Metadata of the video stream.
@@ -64,6 +65,7 @@ def __init__(
6465
stream_index: Optional[int] = None,
6566
dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
6667
num_ffmpeg_threads: int = 1,
68+
device: Optional[Union[str, device]] = "cpu",
6769
):
6870
if isinstance(source, str):
6971
self._decoder = core.create_from_file(source)
@@ -92,6 +94,7 @@ def __init__(
9294
stream_index=stream_index,
9395
dimension_order=dimension_order,
9496
num_threads=num_ffmpeg_threads,
97+
device=device,
9598
)
9699

97100
self.metadata, self.stream_index = _get_and_validate_stream_metadata(

test/decoders/VideoDecoderTest.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorFilterGraph) {
400400
bestVideoStreamIndex,
401401
VideoDecoder::VideoStreamDecoderOptions(
402402
"color_conversion_library=filtergraph"));
403-
auto output = ourDecoder->getFrameAtIndex(
403+
auto output = ourDecoder->getFrameAtIndexInternal(
404404
bestVideoStreamIndex, 0, preAllocatedOutputTensor);
405405
EXPECT_EQ(output.frame.data_ptr(), preAllocatedOutputTensor.data_ptr());
406406
}
@@ -418,7 +418,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorSwscale) {
418418
bestVideoStreamIndex,
419419
VideoDecoder::VideoStreamDecoderOptions(
420420
"color_conversion_library=swscale"));
421-
auto output = ourDecoder->getFrameAtIndex(
421+
auto output = ourDecoder->getFrameAtIndexInternal(
422422
bestVideoStreamIndex, 0, preAllocatedOutputTensor);
423423
EXPECT_EQ(output.frame.data_ptr(), preAllocatedOutputTensor.data_ptr());
424424
}

0 commit comments

Comments
 (0)