2024-10-31 nightly release (68939a9)

pytorchbot · pytorchbot · commit a3b271acaf77 · 2024-10-31T11:35:27.000Z
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
@@ -96,6 +96,9 @@ jobs:
           ${CONDA_RUN} conda info
           ${CONDA_RUN} nvidia-smi
           ${CONDA_RUN} conda list
+      - name: Assert ffmpeg exists
+        run: |
+          ${CONDA_RUN} ffmpeg -buildconf
       - name: Update pip
         run: ${CONDA_RUN} python -m pip install --upgrade pip
       - name: Install PyTorch
diff --git a/benchmarks/samplers/benchmark_samplers.py b/benchmarks/samplers/benchmark_samplers.py
@@ -1,3 +1,4 @@
+import argparse
 from pathlib import Path
 from time import perf_counter_ns
 
@@ -45,51 +46,76 @@ def report_stats(times, num_frames, unit="ms"):
     return med, fps
 
 
-def sample(sampler, **kwargs):
-    decoder = VideoDecoder(VIDEO_PATH)
+def sample(decoder, sampler, **kwargs):
     return sampler(
         decoder,
         num_frames_per_clip=10,
         **kwargs,
     )
 
 
-VIDEO_PATH = Path(__file__).parent / "../../test/resources/nasa_13013.mp4"
-NUM_EXP = 30
-
-for num_clips in (1, 50):
-    print("-" * 10)
-    print(f"{num_clips = }")
-
-    print("clips_at_random_indices     ", end="")
-    times, num_frames = bench(
-        sample, clips_at_random_indices, num_clips=num_clips, num_exp=NUM_EXP, warmup=2
-    )
-    report_stats(times, num_frames, unit="ms")
-
-    print("clips_at_regular_indices    ", end="")
-    times, num_frames = bench(
-        sample, clips_at_regular_indices, num_clips=num_clips, num_exp=NUM_EXP, warmup=2
-    )
-    report_stats(times, num_frames, unit="ms")
-
-    print("clips_at_random_timestamps  ", end="")
-    times, num_frames = bench(
-        sample,
-        clips_at_random_timestamps,
-        num_clips=num_clips,
-        num_exp=NUM_EXP,
-        warmup=2,
-    )
-    report_stats(times, num_frames, unit="ms")
-
-    print("clips_at_regular_timestamps ", end="")
-    seconds_between_clip_starts = 13 / num_clips  # approximate. video is 13s long
-    times, num_frames = bench(
-        sample,
-        clips_at_regular_timestamps,
-        seconds_between_clip_starts=seconds_between_clip_starts,
-        num_exp=NUM_EXP,
-        warmup=2,
-    )
-    report_stats(times, num_frames, unit="ms")
+def run_sampler_benchmarks(device, video):
+    NUM_EXP = 30
+
+    for num_clips in (1, 50):
+        print("-" * 10)
+        print(f"{num_clips = }")
+
+        print("clips_at_random_indices     ", end="")
+        decoder = VideoDecoder(video, device=device)
+        times, num_frames = bench(
+            sample,
+            decoder,
+            clips_at_random_indices,
+            num_clips=num_clips,
+            num_exp=NUM_EXP,
+            warmup=2,
+        )
+        report_stats(times, num_frames, unit="ms")
+
+        print("clips_at_regular_indices    ", end="")
+        times, num_frames = bench(
+            sample,
+            decoder,
+            clips_at_regular_indices,
+            num_clips=num_clips,
+            num_exp=NUM_EXP,
+            warmup=2,
+        )
+        report_stats(times, num_frames, unit="ms")
+
+        print("clips_at_random_timestamps  ", end="")
+        times, num_frames = bench(
+            sample,
+            decoder,
+            clips_at_random_timestamps,
+            num_clips=num_clips,
+            num_exp=NUM_EXP,
+            warmup=2,
+        )
+        report_stats(times, num_frames, unit="ms")
+
+        print("clips_at_regular_timestamps ", end="")
+        seconds_between_clip_starts = 13 / num_clips  # approximate. video is 13s long
+        times, num_frames = bench(
+            sample,
+            decoder,
+            clips_at_regular_timestamps,
+            seconds_between_clip_starts=seconds_between_clip_starts,
+            num_exp=NUM_EXP,
+            warmup=2,
+        )
+        report_stats(times, num_frames, unit="ms")
+
+
+def main():
+    DEFAULT_VIDEO_PATH = Path(__file__).parent / "../../test/resources/nasa_13013.mp4"
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", type=str, default="cpu")
+    parser.add_argument("--video", type=str, default=str(DEFAULT_VIDEO_PATH))
+    args = parser.parse_args()
+    run_sampler_benchmarks(args.device, args.video)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp b/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp
@@ -19,7 +19,8 @@ void convertAVFrameToDecodedOutputOnCuda(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     AVCodecContext* codecContext,
     VideoDecoder::RawDecodedOutput& rawOutput,
-    VideoDecoder::DecodedOutput& output) {
+    VideoDecoder::DecodedOutput& output,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   throwUnsupportedDeviceError(device);
 }
 
diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp
@@ -201,7 +201,8 @@ void convertAVFrameToDecodedOutputOnCuda(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     AVCodecContext* codecContext,
     VideoDecoder::RawDecodedOutput& rawOutput,
-    VideoDecoder::DecodedOutput& output) {
+    VideoDecoder::DecodedOutput& output,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   AVFrame* src = rawOutput.frame.get();
 
   TORCH_CHECK(
@@ -213,7 +214,21 @@ void convertAVFrameToDecodedOutputOnCuda(
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {src->data[0], src->data[1]};
   torch::Tensor& dst = output.frame;
-  dst = allocateDeviceTensor({height, width, 3}, options.device);
+  if (preAllocatedOutputTensor.has_value()) {
+    dst = preAllocatedOutputTensor.value();
+    auto shape = dst.sizes();
+    TORCH_CHECK(
+        (shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
+            (shape[2] == 3),
+        "Expected tensor of shape ",
+        height,
+        "x",
+        width,
+        "x3, got ",
+        shape);
+  } else {
+    dst = allocateDeviceTensor({height, width, 3}, options.device);
+  }
 
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device);
diff --git a/src/torchcodec/decoders/_core/DeviceInterface.h b/src/torchcodec/decoders/_core/DeviceInterface.h
@@ -37,7 +37,8 @@ void convertAVFrameToDecodedOutputOnCuda(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     AVCodecContext* codecContext,
     VideoDecoder::RawDecodedOutput& rawOutput,
-    VideoDecoder::DecodedOutput& output);
+    VideoDecoder::DecodedOutput& output,
+    std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
 void releaseContextOnCuda(
     const torch::Device& device,
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -196,7 +196,7 @@ VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
            options.height.value_or(*metadata.height),
            options.width.value_or(*metadata.width),
            3},
-          {torch::kUInt8})),
+          at::TensorOptions(options.device).dtype(torch::kUInt8))),
       ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
       durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {}
 
@@ -855,17 +855,18 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
   output.duration = getDuration(frame);
   output.durationSeconds = ptsToSeconds(
       getDuration(frame), formatContext_->streams[streamIndex]->time_base);
+  // TODO: we should fold preAllocatedOutputTensor into RawDecodedOutput.
   if (streamInfo.options.device.type() == torch::kCPU) {
     convertAVFrameToDecodedOutputOnCPU(
         rawOutput, output, preAllocatedOutputTensor);
   } else if (streamInfo.options.device.type() == torch::kCUDA) {
-    // TODO: handle pre-allocated output tensor
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
         streamInfo.codecContext.get(),
         rawOutput,
-        output);
+        output,
+        preAllocatedOutputTensor);
   } else {
     TORCH_CHECK(
         false, "Invalid device type: " + streamInfo.options.device.str());
@@ -1007,10 +1008,8 @@ void VideoDecoder::validateFrameIndex(
 
 VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
     int streamIndex,
-    int64_t frameIndex,
-    std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  auto output = getFrameAtIndexInternal(
-      streamIndex, frameIndex, preAllocatedOutputTensor);
+    int64_t frameIndex) {
+  auto output = getFrameAtIndexInternal(streamIndex, frameIndex);
   output.frame = MaybePermuteHWC2CHW(streamIndex, output.frame);
   return output;
 }
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -225,7 +225,11 @@ class VideoDecoder {
   // seconds=5.999, etc.
   DecodedOutput getFramePlayedAtTimestampNoDemux(double seconds);
 
-  DecodedOutput getFrameAtIndex(
+  DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex);
+  // This is morally private but needs to be exposed for C++ tests. Once
+  // getFrameAtIndex supports the preAllocatedOutputTensor parameter, we can
+  // move it back to private.
+  DecodedOutput getFrameAtIndexInternal(
       int streamIndex,
       int64_t frameIndex,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
@@ -387,10 +391,6 @@ class VideoDecoder {
       DecodedOutput& output,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
-  DecodedOutput getFrameAtIndexInternal(
-      int streamIndex,
-      int64_t frameIndex,
-      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
   DecodedOutput getNextFrameOutputNoDemuxInternal(
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 from typing import Literal, Optional, Tuple, Union
 
-from torch import Tensor
+from torch import device, Tensor
 
 from torchcodec import Frame, FrameBatch
 from torchcodec.decoders import _core as core
@@ -36,19 +36,20 @@ class VideoDecoder:
             This can be either "NCHW" (default) or "NHWC", where N is the batch
             size, C is the number of channels, H is the height, and W is the
             width of the frames.
-        num_ffmpeg_threads (int, optional): The number of threads to use for decoding.
-            Use 1 for single-threaded decoding which may be best if you are running multiple
-            instances of ``VideoDecoder`` in parallel. Use a higher number for multi-threaded
-            decoding which is best if you are running a single instance of ``VideoDecoder``.
-            Default: 1.
-
             .. note::
 
                 Frames are natively decoded in NHWC format by the underlying
                 FFmpeg implementation. Converting those into NCHW format is a
                 cheap no-copy operation that allows these frames to be
                 transformed using the `torchvision transforms
                 <https://pytorch.org/vision/stable/transforms.html>`_.
+        num_ffmpeg_threads (int, optional): The number of threads to use for decoding.
+            Use 1 for single-threaded decoding which may be best if you are running multiple
+            instances of ``VideoDecoder`` in parallel. Use a higher number for multi-threaded
+            decoding which is best if you are running a single instance of ``VideoDecoder``.
+            Default: 1.
+        device (str or torch.device, optional): The device to use for decoding. Default: "cpu".
+
 
     Attributes:
         metadata (VideoStreamMetadata): Metadata of the video stream.
@@ -64,6 +65,7 @@ def __init__(
         stream_index: Optional[int] = None,
         dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
         num_ffmpeg_threads: int = 1,
+        device: Optional[Union[str, device]] = "cpu",
     ):
         if isinstance(source, str):
             self._decoder = core.create_from_file(source)
@@ -92,6 +94,7 @@ def __init__(
             stream_index=stream_index,
             dimension_order=dimension_order,
             num_threads=num_ffmpeg_threads,
+            device=device,
         )
 
         self.metadata, self.stream_index = _get_and_validate_stream_metadata(
diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp
@@ -400,7 +400,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorFilterGraph) {
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamDecoderOptions(
           "color_conversion_library=filtergraph"));
-  auto output = ourDecoder->getFrameAtIndex(
+  auto output = ourDecoder->getFrameAtIndexInternal(
       bestVideoStreamIndex, 0, preAllocatedOutputTensor);
   EXPECT_EQ(output.frame.data_ptr(), preAllocatedOutputTensor.data_ptr());
 }
@@ -418,7 +418,7 @@ TEST_P(VideoDecoderTest, PreAllocatedTensorSwscale) {
       bestVideoStreamIndex,
       VideoDecoder::VideoStreamDecoderOptions(
           "color_conversion_library=swscale"));
-  auto output = ourDecoder->getFrameAtIndex(
+  auto output = ourDecoder->getFrameAtIndexInternal(
       bestVideoStreamIndex, 0, preAllocatedOutputTensor);
   EXPECT_EQ(output.frame.data_ptr(), preAllocatedOutputTensor.data_ptr());
 }
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
diff --git a/test/utils.py b/test/utils.py

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,8 @@ void convertAVFrameToDecodedOutputOnCuda(`
`19`	`19`	`const VideoDecoder::VideoStreamDecoderOptions& options,`
`20`	`20`	`AVCodecContext* codecContext,`
`21`	`21`	`VideoDecoder::RawDecodedOutput& rawOutput,`
`22`		`- VideoDecoder::DecodedOutput& output) {`
	`22`	`+ VideoDecoder::DecodedOutput& output,`
	`23`	`+ std::optional<torch::Tensor> preAllocatedOutputTensor) {`
`23`	`24`	`throwUnsupportedDeviceError(device);`
`24`	`25`	`}`
`25`	`26`