meta-pytorch
diff --git a/‎.github/ISSUE_TEMPLATE/bug-report.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/ISSUE_TEMPLATE/bug-report.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/cuda_tests.yaml‎
Lines changed: 76 additions & 0 deletions b/‎.github/workflows/cuda_tests.yaml‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎.github/workflows/linux_wheel.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/linux_wheel.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmarks/decoders/BenchmarkDecodersMain.cpp‎
Lines changed: 2 additions & 1 deletion b/‎benchmarks/decoders/BenchmarkDecodersMain.cpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎benchmarks/decoders/benchmark_decoders.py‎
Lines changed: 9 additions & 9 deletions b/‎benchmarks/decoders/benchmark_decoders.py‎
Lines changed: 9 additions & 9 deletions
diff --git a/‎benchmarks/decoders/gpu_benchmark.py‎
Lines changed: 203 additions & 0 deletions b/‎benchmarks/decoders/gpu_benchmark.py‎
Lines changed: 203 additions & 0 deletions
@@ -18,10 +18,10 @@ body:
       # All necessary imports at the beginning
       import torch
       import torchcodec
-      from torchcodec.decoders import SimpleVideoDecoder
+      from torchcodec.decoders import VideoDecoder
 
       # A succinct reproducing example trimmed down to the essential parts:
-      decoder = SimpleVideoDecoder("path/to/video.mp4")  # Help! This fails!
+      decoder = VideoDecoder("path/to/video.mp4")  # Help! This fails!
       # ...
       ```
 
 
@@ -0,0 +1,76 @@
+# CI workflow for the CUDA code path: builds FFmpeg from source with CUDA
+# enabled, builds TorchCodec against it, then runs the test suite and the GPU
+# decode benchmark.
+name: Test on Linux CUDA
+
+# Triggered by pull requests, by pushes to the branches listed below, and by
+# manual dispatch.
+on:
+  pull_request:
+  push:
+    branches:
+      - nightly
+      - main
+      - release/*
+  workflow_dispatch:
+
+jobs:
+  tests:
+    # One job per (python, cuda, ffmpeg) combination. fail-fast is disabled so
+    # a failure with one FFmpeg release does not cancel the other combinations.
+    strategy:
+      matrix:
+        python_version: ["3.9"]
+        # TODO: Add more cuda versions.
+        # NOTE: the pip index URL in the script below is hard-coded to cu124,
+        # so it has to be updated together with this list.
+        cuda_arch_version: ["12.4"]
+        # TODO: Get ffmpeg 4 to work. Currently fails to build with nvcc.
+        # Each entry is used verbatim as the `git checkout` target in the
+        # FFmpeg clone made by the script below.
+        ffmpeg_version: ["origin/release/5.1", "origin/release/6.1", "origin/release/7.1"]
+      fail-fast: false
+    # The job itself is delegated to the reusable linux_job workflow from
+    # pytorch/test-infra; the keys under `with` are that workflow's inputs.
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      repository: pytorch/torchcodec
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda_arch_version }}
+      timeout: 120
+
+      # Steps, in order: install prerequisites into a fresh conda env, build
+      # nv-codec-headers and FFmpeg into $CONDA_PREFIX, build TorchCodec with
+      # ENABLE_CUDA=1, run pytest, then run the GPU benchmark single-threaded
+      # and multi-threaded.
+      script: |
+        echo '::group::Install prereqs'
+        nvidia-smi
+        conda create --yes --name test python=${{ matrix.python_version }}
+        conda activate test
+        conda install --quiet --yes pip cmake pkg-config nasm
+        pip install --quiet --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
+        conda install --quiet --yes nvidia::libnpp
+        echo '::endgroup::'
+
+        echo '::group::Build FFMPEG'
+        # Build and install FFMPEG from source with CUDA enabled.
+        # The one on conda doesn't have CUDA enabled.
+        # Sub-step: install nvidia headers. Reference this link for details:
+        # https://docs.nvidia.com/video-technologies/video-codec-sdk/12.1/ffmpeg-with-nvidia-gpu/index.html
+        git clone --quiet https://git.videolan.org/git/ffmpeg/nv-codec-headers.git
+
+        pushd nv-codec-headers
+        make --silent PREFIX=$CONDA_PREFIX -j install
+        popd
+
+        # Now build FFMPEG from source with CUDA enabled.
+        git clone --quiet https://git.ffmpeg.org/ffmpeg.git ffmpeg/
+        pushd ffmpeg
+        git checkout ${{ matrix.ffmpeg_version }}
+        which pkg-config
+        pkg-config --list-all
+        ./configure --prefix=$CONDA_PREFIX --enable-nonfree --enable-cuda-nvcc --disable-static --enable-shared --optflags=-fno-omit-frame-pointer --disable-stripping --enable-cuvid --enable-rpath
+        make --silent -j install
+        popd
+        echo '::endgroup::'
+
+        echo '::group::Build TorchCodec'
+        CMAKE_BUILD_PARALLEL_LEVEL=8 CXXFLAGS="" LDFLAGS="-Wl,--allow-shlib-undefined -Wl,-rpath,$CONDA_PREFIX/lib -Wl,-rpath-link,$CONDA_PREFIX/lib -L$CONDA_PREFIX/lib" CMAKE_BUILD_TYPE=Release ENABLE_CUDA=1 ENABLE_NVTX=1 pip install -e ".[dev]" --no-build-isolation -vv --debug
+        echo '::endgroup::'
+
+        echo '::group::Test TorchCodec'
+        # Ensure our compiled ffmpeg binary is in the path.
+        which ffmpeg
+        # We skip certain tests because they are not relevant to GPU decoding and they always fail with
+        # a custom FFMPEG build.
+        pytest -k "not (test_get_metadata or get_ffmpeg_version)" -vvv
+        echo '::endgroup::'
+
+        python benchmarks/decoders/gpu_benchmark.py --devices=cuda:0 --resize_devices=none
+        python benchmarks/decoders/gpu_benchmark.py --devices=cuda:0 --resize_devices=none --num_threads=5 --num_videos=100
+        conda deactivate
@@ -124,4 +124,4 @@ jobs:
           python test/decoders/manual_smoke_test.py
       - name: Run Python tests
         run: |
-          pytest test
+          pytest test -vvv
@@ -31,9 +31,9 @@ detailed example, [check out our
 documentation](https://pytorch.org/torchcodec/stable/generated_examples/)!
 
 ```python
-from torchcodec.decoders import SimpleVideoDecoder
+from torchcodec.decoders import VideoDecoder
 
-decoder = SimpleVideoDecoder("path/to/video.mp4")
+decoder = VideoDecoder("path/to/video.mp4")
 
 decoder.metadata
 # VideoStreamMetadata:
 
@@ -145,7 +145,8 @@ void runNDecodeIterationsWithCustomOps(
         /*height=*/std::nullopt,
         /*thread_count=*/std::nullopt,
         /*dimension_order=*/std::nullopt,
-        /*stream_index=*/std::nullopt);
+        /*stream_index=*/std::nullopt,
+        /*device=*/std::nullopt);
 
     for (double pts : ptsList) {
       seekFrameOp.call(decoderTensor, pts);
 
@@ -10,10 +10,11 @@
 import json
 import os
 import timeit
+from pathlib import Path
 
 import torch
 import torch.utils.benchmark as benchmark
-from torchcodec.decoders import SimpleVideoDecoder
+from torchcodec.decoders import VideoDecoder
 
 from torchcodec.decoders._core import (
     _add_video_stream,
@@ -206,10 +207,10 @@ def get_frames_from_video(self, video_file, pts_list):
         metadata = json.loads(get_json_metadata(decoder))
         average_fps = metadata["averageFps"]
         best_video_stream = metadata["bestVideoStreamIndex"]
-        indexes_list = [int(pts * average_fps) for pts in pts_list]
+        indices_list = [int(pts * average_fps) for pts in pts_list]
         frames = []
         frames = get_frames_at_indices(
-            decoder, stream_index=best_video_stream, frame_indices=indexes_list
+            decoder, stream_index=best_video_stream, frame_indices=indices_list
         )
         return frames
 
@@ -303,9 +304,8 @@ def get_test_resource_path(filename: str) -> str:
         resource = importlib.resources.files(__package__).joinpath(filename)
         with importlib.resources.as_file(resource) as path:
             return os.fspath(path)
-    return os.path.join(
-        os.path.dirname(__file__), "..", "..", "test", "resources", filename
-    )
+
+    return str(Path(__file__).parent / f"../../test/resources/{filename}")
 
 
 def create_torchcodec_decoder_from_file(video_file):
@@ -404,9 +404,9 @@ def main() -> None:
     results = []
     for decoder_name, decoder in decoder_dict.items():
         for video_path in args.bm_video_paths.split(","):
-            # We only use the SimpleVideoDecoder to get the metadata and get
+            # We only use the VideoDecoder to get the metadata and get
             # the list of PTS values to seek to.
-            simple_decoder = SimpleVideoDecoder(video_path)
+            simple_decoder = VideoDecoder(video_path)
             duration = simple_decoder.metadata.duration_seconds
             pts_list = [
                 i * duration / num_uniform_samples for i in range(num_uniform_samples)
@@ -453,7 +453,7 @@ def main() -> None:
 
     first_video_path = args.bm_video_paths.split(",")[0]
     if args.bm_video_creation:
-        simple_decoder = SimpleVideoDecoder(first_video_path)
+        simple_decoder = VideoDecoder(first_video_path)
         metadata = simple_decoder.metadata
         metadata_string = f"{metadata.codec} {metadata.width}x{metadata.height}, {metadata.duration_seconds}s {metadata.average_fps}fps"
         creation_result = benchmark.Timer(
 
@@ -0,0 +1,203 @@
+import argparse
+import time
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import torch
+
+import torch.utils.benchmark as benchmark
+
+import torchcodec
+import torchvision.transforms.v2.functional as F
+
+# Target width and height shared by both resize paths in this benchmark: the
+# torchvision resize applied after decoding and the decoder-native resize.
+RESIZED_WIDTH = 256
+RESIZED_HEIGHT = 256
+
+
+def transfer_and_resize_frame(frame, resize_device_string):
+    # This should be a no-op if the frame is already on the target device.
+    frame = frame.to(resize_device_string)
+    frame = F.resize(frame, (RESIZED_HEIGHT, RESIZED_WIDTH))
+    return frame
+
+
+def decode_full_video(video_path, decode_device_string, resize_device_string):
+    # We use the core API instead of SimpleVideoDecoder because the core API
+    # allows us to natively resize as part of the decode step.
+    print(f"{decode_device_string=} {resize_device_string=}")
+    decoder = torchcodec.decoders._core.create_from_file(video_path)
+    num_threads = None
+    if "cuda" in decode_device_string:
+        num_threads = 1
+    width = None
+    height = None
+    if "native" in resize_device_string:
+        width = RESIZED_WIDTH
+        height = RESIZED_HEIGHT
+    torchcodec.decoders._core._add_video_stream(
+        decoder,
+        stream_index=-1,
+        device=decode_device_string,
+        num_threads=num_threads,
+        width=width,
+        height=height,
+    )
+
+    start_time = time.time()
+    frame_count = 0
+    while True:
+        try:
+            frame, *_ = torchcodec.decoders._core.get_next_frame(decoder)
+            if resize_device_string != "none" and "native" not in resize_device_string:
+                frame = transfer_and_resize_frame(frame, resize_device_string)
+
+            frame_count += 1
+        except Exception as e:
+            print("EXCEPTION", e)
+            break
+
+    end_time = time.time()
+    elapsed = end_time - start_time
+    fps = frame_count / (end_time - start_time)
+    print(
+        f"****** DECODED full video {decode_device_string=} {frame_count=} {elapsed=} {fps=}"
+    )
+    return frame_count, end_time - start_time
+
+
+def decode_videos_using_threads(
+    video_path,
+    decode_device_string,
+    resize_device_string,
+    num_videos,
+    num_threads,
+    use_multiple_gpus,
+):
+    executor = ThreadPoolExecutor(max_workers=num_threads)
+    for i in range(num_videos):
+        actual_decode_device = decode_device_string
+        if "cuda" in decode_device_string and use_multiple_gpus:
+            actual_decode_device = f"cuda:{i % torch.cuda.device_count()}"
+        executor.submit(
+            decode_full_video, video_path, actual_decode_device, resize_device_string
+        )
+    executor.shutdown(wait=True)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--devices",
+        default="cuda:0,cpu",
+        type=str,
+        help="Comma-separated devices to test decoding on.",
+    )
+    parser.add_argument(
+        "--resize_devices",
+        default="cuda:0,cpu,native,none",
+        type=str,
+        help="Comma-separated devices to test preroc (resize) on. Use 'none' to specify no resize.",
+    )
+    parser.add_argument(
+        "--video",
+        type=str,
+        default=str(Path(__file__).parent / "../../test/resources/nasa_13013.mp4"),
+    )
+    parser.add_argument(
+        "--use_torch_benchmark",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Use pytorch benchmark to measure decode time with warmup and "
+            "autorange. Without this we just run one iteration without warmup "
+            "to measure the cold start time."
+        ),
+    )
+    parser.add_argument(
+        "--num_threads",
+        type=int,
+        default=1,
+        help="Number of threads to use for decoding. Only used when --use_torch_benchmark is set.",
+    )
+    parser.add_argument(
+        "--num_videos",
+        type=int,
+        default=50,
+        help="Number of videos to decode in parallel. Only used when --num_threads is set.",
+    )
+    parser.add_argument(
+        "--use_multiple_gpus",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=("Use multiple GPUs to decode multiple videos in multi-threaded mode."),
+    )
+    args = parser.parse_args()
+    video_path = args.video
+
+    if not args.use_torch_benchmark:
+        for device in args.devices.split(","):
+            print("Testing on", device)
+            decode_full_video(video_path, device)
+        return
+
+    resize_devices = args.resize_devices.split(",")
+    resize_devices = [d for d in resize_devices if d != ""]
+    if len(resize_devices) == 0:
+        resize_devices.append("none")
+
+    label = "Decode+Resize Time"
+
+    results = []
+    for decode_device_string in args.devices.split(","):
+        for resize_device_string in resize_devices:
+            decode_label = decode_device_string
+            if "cuda" in decode_label:
+                # Shorten "cuda:0" to "cuda"
+                decode_label = "cuda"
+            resize_label = resize_device_string
+            if "cuda" in resize_device_string:
+                # Shorten "cuda:0" to "cuda"
+                resize_label = "cuda"
+            print("decode_device", decode_device_string)
+            print("resize_device", resize_device_string)
+            if args.num_threads > 1:
+                t = benchmark.Timer(
+                    stmt="decode_videos_using_threads(video_path, decode_device_string, resize_device_string, num_videos, num_threads, use_multiple_gpus)",
+                    globals={
+                        "decode_device_string": decode_device_string,
+                        "video_path": video_path,
+                        "decode_full_video": decode_full_video,
+                        "decode_videos_using_threads": decode_videos_using_threads,
+                        "resize_device_string": resize_device_string,
+                        "num_videos": args.num_videos,
+                        "num_threads": args.num_threads,
+                        "use_multiple_gpus": args.use_multiple_gpus,
+                    },
+                    label=label,
+                    description=f"threads={args.num_threads} work={args.num_videos} video={Path(video_path).name}",
+                    sub_label=f"D={decode_label} R={resize_label} T={args.num_threads} W={args.num_videos}",
+                ).blocked_autorange()
+                results.append(t)
+            else:
+                t = benchmark.Timer(
+                    stmt="decode_full_video(video_path, decode_device_string, resize_device_string)",
+                    globals={
+                        "decode_device_string": decode_device_string,
+                        "video_path": video_path,
+                        "decode_full_video": decode_full_video,
+                        "resize_device_string": resize_device_string,
+                    },
+                    label=label,
+                    description=f"video={Path(video_path).name}",
+                    sub_label=f"D={decode_label} R={resize_label}",
+                ).blocked_autorange()
+                results.append(t)
+    compare = benchmark.Compare(results)
+    compare.print()
+    print("Key: D=Decode, R=Resize T=threads W=work (number of videos to decode)")
+    print("Native resize is done as part of the decode step")
+    print("none resize means there is no resize step -- native or otherwise")
+
+
+# Run the benchmark only when this file is executed as a script, not when it
+# is imported.
+if __name__ == "__main__":
+    main()