207 changes: 205 additions & 2 deletions .github/workflows/cuda.yml
@@ -164,6 +164,75 @@ jobs:
ls -al "${RUNNER_ARTIFACT_DIR}"
echo "::endgroup::"

export-gemma3-cuda-artifact:
name: export-gemma3-cuda-${{ matrix.quant.name }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
secrets: inherit
strategy:
fail-fast: false
matrix:
quant:
- name: "non-quantized"
artifact: "voxtral-cuda-export"
extra_args: ""
# TODO: enable gemma3 quantization
# - name: "quantized-int4-tile-packed"
# artifact: "voxtral-cuda-quantized-int4-tile-packed"
# extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
# - name: "quantized-int4-weight-only"
# artifact: "voxtral-cuda-quantized-int4-weight-only"
# # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
# extra_args: "--qlinear_encoder 4w"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
upload-artifact: ${{ matrix.quant.artifact }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch"
./install_executorch.sh
echo "::endgroup::"

echo "::group::Setup Huggingface"
pip install -U "huggingface_hub[cli]" accelerate
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
pip list
echo "::endgroup::"

echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
EXTRA_ARGS="${{ matrix.quant.extra_args }}"
optimum-cli export executorch \
--model "google/gemma-3-4b-it" \
--task "multimodal-text-to-text" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
--max_seq_len 64 \
--output_dir ./ \
${EXTRA_ARGS}

test -f model.pte
test -f aoti_cuda_blob.ptd
echo "::endgroup::"

echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
mkdir -p "${RUNNER_ARTIFACT_DIR}/"
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
ls -al "${RUNNER_ARTIFACT_DIR}/"
echo "::endgroup::"

benchmark-voxtral-cuda:
name: benchmark-voxtral-cuda
needs: export-voxtral-cuda-artifact
@@ -204,13 +273,63 @@ jobs:
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
-DEXECUTORCH_BUILD_TESTS=ON \
-Bcmake-out .
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
echo "::endgroup::"

echo "::group::Run Voxtral Benchmark"

export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd

echo "::endgroup::"

benchmark-gemma3-cuda:
name: benchmark-gemma3-cuda
needs: export-gemma3-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
download-artifact: gemma3-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch Requirements"
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare Gemma3 Artifacts"
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
ls -al model.pte aoti_cuda_blob.ptd
echo "::endgroup::"

echo "::group::Build Gemma3 Benchmark"
cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_CUDA=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
-DEXECUTORCH_BUILD_TESTS=ON \
-Bcmake-out .
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
echo "::endgroup::"

echo "::group::Run Gemma3 Benchmark"

export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd

echo "::endgroup::"

@@ -302,3 +421,87 @@ jobs:
exit $EXIT_CODE
fi
echo "::endgroup::"

test-gemma3-cuda-e2e:
name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
needs: export-gemma3-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
matrix:
format:
- name: "non-quantized"
artifact: "gemma3-cuda-export"
# TODO: enable quantized gemma3.
# - name: "quantized-int4-tile-packed"
# artifact: "gemma3-cuda-quantized-int4-tile-packed"
# - name: "quantized-int4-weight-only"
# artifact: "gemma3-cuda-quantized-int4-weight-only"
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
download-artifact: ${{ matrix.format.artifact }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch Requirements"
./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
curl -L $TOKENIZER_URL -o tokenizer.json
ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
IMAGE_PATH="docs/source/_static/img/et-logo.png"
echo "::endgroup::"

echo "::group::Build Gemma3 Runner"
cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/gemma3 \
-Bcmake-out/examples/models/gemma3/
cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
echo "::endgroup::"

echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
--model_path model.pte \
--data_path aoti_cuda_blob.ptd \
--tokenizer_path tokenizer.json \
--image_path $IMAGE_PATH \
--temperature 0 2>&1)
EXIT_CODE=$?
set -e

echo "$OUTPUT"

if ! echo "$OUTPUT" | grep -iq "chip"; then
echo "Expected output 'chip' not found in output"
exit 1
fi

if [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
fi
echo "::endgroup::"
45 changes: 39 additions & 6 deletions backends/cuda/CMakeLists.txt
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
find_package_torch()

# CUDA tensor maker for backends that support non-contiguous tensors
set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
target_include_directories(
cuda_tensor_maker
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
)
target_compile_options(
cuda_tensor_maker
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
)
# Ensure symbols are exported properly
if(APPLE)
target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
else()
target_link_options(
cuda_tensor_maker PUBLIC
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
)
endif()

# Link against ExecuTorch core libraries
target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
executorch_target_link_options_shared_lib(cuda_tensor_maker)

install(
TARGETS cuda_tensor_maker
EXPORT ExecuTorchTargets
DESTINATION lib
)
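As context for this new target: ExecuTorch's portable tensors can describe strided, non-contiguous memory when sizes and strides are supplied explicitly, which is what AOTI-produced CUDA buffers need. A minimal sketch under that assumption follows; the actual tensor_maker.h interface in this PR may differ.

// Minimal sketch, assuming the portable TensorImpl API; not this PR's actual
// tensor_maker.h. Wraps a non-contiguous CUDA buffer as an ExecuTorch Tensor
// by supplying explicit strides. No copy is made; the caller owns the memory.
#include <executorch/runtime/core/exec_aten/exec_aten.h>

using namespace executorch::aten;

void wrap_strided_view(void* cuda_data) {
  // A 2x3 view over rows of length 8 (row stride 8 > 3): non-contiguous.
  SizesType sizes[] = {2, 3};
  DimOrderType dim_order[] = {0, 1};
  StridesType strides[] = {8, 1};
  TensorImpl impl(
      ScalarType::Float, /*dim=*/2, sizes, cuda_data, dim_order, strides);
  Tensor t(&impl); // valid only while impl and the metadata arrays are alive
  (void)t;
}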

# CUDA-specific AOTI functionality
set(_aoti_cuda_sources
runtime/cuda_backend.cpp
@@ -62,20 +95,20 @@ target_link_options(
aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
)

# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
# CUDA libraries
target_link_libraries(
aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
)
# If you need other CUDA libraries, link them similarly:
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
executorch_target_link_options_shared_lib(aoti_cuda)

if(BUILD_TESTING)
# Add runtime
add_executable(voxtral_runner tests/voxtral_runner.cpp)
add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp)
target_link_libraries(
voxtral_runner PUBLIC aoti_cuda extension_module_static
extension_flat_tensor portable_ops_lib
multimodal_benchmark PUBLIC aoti_cuda extension_module_static
extension_flat_tensor portable_ops_lib
)
endif()
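
Since the renamed binary serves multiple model families, the CI jobs above pass the family as the first positional argument. A hypothetical sketch of that argv contract (the real tests/multimodal_benchmark.cpp may structure this differently):

// Hypothetical sketch of the CLI contract implied by the CI invocations:
// multimodal_benchmark <voxtral|gemma3> <model.pte> <data.ptd>
#include <cstring>
#include <iostream>

int main(int argc, char** argv) {
  if (argc != 4) {
    std::cerr << "usage: " << argv[0]
              << " <voxtral|gemma3> <model.pte> <data.ptd>\n";
    return 1;
  }
  const bool known = std::strcmp(argv[1], "voxtral") == 0 ||
      std::strcmp(argv[1], "gemma3") == 0;
  if (!known) {
    std::cerr << "unknown model family: " << argv[1] << "\n";
    return 1;
  }
  // Load argv[2] (program) plus argv[3] (named-data blob) and run the
  // family-specific prefill/decode benchmark.
  return 0;
}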

3 changes: 3 additions & 0 deletions backends/cuda/cuda_backend.py
@@ -140,6 +140,9 @@ def preprocess(
user_input_placeholders.append(node.meta["val"])

options: dict[str, typing.Any] = {
# Disable this to support sdpa decomposition
# TODO(gasoonjia): remove it after pin bump to latest pytorch
"loop_ordering_after_fusion": False,
# Better model precision
"emulate_precision_casts": True,
# Embed CUDA kernel binaries directly into the compiled shared object
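For context on how such option dicts are consumed: AOTInductor applies them as torch._inductor config patches during lowering. A hedged sketch, with names and call site assumed (the actual call site in cuda_backend.py is outside this diff):

# Illustrative only: lowering an exported graph to a CUDA shared object with
# the options shown above. Function name and call site are assumptions.
import torch

def lower_to_cuda_so(gm: torch.fx.GraphModule, example_inputs) -> str:
    options = {
        # Keep loop ordering stable so the SDPA decomposition survives
        # (temporary until the PyTorch pin is bumped).
        "loop_ordering_after_fusion": False,
        # Match eager-mode numerics at some performance cost.
        "emulate_precision_casts": True,
    }
    # Returns the path to the compiled .so that the backend then packages.
    return torch._inductor.aot_compile(gm, example_inputs, options=options)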
21 changes: 20 additions & 1 deletion backends/cuda/runtime/TARGETS
@@ -27,6 +27,25 @@ runtime.cxx_library(
],
)

runtime.cxx_library(
name = "tensor_maker",
srcs = [
"tensor/tensor_maker.cpp",
],
headers = [
"tensor/tensor_maker.h",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
visibility = ["@EXECUTORCH_CLIENTS"],
deps = [
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:tensor_util",
],
)

runtime.cxx_library(
name = "runtime_shims",
srcs = [
@@ -52,8 +71,8 @@ runtime.cxx_library(
compiler_flags = ["-Wno-global-constructors"],
visibility = ["@EXECUTORCH_CLIENTS"],
deps = [
":tensor_maker",
"//executorch/backends/aoti:common_shims",
"//executorch/extension/tensor:tensor",
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/platform:platform",