Skip to content

Commit 697078b

Browse files
authored
[aoti-et] Add cuda delegate runtime code (#14827)
This pull request introduces comprehensive support for the CUDA backend in ExecuTorch, enabling model export, build, and runtime execution with CUDA acceleration. It adds new CMake build logic, implements the CUDA backend runtime, updates workflow automation for CUDA model testing, and improves type and error handling for CUDA-specific operations. **CUDA Backend Integration** * Added new CUDA backend build logic to `CMakeLists.txt`, including registration of the `aoti_cuda` backend and dependencies on common AOTI and CUDA-specific sources. (`CMakeLists.txt`, [[1]](diffhunk://#diff-1e7de1ae2d059d21e1dd75d5812d5a34b0222cef273b7c3a2af62eb747f9d20aR590-R599); `backends/cuda/CMakeLists.txt`, [[2]](diffhunk://#diff-c2a6fbfdf4c7871966d5decf186dd0d6591d64d5e8a96abd126476942debe7fdR1-R63) * Implemented the `CudaBackend` runtime in `cuda_backend.cpp`, handling dynamic loading of model containers, GPU tensor management, and execution flow for CUDA kernels. (`backends/cuda/runtime/cuda_backend.cpp`, [backends/cuda/runtime/cuda_backend.cppR1-R383](diffhunk://#diff-a4b17eccf1aa933837671c5184e02bc815d934a362344bb2b17b789cdfaa5375R1-R383)) **Workflow and Testing Automation** * Updated and renamed the CUDA workflow file to add a matrix job for CUDA model testing, running tests for multiple models on GPU hardware. (`.github/workflows/cuda.yml`, [.github/workflows/cuda.ymlR64-R87](diffhunk://#diff-29abea04e0613c2569973e5c8e3c89e04846d408c855eeb1f3efcfae7cfa6f89R64-R87)) * Enhanced the CI test script to support CUDA backend selection, model export, and execution, including artifact preparation. 
(`.ci/scripts/test_model.sh`, [[1]](diffhunk://#diff-841b10bb60e2171b43fd26ab87545bb645f3a4f40a20b5dedb7447387dd133d0R66-R72) [[2]](diffhunk://#diff-841b10bb60e2171b43fd26ab87545bb645f3a4f40a20b5dedb7447387dd133d0R333-R339) [[3]](diffhunk://#diff-841b10bb60e2171b43fd26ab87545bb645f3a4f40a20b5dedb7447387dd133d0R392-R397) **Type and Error Handling Improvements** * Extended supported data types for the CUDA backend, adding `INT64` and updating error messages for unsupported dtypes. (`backends/cuda/runtime/shims/utils.h`, [[1]](diffhunk://#diff-f4873dd1770e339eb207c219bea2b72b3bad59fad941f39d7cfb8923cadd3541R43) [[2]](diffhunk://#diff-f4873dd1770e339eb207c219bea2b72b3bad59fad941f39d7cfb8923cadd3541R104) [[3]](diffhunk://#diff-f4873dd1770e339eb207c219bea2b72b3bad59fad941f39d7cfb8923cadd3541L116-R120) * Added new type aliases and fields for CUDA delegate and tensor handles to support runtime operations. (`backends/aoti/aoti_model_container.h`, [[1]](diffhunk://#diff-84caca41e72ad693665c930ab7d0c31e05f64b268f4d7ac37c17869149fad0c7R24) [[2]](diffhunk://#diff-84caca41e72ad693665c930ab7d0c31e05f64b268f4d7ac37c17869149fad0c7R78) **Miscellaneous** * Improved include paths for the AOTI common library to ensure proper header resolution. (`backends/aoti/CMakeLists.txt`, [backends/aoti/CMakeLists.txtL33-R35](diffhunk://#diff-c3d5933d211acc568c9bdf8e08d0ca99b01e50bca113307fbab4cbc4018fdf55L33-R35)) * Added copyright and documentation to the CUDA export scripts. (`examples/cuda/scripts/__init__.py`, [examples/cuda/scripts/__init__.pyR1-R7](diffhunk://#diff-2ef2a5794420089aeb5bf7cf3bcd4e82c722c408a171b83c2caafddc1ab55d84R1-R7))
1 parent fcd42bc commit 697078b

File tree

16 files changed

+644
-5
lines changed

16 files changed

+644
-5
lines changed

.ci/scripts/test_model.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,14 @@ build_cmake_executor_runner() {
6363
${COMMON} \
6464
-B${CMAKE_OUTPUT_DIR} .
6565
cmake --build ${CMAKE_OUTPUT_DIR} -j4
66+
elif [[ "$backend_string_select" == "CUDA" ]]; then
67+
echo "Backend $backend_string_select selected"
68+
cmake -DCMAKE_BUILD_TYPE=Release \
69+
-DEXECUTORCH_BUILD_CUDA=ON \
70+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
71+
${COMMON} \
72+
-B${CMAKE_OUTPUT_DIR} .
73+
cmake --build ${CMAKE_OUTPUT_DIR} -j4
6674
else
6775
cmake -DCMAKE_BUILD_TYPE=Debug \
6876
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
@@ -323,6 +331,13 @@ test_model_with_mediatek() {
323331
EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit)
324332
}
325333

334+
test_model_with_cuda() {
335+
# Export a basic .pte and .ptd, then run the model.
336+
"${PYTHON_EXECUTABLE}" -m examples.cuda.scripts.export --model_name="${MODEL_NAME}" --output_dir "./"
337+
build_cmake_executor_runner "CUDA"
338+
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" --data_path "./aoti_cuda_blob.ptd"
339+
}
340+
326341

327342
if [[ "${BACKEND}" == "portable" ]]; then
328343
echo "Testing ${MODEL_NAME} with portable kernels..."
@@ -375,6 +390,12 @@ elif [[ "${BACKEND}" == "mediatek" ]]; then
375390
if [[ $? -eq 0 ]]; then
376391
prepare_artifacts_upload
377392
fi
393+
elif [[ "${BACKEND}" == "cuda" ]]; then
394+
echo "Testing ${MODEL_NAME} with cuda..."
395+
test_model_with_cuda
396+
if [[ $? -eq 0 ]]; then
397+
prepare_artifacts_upload
398+
fi
378399
else
379400
set +e
380401
if [[ "${BACKEND}" == *"quantization"* ]]; then

.github/workflows/test-cuda-builds.yml renamed to .github/workflows/cuda.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,28 @@ jobs:
6161
else
6262
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!"
6363
fi
64+
65+
test-models-cuda:
66+
name: test-models-cuda
67+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
68+
permissions:
69+
id-token: write
70+
contents: read
71+
strategy:
72+
fail-fast: false
73+
matrix:
74+
model: [linear, add, add_mul, resnet18]
75+
with:
76+
timeout: 90
77+
runner: linux.g5.4xlarge.nvidia.gpu
78+
gpu-arch-type: cuda
79+
gpu-arch-version: 12.6
80+
use-custom-docker-registry: false
81+
submodules: recursive
82+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
83+
script: |
84+
set -eux
85+
86+
PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
87+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
88+
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ exclude_patterns = [
219219
'**/*.gif',
220220
'extension/llm/tokenizers',
221221
'extension/llm/tokenizers/**',
222+
'examples/cuda',
222223
# File contains @generated
223224
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
224225
'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,16 @@ endif()
587587

588588
if(EXECUTORCH_BUILD_CORTEX_M)
589589
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
590+
list(APPEND _executorch_backends cortex_m_backend)
591+
endif()
592+
593+
if(EXECUTORCH_BUILD_CUDA)
594+
# Build common AOTI functionality (required for CUDA)
595+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
596+
# Build CUDA-specific AOTI functionality
597+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
598+
# Add aoti_cuda to backends - it already depends on aoti_common
599+
list(APPEND _executorch_backends aoti_cuda)
590600
endif()
591601

592602
if(EXECUTORCH_BUILD_EXTENSION_APPLE)

backends/aoti/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp)
3030
add_library(aoti_common STATIC ${_aoti_common_sources})
3131
target_include_directories(
3232
aoti_common
33-
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
33+
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
34+
$<INSTALL_INTERFACE:include>
35+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
3436
# PyTorch AOTI headers from ExecuTorch's torch detection
3537
${TORCH_INCLUDE_DIRS}
3638
)

backends/aoti/aoti_model_container.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ using executorch::runtime::etensor::Tensor;
2121
extern "C" {
2222

2323
// Type definitions
24+
using AOTITensorHandle = Tensor*;
2425
using AOTIRuntimeError = Error;
2526

2627
// Forward declarations for AOT Inductor model container
@@ -74,6 +75,7 @@ extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
7475
// AOTI Delegate Handle structure
7576
struct AOTIDelegateHandle {
7677
void* so_handle;
78+
std::string so_path;
7779
AOTInductorModelContainerHandle container_handle;
7880
};
7981

backends/cuda/CMakeLists.txt

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
#
7+
# Build AOTI CUDA backend for runtime.
8+
#
9+
# ### Editing this file ###
10+
#
11+
# This file should be formatted with
12+
# ~~~
13+
# cmake-format -i CMakeLists.txt
14+
# ~~~
15+
# It should also be cmake-lint clean.
16+
#
17+
cmake_minimum_required(VERSION 3.29)
18+
19+
set(CMAKE_CXX_STANDARD 17)
20+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
21+
set(CMAKE_CUDA_STANDARD 17)
22+
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
23+
24+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
25+
26+
# Source root directory for executorch.
27+
if(NOT EXECUTORCH_ROOT)
28+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
29+
endif()
30+
31+
find_package(CUDAToolkit REQUIRED)
32+
33+
# Use ExecuTorch's standard way to find PyTorch libraries for AOTI
34+
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
35+
find_package_torch()
36+
37+
# CUDA-specific AOTI functionality
38+
set(_aoti_cuda_sources runtime/cuda_backend.cpp runtime/shims/memory.cpp
39+
runtime/shims/tensor_attribute.cpp
40+
)
41+
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
42+
target_include_directories(
43+
aoti_cuda
44+
PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
45+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
46+
$<INSTALL_INTERFACE:include>
47+
# PyTorch AOTI headers from ExecuTorch's torch detection
48+
${TORCH_INCLUDE_DIRS}
49+
)
50+
target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC)
51+
# Ensure symbols are exported properly
52+
target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
53+
54+
# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
55+
target_link_libraries(
56+
aoti_cuda
57+
PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
58+
# Link PyTorch libraries for AOTI CUDA functions
59+
${TORCH_LIBRARIES}
60+
)
61+
# If you need other CUDA libraries, link them similarly:
62+
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
63+
executorch_target_link_options_shared_lib(aoti_cuda)
64+
65+
install(
66+
TARGETS aoti_cuda
67+
EXPORT ExecuTorchTargets
68+
DESTINATION lib
69+
)

0 commit comments

Comments
 (0)