pytorch
diff --git a/‎.ci/scripts/test_backend.sh‎
Lines changed: 1 addition & 0 deletions b/‎.ci/scripts/test_backend.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.ci/scripts/test_model.sh‎
Lines changed: 28 additions & 4 deletions b/‎.ci/scripts/test_model.sh‎
Lines changed: 28 additions & 4 deletions
diff --git a/‎.ci/scripts/utils.sh‎
Lines changed: 4 additions & 3 deletions b/‎.ci/scripts/utils.sh‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎.github/workflows/test-cuda-builds.yml‎ renamed to ‎.github/workflows/cuda.yml‎
Lines changed: 25 additions & 0 deletions b/‎.github/workflows/test-cuda-builds.yml‎ renamed to ‎.github/workflows/cuda.yml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/pull.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/test-backend-arm.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/test-backend-arm.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.lintrunner.toml‎
Lines changed: 1 addition & 0 deletions b/‎.lintrunner.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 19 additions & 5 deletions b/‎CMakeLists.txt‎
Lines changed: 19 additions & 5 deletions
diff --git a/‎backends/aoti/CMakeLists.txt‎
Lines changed: 4 additions & 2 deletions b/‎backends/aoti/CMakeLists.txt‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎backends/aoti/aoti_model_container.h‎
Lines changed: 2 additions & 0 deletions b/‎backends/aoti/aoti_model_container.h‎
Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,7 @@ fi
 if [[ "$FLOW" == *arm* ]]; then
     # Setup ARM deps.
     .ci/scripts/setup-arm-baremetal-tools.sh
+    source examples/arm/ethos-u-scratch/setup_path.sh
 
     if [[ "$FLOW" == *ethos_u* ]]; then
         # Prepare a test runner binary that can run on the Corstone-3x0 FVPs
 
@@ -48,22 +48,33 @@ prepare_artifacts_upload() {
   fi
 }
 
+
 build_cmake_executor_runner() {
   local backend_string_select="${1:-}"
   echo "Building executor_runner"
   rm -rf ${CMAKE_OUTPUT_DIR}
   mkdir ${CMAKE_OUTPUT_DIR}
+  # Common options:
+  COMMON="-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE"
   if [[ "$backend_string_select" == "XNNPACK" ]]; then
     echo "Backend $backend_string_select selected"
-    (cd ${CMAKE_OUTPUT_DIR} \
-      && cmake -DCMAKE_BUILD_TYPE=Release \
+    cmake -DCMAKE_BUILD_TYPE=Release \
         -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+        ${COMMON} \
+        -B${CMAKE_OUTPUT_DIR} .
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4
+  elif [[ "$backend_string_select" == "CUDA" ]]; then
+    echo "Backend $backend_string_select selected"
+    cmake -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_CUDA=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+        ${COMMON} \
+        -B${CMAKE_OUTPUT_DIR} .
     cmake --build ${CMAKE_OUTPUT_DIR} -j4
   else
     cmake -DCMAKE_BUILD_TYPE=Debug \
         -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+        ${COMMON} \
         -B${CMAKE_OUTPUT_DIR} .
     cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
   fi
@@ -320,6 +331,13 @@ test_model_with_mediatek() {
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit)
 }
 
+test_model_with_cuda() {
+  # Export a basic .pte and .ptd, then run the model.
+  "${PYTHON_EXECUTABLE}" -m examples.cuda.scripts.export --model_name="${MODEL_NAME}" --output_dir "./"
+  build_cmake_executor_runner "CUDA"
+  ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" --data_path "./aoti_cuda_blob.ptd"
+}
+
 
 if [[ "${BACKEND}" == "portable" ]]; then
   echo "Testing ${MODEL_NAME} with portable kernels..."
@@ -372,6 +390,12 @@ elif [[ "${BACKEND}" == "mediatek" ]]; then
   if [[ $? -eq 0 ]]; then
     prepare_artifacts_upload
   fi
+elif [[ "${BACKEND}" == "cuda" ]]; then
+  echo "Testing ${MODEL_NAME} with cuda..."
+  test_model_with_cuda
+  if [[ $? -eq 0 ]]; then
+    prepare_artifacts_upload
+  fi
 else
   set +e
   if [[ "${BACKEND}" == *"quantization"* ]]; then
 
@@ -125,14 +125,15 @@ build_executorch_runner_cmake() {
   clean_executorch_install_folders
   mkdir "${CMAKE_OUTPUT_DIR}"
 
-  pushd "${CMAKE_OUTPUT_DIR}" || return
   if [[ $1 == "Debug" ]]; then
       CXXFLAGS="-fsanitize=address,undefined"
   else
       CXXFLAGS=""
   fi
-  CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" ..
-  popd || return
+  CXXFLAGS="$CXXFLAGS" retry cmake \
+    -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
+    -DCMAKE_BUILD_TYPE="${1:-Release}" \
+    -B${CMAKE_OUTPUT_DIR} .
 
   if [ "$(uname)" == "Darwin" ]; then
     CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 ))
 
@@ -61,3 +61,28 @@ jobs:
           else
             echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!"
           fi
+
+  test-models-cuda:
+    name: test-models-cuda
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        model: [linear, add, add_mul, resnet18]
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
@@ -909,12 +909,12 @@ jobs:
       contents: read
     secrets: inherit
     with:
+      secrets-env: SAMSUNG_AI_LITECORE_KEY
       runner: linux.2xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-clang12-android
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
-      secrets-env: SAMSUNG_AI_LITECORE_KEY
       script: |
         set -ex
 
 
@@ -12,6 +12,9 @@ on:
     paths:
       - .github/workflows/test-backend-arm.yml
       - .github/workflows/_test_backend.yml
+      - .ci/scripts/test_backend.sh
+      - backends/test/suite/flow.py
+      - backends/test/suite/flows/arm.py
   workflow_dispatch:
 
 concurrency:
 
@@ -219,6 +219,7 @@ exclude_patterns = [
     '**/*.gif',
     'extension/llm/tokenizers',
     'extension/llm/tokenizers/**',
+    'examples/cuda',
     # File contains @generated
     'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
 
@@ -226,7 +226,7 @@ if(EXECUTORCH_BUILD_CPUINFO)
   install(
     TARGETS cpuinfo
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${_common_include_directories}
   )
@@ -269,7 +269,7 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   install(
     TARGETS pthreadpool pthreadpool_interface fxdiv
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${_common_include_directories}
   )
@@ -587,6 +587,16 @@ endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+  list(APPEND _executorch_backends coretex_m_backend)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  # Build common AOTI functionality (required for CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+  # Build CUDA-specific AOTI functionality
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
+  # Add aoti_cuda to backends - it already depends on aoti_common
+  list(APPEND _executorch_backends aoti_cuda)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
@@ -713,7 +723,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
   install(
     TARGETS torchao_ops_executorch torchao_kernels_aarch64
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${_common_include_directories}
   )
@@ -724,7 +734,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
     install(
       TARGETS kleidiai
       EXPORT ExecuTorchTargets
-      DESTINATION lib
+      DESTINATION ${CMAKE_INSTALL_LIBDIR}
       INCLUDES
       DESTINATION ${_common_include_directories}
     )
@@ -1004,7 +1014,7 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL ""
   install(
     TARGETS executorch_selected_kernels
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
   )
 else()
   # No selective build - link the full library.
@@ -1026,6 +1036,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
                             extension_runner_util gflags executorch_backends
   )
 
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
 
@@ -30,7 +30,9 @@ set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp)
 add_library(aoti_common STATIC ${_aoti_common_sources})
 target_include_directories(
   aoti_common
-  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+         $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
          # PyTorch AOTI headers from ExecuTorch's torch detection
          ${TORCH_INCLUDE_DIRS}
 )
@@ -50,5 +52,5 @@ executorch_target_link_options_shared_lib(aoti_common)
 install(
   TARGETS aoti_common
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
@@ -21,6 +21,7 @@ using executorch::runtime::etensor::Tensor;
 extern "C" {
 
 // Type definitions
+using AOTITensorHandle = Tensor*;
 using AOTIRuntimeError = Error;
 
 // Forward declarations for AOT Inductor model container
@@ -74,6 +75,7 @@ extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
 // AOTI Delegate Handle structure
 struct AOTIDelegateHandle {
   void* so_handle;
+  std::string so_path;
   AOTInductorModelContainerHandle container_handle;
 };