Skip to content

Commit 3b02829

Browse files
committed
refactor aoti-driven backends
1 parent 32c14b1 commit 3b02829

30 files changed

+485
-333
lines changed

CMakeLists.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
cmake_minimum_required(VERSION 3.29)
5151
project(executorch)
5252

53-
5453
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
5554

5655
include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
@@ -592,9 +591,13 @@ if(EXECUTORCH_BUILD_CORTEX_M)
592591
list(APPEND _executorch_backends cortex_m_backend)
593592
endif()
594593

595-
if(EXECUTORCH_BUILD_AOTI)
594+
if(EXECUTORCH_BUILD_CUDA)
595+
# Build common AOTI functionality (required for CUDA)
596596
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
597-
list(APPEND _executorch_backends aoti_backend)
597+
# Build CUDA-specific AOTI functionality
598+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti/cuda)
599+
# Add aoti_cuda to backends - it already depends on aoti_common
600+
list(APPEND _executorch_backends aoti_cuda)
598601
endif()
599602

600603
if(EXECUTORCH_BUILD_EXTENSION_APPLE)

backends/aoti/CMakeLists.txt

Lines changed: 17 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,48 +21,34 @@ if(NOT EXECUTORCH_ROOT)
2121
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
2222
endif()
2323

24-
find_package(CUDAToolkit REQUIRED)
25-
2624
# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
2725
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
2826
find_package_torch()
2927

30-
set(_aoti_sources
31-
runtime/aoti_backend.cpp
32-
runtime/aoti_model_container.cpp
33-
runtime/shims/memory.cpp
34-
runtime/shims/tensor_attribute.cpp
35-
runtime/shims/utils.cpp)
36-
add_library(aoti_backend STATIC ${_aoti_sources})
28+
# Common AOTI functionality (non-CUDA)
29+
set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp utils.cpp)
30+
add_library(aoti_common STATIC ${_aoti_common_sources})
3731
target_include_directories(
38-
aoti_backend
39-
PUBLIC
40-
${CUDAToolkit_INCLUDE_DIRS}
41-
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
42-
$<INSTALL_INTERFACE:include>
43-
# PyTorch AOTI headers from ExecutorTorch's torch detection
44-
${TORCH_INCLUDE_DIRS}
32+
aoti_common
33+
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
34+
# PyTorch AOTI headers from ExecutorTorch's torch detection
35+
${TORCH_INCLUDE_DIRS}
4536
)
46-
target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC)
37+
target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
4738
# Ensure symbols are exported properly
48-
target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic)
39+
target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)
4940

50-
# Link against CUDA::cudart, PyTorch libraries and standard libraries
41+
# Link against PyTorch libraries and standard libraries
5142
target_link_libraries(
52-
aoti_backend
53-
PUBLIC
54-
extension_tensor
55-
CUDA::cudart
56-
${CMAKE_DL_LIBS}
57-
# Link PyTorch libraries for AOTI CUDA functions
58-
${TORCH_LIBRARIES}
43+
aoti_common
44+
PUBLIC extension_tensor ${CMAKE_DL_LIBS}
45+
# Link PyTorch libraries for AOTI functions
46+
${TORCH_LIBRARIES}
5947
)
60-
# If you need other CUDA libraries, link them similarly:
61-
# target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...)
62-
# If you have a custom function, keep it
63-
executorch_target_link_options_shared_lib(aoti_backend)
48+
executorch_target_link_options_shared_lib(aoti_common)
49+
6450
install(
65-
TARGETS aoti_backend
51+
TARGETS aoti_common
6652
EXPORT ExecuTorchTargets
6753
DESTINATION lib
6854
)

backends/aoti/README.md

Lines changed: 0 additions & 2 deletions
This file was deleted.
File renamed without changes.

backends/aoti/runtime/aoti_model_container.h renamed to backends/aoti/aoti_model_container.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
#include <executorch/extension/tensor/tensor.h>
1212
#include <executorch/runtime/core/error.h>
13-
#include "shims/memory.h"
13+
#include "cuda/runtime/shims/memory.h"
1414

1515
namespace executorch {
1616
namespace backends {

backends/aoti/runtime/shims/tensor_attribute.cpp renamed to backends/aoti/common_shims.cpp

Lines changed: 22 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,31 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#include "tensor_attribute.h"
9+
#include "common_shims.h"
10+
#include <executorch/runtime/platform/log.h>
11+
#include <cstdint>
12+
#include <cstdio>
13+
#include <fstream>
1014
#include <iostream>
11-
#include "utils.h"
15+
#include <stdexcept>
1216

1317
namespace executorch {
1418
namespace backends {
1519
namespace aoti {
1620

21+
namespace internal {
22+
// Constants for file operations
23+
const char* const TENSOR_OUTPUT_FILENAME =
24+
"/home/gasoonjia/executorch/aoti_intermediate_output.txt";
25+
} // namespace internal
26+
1727
// Global storage for tensor metadata
1828
std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
1929
std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
2030

2131
extern "C" {
2232

33+
// Autograd mode functions
2334
int32_t aoti_torch_grad_mode_is_enabled() {
2435
// No autograd ever
2536
return false;
@@ -31,6 +42,7 @@ void aoti_torch_grad_mode_set_enabled(bool enabled) {
3142
}
3243
}
3344

45+
// Tensor attribute operations
3446
AOTITorchError aoti_torch_get_data_ptr(
3547
AOTITensorHandle tensor,
3648
void** ret_data_ptr) {
@@ -69,12 +81,6 @@ AOTITorchError aoti_torch_get_dtype(
6981
int32_t* ret_dtype) {
7082
*ret_dtype = static_cast<int32_t>(tensor->scalar_type());
7183

72-
// ASSERTION: Only float32 tensors are supported
73-
AOTITorchError dtype_error = validate_dtype(*ret_dtype);
74-
if (dtype_error != Error::Ok) {
75-
return dtype_error;
76-
}
77-
7884
return Error::Ok;
7985
}
8086

@@ -100,13 +106,6 @@ AOTITorchError aoti_torch_get_storage_size(
100106
throw std::runtime_error("Cannot get storage size on ETensor");
101107
}
102108

103-
AOTITorchError aoti_torch_get_device_type(
104-
AOTITensorHandle tensor,
105-
int32_t* ret_device_type) {
106-
// All tensors in aoti-cuda delegate are on CUDA
107-
*ret_device_type = aoti_torch_device_type_cuda();
108-
return Error::Ok;
109-
}
110109

111110
AOTITorchError aoti_torch_get_device_index(
112111
AOTITensorHandle tensor,
@@ -121,6 +120,7 @@ AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) {
121120
return Error::Ok;
122121
}
123122

123+
// Device and layout utility functions
124124
int32_t aoti_torch_device_type_cpu() {
125125
// Let's say cpu is 0 for ET as well
126126
return 0;
@@ -132,60 +132,23 @@ __attribute__((__visibility__("default"))) int32_t aoti_torch_layout_strided() {
132132
return 0;
133133
}
134134

135-
__attribute__((__visibility__("default"))) int32_t
136-
aoti_torch_device_type_cuda() {
137-
// Let's say cuda is 1 for ET as well
138-
return 1;
139-
}
140-
141135
// Dtype constants - these return the PyTorch dtype codes
142136
// Currently only float32 is supported, but using robust enum-based approach
143137
__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() {
144-
return static_cast<int32_t>(SupportedDTypes::FLOAT32);
138+
return 6; // PyTorch's float32 dtype code
145139
}
146140

147-
// Future dtype support (commented out for now):
148-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bool() {
149-
// return static_cast<int32_t>(SupportedDTypes::BOOL);
150-
// }
151-
//
152-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_uint8() {
153-
// return static_cast<int32_t>(SupportedDTypes::UINT8);
154-
// }
155-
//
156-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int8() {
157-
// return static_cast<int32_t>(SupportedDTypes::INT8);
158-
// }
159-
//
160-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int16() {
161-
// return static_cast<int32_t>(SupportedDTypes::INT16);
162-
// }
163-
//
164-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int32() {
165-
// return static_cast<int32_t>(SupportedDTypes::INT32);
166-
// }
167-
//
168-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int64() {
169-
// return static_cast<int32_t>(SupportedDTypes::INT64);
170-
// }
171-
//
172-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float16() {
173-
// return static_cast<int32_t>(SupportedDTypes::FLOAT16);
174-
// }
175-
//
176-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float64() {
177-
// return static_cast<int32_t>(SupportedDTypes::FLOAT64);
178-
// }
179-
//
180-
// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bfloat16() {
181-
// return static_cast<int32_t>(SupportedDTypes::BFLOAT16);
182-
// }
183-
141+
// Cleanup functions
184142
void cleanup_tensor_metadata() {
185143
tensor_to_sizes.clear();
186144
tensor_to_strides.clear();
187145
}
188146

147+
void cleanup_aoti_tensor_output() {
148+
// Clean up any tensor output related resources
149+
// For now this is a no-op, but can be extended if needed
150+
}
151+
189152
} // extern "C"
190153

191154
} // namespace aoti

backends/aoti/runtime/shims/tensor_attribute.h renamed to backends/aoti/common_shims.h

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,30 @@
99
#pragma once
1010

1111
#include <cuda_runtime.h>
12+
#include <executorch/backends/aoti/utils.h>
13+
#include <executorch/extension/tensor/tensor.h>
14+
#include <executorch/runtime/core/error.h>
15+
#include <executorch/runtime/core/exec_aten/exec_aten.h>
16+
#include <cstdint>
1217
#include <unordered_map>
1318
#include <vector>
14-
#include "types.h"
1519

1620
namespace executorch {
1721
namespace backends {
1822
namespace aoti {
1923

24+
// Common using declarations for ExecutorTorch types
25+
using executorch::runtime::Error;
26+
using executorch::runtime::etensor::Tensor;
27+
2028
extern "C" {
2129

30+
// Common AOTI type aliases
31+
// Note: AOTITensorHandle is aliased to Tensor* for ExecutorTorch compatibility
32+
using AOTITensorHandle = Tensor*;
33+
using AOTIRuntimeError = Error;
34+
using AOTITorchError = Error;
35+
2236
// Global storage for tensor metadata
2337
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
2438
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
@@ -48,10 +62,6 @@ AOTITorchError aoti_torch_get_storage_size(
4862
AOTITensorHandle tensor,
4963
int64_t* ret_size);
5064

51-
AOTITorchError aoti_torch_get_device_type(
52-
AOTITensorHandle tensor,
53-
int32_t* ret_device_type);
54-
5565
AOTITorchError aoti_torch_get_device_index(
5666
AOTITensorHandle tensor,
5767
int32_t* ret_device_index);
@@ -60,16 +70,16 @@ AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim);
6070

6171
// Utility functions for device and layout information
6272
int32_t aoti_torch_device_type_cpu();
63-
int32_t aoti_torch_device_type_cuda();
6473
int32_t aoti_torch_layout_strided();
6574
int32_t aoti_torch_dtype_float32();
6675

6776
// Autograd mode functions
6877
int32_t aoti_torch_grad_mode_is_enabled();
6978
void aoti_torch_grad_mode_set_enabled(bool enabled);
7079

71-
// Cleanup function for clearing global state
80+
// Cleanup functions for clearing global state
7281
void cleanup_tensor_metadata();
82+
void cleanup_aoti_tensor_output();
7383

7484
} // extern "C"
7585

backends/aoti/cuda/CMakeLists.txt

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
#
7+
# Build AOTI CUDA backend for runtime.
8+
#
9+
# ### Editing this file ###
10+
#
11+
# This file should be formatted with
12+
# ~~~
13+
# cmake-format -i CMakeLists.txt
14+
# ~~~
15+
# It should also be cmake-lint clean.
16+
#
17+
18+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
19+
20+
# Source root directory for executorch.
21+
if(NOT EXECUTORCH_ROOT)
22+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
23+
endif()
24+
25+
find_package(CUDAToolkit REQUIRED)
26+
27+
# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
28+
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
29+
find_package_torch()
30+
31+
# CUDA-specific AOTI functionality
32+
set(_aoti_cuda_sources
33+
runtime/cuda_backend.cpp
34+
runtime/shims/memory.cpp
35+
runtime/shims/tensor_attribute.cpp
36+
runtime/utils.cpp)
37+
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
38+
target_include_directories(
39+
aoti_cuda
40+
PUBLIC
41+
${CUDAToolkit_INCLUDE_DIRS}
42+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
43+
$<INSTALL_INTERFACE:include>
44+
# PyTorch AOTI headers from ExecutorTorch's torch detection
45+
${TORCH_INCLUDE_DIRS}
46+
)
47+
target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC)
48+
# Ensure symbols are exported properly
49+
target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
50+
51+
# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
52+
target_link_libraries(
53+
aoti_cuda
54+
PUBLIC
55+
aoti_common
56+
CUDA::cudart
57+
${CMAKE_DL_LIBS}
58+
# Link PyTorch libraries for AOTI CUDA functions
59+
${TORCH_LIBRARIES}
60+
)
61+
# If you need other CUDA libraries, link them similarly:
62+
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
63+
executorch_target_link_options_shared_lib(aoti_cuda)
64+
65+
66+
install(
67+
TARGETS aoti_cuda
68+
EXPORT ExecuTorchTargets
69+
DESTINATION lib
70+
)

backends/aoti/cuda/TARGETS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
load("targets.bzl", "define_common_targets")
2+
3+
define_common_targets()

0 commit comments

Comments (0)