
Commit 8e80529

solve namespace conflict
1 parent ba170ae commit 8e80529

File tree: 10 files changed, +1597 −253 lines changed

backends/cuda/CMakeLists.txt

Lines changed: 8 additions & 40 deletions
@@ -16,6 +16,9 @@
 #
 cmake_minimum_required(VERSION 3.29)
 
+# Enable CUDA language support
+enable_language(CUDA)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CUDA_STANDARD 17)
@@ -30,56 +33,21 @@ endif()
 
 find_package(CUDAToolkit REQUIRED)
 
-# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
+# Use ExecuTorch's standard way to find PyTorch libraries for AOTI
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch_headers()
 
-# CUDA tensor maker for backends that support incontiguous tensors
-set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
-add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
-target_include_directories(
-  cuda_tensor_maker
-  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
-         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
-)
-target_compile_options(
-  cuda_tensor_maker
-  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
-         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
-)
-# Ensure symbols are exported properly
-if(APPLE)
-  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
-else()
-  target_link_options(
-    cuda_tensor_maker PUBLIC
-    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
-  )
-endif()
-
-# Link against ExecuTorch core libraries
-target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
-executorch_target_link_options_shared_lib(cuda_tensor_maker)
-
-install(
-  TARGETS cuda_tensor_maker
-  EXPORT ExecuTorchTargets
-  DESTINATION lib
-)
-
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
   runtime/cuda_backend.cpp
   runtime/platform/platform.cpp
+  # runtime/slim/cuda/int4mm.cu
 )
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(
   aoti_cuda
-  PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
-         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+  PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
          $<INSTALL_INTERFACE:include>
-         # PyTorch AOTI headers from ExecutorTorch's torch detection
-         ${TORCH_INCLUDE_DIRS}
 )
 target_compile_options(
   aoti_cuda PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
@@ -96,9 +64,9 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, cuda_tensor_maker, and PyTorch CUDA libraries
+# Link against CUDA::cudart and PyTorch CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC executorch cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC executorch CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)

backends/cuda/runtime/shims/aoti_torch/c/macros.h

Lines changed: 0 additions & 1 deletion
@@ -36,4 +36,3 @@ using AOTITorchError = int32_t;
 // work without any change, e.g. c10::DeviceType::CUDA will actually refer to
 // executorch::backends::cuda::c10::DeviceType::CUDA
 using namespace executorch::backends::cuda;
-using namespace executorch::backends::cuda::c10;
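Dropping this using-directive looks like the core of the "solve namespace conflict" fix: a blanket directive for the vendored c10 makes every name it contains visible at the enclosing scope, so if any other header exposes a same-named type (for instance a top-level c10 from PyTorch headers), unqualified uses of ScalarType, DeviceType, and friends become ambiguous. The snippet below is a hypothetical, self-contained illustration of that failure mode using stand-in enums rather than the project's real headers; the fully qualified form at the end mirrors what the updated shim.h does.

#include <cstdint>

namespace c10 {                              // stand-in for a "real" top-level c10
enum class ScalarType : int8_t { Float, Int };
}
namespace executorch::backends::cuda::c10 {  // stand-in for the vendored copy
enum class ScalarType : int8_t { Float, Int };
}

using namespace c10;                         // imagine another header doing this
// using namespace executorch::backends::cuda::c10;  // the directive removed here;
//                                                   // with it, an unqualified
//                                                   // ScalarType below is ambiguous

int32_t dtype_of_float() {
  // Fully qualifying, as the updated shim.h does, compiles either way:
  return (int32_t)executorch::backends::cuda::c10::ScalarType::Float;
}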

backends/cuda/runtime/shims/aoti_torch/c/shim.h

Lines changed: 25 additions & 27 deletions
@@ -16,8 +16,6 @@
 
 using AtenTensorOpaque = executorch::backends::cuda::slim::SlimTensor;
 using AtenTensorHandle = executorch::backends::cuda::slim::SlimTensor *;
-using namespace executorch::backends::cuda::c10;
-
 
 // AOTIProxyExecutorHandle isn't supported in standalone mode.
 // Just defining it to void* to make the code compile
@@ -30,7 +28,7 @@ extern "C" {
 // DeviceType
 #define AOTI_TORCH_DEVICE_TYPE_IMPL(device_str, device_type) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_##device_str() { \
-    return (int32_t)DeviceType::device_type; \
+    return (int32_t) executorch::backends::cuda::c10::DeviceType::device_type; \
   }
 
 AOTI_TORCH_DEVICE_TYPE_IMPL(cpu, CPU)
@@ -42,7 +40,7 @@ AOTI_TORCH_DEVICE_TYPE_IMPL(xpu, XPU)
 // SclarType
 #define AOTI_TORCH_DTYPE_IMPL(dtype, stype) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_##dtype() { \
-    return (int32_t)ScalarType::stype; \
+    return (int32_t) executorch::backends::cuda::c10::ScalarType::stype; \
   }
 
 AOTI_TORCH_DTYPE_IMPL(float8_e5m2, Float8_e5m2)
@@ -69,7 +67,7 @@ AOTI_TORCH_DTYPE_IMPL(complex128, ComplexDouble)
 
 #define AOTI_TORCH_LAYOUT_IMPL(name, enum) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_layout_##name() { \
-    return (int32_t)Layout::enum; \
+    return (int32_t) executorch::backends::cuda::c10::Layout::enum; \
   }
 
 AOTI_TORCH_LAYOUT_IMPL(strided, Strided)
@@ -84,7 +82,7 @@ AOTI_TORCH_LAYOUT_IMPL(jagged, Jagged)
 
 #define AOTI_TORCH_MEMORY_FORMAT_IMPL(name, enum) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_memory_format_##name() { \
-    return (int32_t)MemoryFormat::enum; \
+    return (int32_t) executorch::backends::cuda::c10::MemoryFormat::enum; \
   }
 
 AOTI_TORCH_MEMORY_FORMAT_IMPL(contiguous_format, Contiguous)
@@ -112,8 +110,8 @@ AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(int16, int16_t)
 AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(int32, int32_t)
 AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(int64, int64_t)
 AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(bool, bool)
-AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex64, complex<float>)
-AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex128, complex<double>)
+AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex64, executorch::backends::cuda::c10::complex<float>)
+AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex128, executorch::backends::cuda::c10::complex<double>)
 #undef AOTI_TORCH_SCALAR_TO_TENSOR_IMPL
 
 AOTI_TORCH_EXPORT bool aoti_torch_grad_mode_is_enabled() { return false; }
@@ -201,13 +199,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
     const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype,
     int32_t device_type, int32_t device_index,
     AtenTensorHandle *ret_new_tensor) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   *ret_new_tensor =
       new executorch::backends::cuda::slim::SlimTensor(executorch::backends::cuda::slim::from_blob(
-          data, sizes, strides, static_cast<ScalarType>(dtype),
-          {static_cast<DeviceType>(device_type),
-           static_cast<DeviceIndex>(device_index)},
+          data, sizes, strides, static_cast<executorch::backends::cuda::c10::ScalarType>(dtype),
+          {static_cast<executorch::backends::cuda::c10::DeviceType>(device_type),
+           static_cast<executorch::backends::cuda::c10::DeviceIndex>(device_index)},
           storage_offset));
   return AOTI_TORCH_SUCCESS;
 }
@@ -218,13 +216,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     int32_t device_type, int32_t device_index, AtenTensorHandle *ret_new_tensor,
     int32_t layout, const uint8_t *opaque_metadata,
     int64_t opaque_metadata_size) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   *ret_new_tensor =
       new executorch::backends::cuda::slim::SlimTensor(executorch::backends::cuda::slim::from_blob(
-          data, sizes, strides, static_cast<ScalarType>(dtype),
-          {static_cast<DeviceType>(device_type),
-           static_cast<DeviceIndex>(device_index)},
+          data, sizes, strides, static_cast<executorch::backends::cuda::c10::ScalarType>(dtype),
+          {static_cast<executorch::backends::cuda::c10::DeviceType>(device_type),
+           static_cast<executorch::backends::cuda::c10::DeviceIndex>(device_index)},
          storage_offset));
   return AOTI_TORCH_SUCCESS;
 }
@@ -233,12 +231,12 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided(
     int64_t ndim, const int64_t *sizes_ptr, const int64_t *strides_ptr,
     int32_t dtype, int32_t device_type, int32_t device_index,
     AtenTensorHandle *ret_new_tensor) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   auto empty_strided = executorch::backends::cuda::slim::empty_strided(
-      sizes, strides, static_cast<ScalarType>(dtype),
-      {static_cast<DeviceType>(device_type),
-       static_cast<DeviceIndex>(device_index)});
+      sizes, strides, static_cast<executorch::backends::cuda::c10::ScalarType>(dtype),
+      {static_cast<executorch::backends::cuda::c10::DeviceType>(device_type),
+       static_cast<executorch::backends::cuda::c10::DeviceIndex>(device_index)});
   *ret_new_tensor =
       new executorch::backends::cuda::slim::SlimTensor(empty_strided);
   return AOTI_TORCH_SUCCESS;
@@ -248,8 +246,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
     AtenTensorHandle self, int64_t ndim, const int64_t *sizes_ptr,
     const int64_t *strides_ptr, int64_t offset_increment,
     AtenTensorHandle *ret_new_tensor) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   *ret_new_tensor = new executorch::backends::cuda::slim::SlimTensor(
       self->storage(), sizes, strides, self->dtype(),
       self->storage_offset() + offset_increment);
@@ -259,8 +257,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
 AOTI_TORCH_EXPORT AOTITorchError
 aoti_torch_as_strided(AtenTensorHandle self, const int64_t *sizes_ptr,
                       const int64_t *strides_ptr, AtenTensorHandle *ret) {
-  IntArrayRef sizes(sizes_ptr, self->dim());
-  IntArrayRef strides(strides_ptr, self->dim());
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, self->dim());
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, self->dim());
   *ret = new executorch::backends::cuda::slim::SlimTensor(
       self->storage(), sizes, strides, self->dtype(), self->storage_offset());
   return AOTI_TORCH_SUCCESS;
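Since the header no longer relies on any using-directive, each macro bakes the fully qualified name into its expansion, so it resolves identically at every expansion site. Below is a compilable sketch of the updated AOTI_TORCH_DTYPE_IMPL pattern; AOTI_TORCH_EXPORT is stubbed to nothing and the vendored ScalarType is reduced to a single enumerator so the snippet stands alone (both stubs are assumptions, not the real definitions).

#include <cstdint>

namespace executorch::backends::cuda::c10 {
enum class ScalarType : int8_t { Float8_e5m2 };  // stub for illustration only
}

#define AOTI_TORCH_EXPORT  // stubbed out; the real macro adds export/visibility attributes
#define AOTI_TORCH_DTYPE_IMPL(dtype, stype) \
  AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_##dtype() { \
    return (int32_t)executorch::backends::cuda::c10::ScalarType::stype; \
  }

AOTI_TORCH_DTYPE_IMPL(float8_e5m2, Float8_e5m2)
// expands to roughly:
//   int32_t aoti_torch_dtype_float8_e5m2() {
//     return (int32_t)executorch::backends::cuda::c10::ScalarType::Float8_e5m2;
//   }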
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <executorch/backends/cuda/runtime/shims/aoti_torch/c/macros.h>
+#include <executorch/backends/cuda/runtime/slim/cuda/int4mm.h>
+#include <executorch/backends/cuda/runtime/slim/cuda/int4mm.cuh>
+
+namespace executorch::backends::cuda::slim::cuda {
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+AOTITorchError aoti_torch_cuda__weight_int4pack_mm(
+    AOTITensorHandle self,
+    AOTITensorHandle mat2,
+    int64_t qGroupSize,
+    AOTITensorHandle qScaleAndZeros,
+    AOTITensorHandle* ret0) {
+  // Validate input parameters first
+  // Only check for null pointers here, as the actual validation of tensor
+  // properties is done in _weight_int4pack_mm_cuda
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     self != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: self tensor is null");
+
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     mat2 != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: mat2 tensor is null");
+
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     qScaleAndZeros != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: qScaleAndZeros tensor is
+  //     null");
+
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     ret0 != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: ret0 is null");
+
+  *ret0 = _weight_int4pack_mm_cuda(*self, *mat2, qGroupSize, *qScaleAndZeros);
+  // ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR();
+  // return Error::Ok;
+  return AOTI_TORCH_SUCCESS;
+}
+
+#ifdef __cplusplus
+}
+#endif
+} // namespace executorch::backends::cuda::slim::cuda
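For context, a hypothetical caller-side sketch of the new entry point (not part of this commit). It assumes that AOTITensorHandle, AOTITorchError, AOTI_TORCH_SUCCESS, and the declaration of aoti_torch_cuda__weight_int4pack_mm are all made visible by the shim headers included below; only the call and return convention shown in the diff is exercised.

// Hypothetical usage sketch; the assumption is that these headers declare the
// handle types, the error codes, and the entry point used here.
#include <cstdint>
#include <executorch/backends/cuda/runtime/shims/aoti_torch/c/macros.h>
#include <executorch/backends/cuda/runtime/slim/cuda/int4mm.h>

AOTITorchError run_int4_mm(
    AOTITensorHandle activations,       // e.g. obtained earlier via aoti_torch_empty_strided
    AOTITensorHandle packed_weight,     // int4-packed weight tensor
    AOTITensorHandle scales_and_zeros,  // per-group quantization parameters
    int64_t group_size,
    AOTITensorHandle* out) {
  // The result tensor is returned through the out-parameter; any value other
  // than AOTI_TORCH_SUCCESS is propagated to the caller as a failure.
  return aoti_torch_cuda__weight_int4pack_mm(
      activations, packed_weight, group_size, scales_and_zeros, out);
}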
