
Commit 8e80529

solve namespace conflict
1 parent ba170ae commit 8e80529

File tree: 10 files changed, +1597 −253 lines changed

backends/cuda/CMakeLists.txt

Lines changed: 8 additions & 40 deletions
@@ -16,6 +16,9 @@
 #
 cmake_minimum_required(VERSION 3.29)
 
+# Enable CUDA language support
+enable_language(CUDA)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CUDA_STANDARD 17)
@@ -30,56 +33,21 @@ endif()
 
 find_package(CUDAToolkit REQUIRED)
 
-# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
+# Use ExecuTorch's standard way to find PyTorch libraries for AOTI
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch_headers()
 
-# CUDA tensor maker for backends that support incontiguous tensors
-set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
-add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
-target_include_directories(
-  cuda_tensor_maker
-  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
-         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
-)
-target_compile_options(
-  cuda_tensor_maker
-  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
-         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
-)
-# Ensure symbols are exported properly
-if(APPLE)
-  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
-else()
-  target_link_options(
-    cuda_tensor_maker PUBLIC
-    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
-  )
-endif()
-
-# Link against ExecuTorch core libraries
-target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
-executorch_target_link_options_shared_lib(cuda_tensor_maker)
-
-install(
-  TARGETS cuda_tensor_maker
-  EXPORT ExecuTorchTargets
-  DESTINATION lib
-)
-
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
   runtime/cuda_backend.cpp
   runtime/platform/platform.cpp
+  # runtime/slim/cuda/int4mm.cu
 )
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(
   aoti_cuda
-  PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
-         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+  PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
          $<INSTALL_INTERFACE:include>
-         # PyTorch AOTI headers from ExecutorTorch's torch detection
-         ${TORCH_INCLUDE_DIRS}
 )
 target_compile_options(
   aoti_cuda PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
@@ -96,9 +64,9 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, cuda_tensor_maker, and PyTorch CUDA libraries
+# Link against CUDA::cudart and PyTorch CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC executorch cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC executorch CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)

backends/cuda/runtime/shims/aoti_torch/c/macros.h

Lines changed: 0 additions & 1 deletion
@@ -36,4 +36,3 @@ using AOTITorchError = int32_t;
 // work without any change, e.g. c10::DeviceType::CUDA will actually refer to
 // executorch::backends::cuda::c10::DeviceType::CUDA
 using namespace executorch::backends::cuda;
-using namespace executorch::backends::cuda::c10;
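Dropping this using-directive looks like the core of the "solve namespace conflict" fix: a blanket directive for the vendored c10 makes every name it contains visible at the enclosing scope, so if any other header exposes a same-named type (for instance a top-level c10 from PyTorch headers), unqualified uses of ScalarType, DeviceType, and friends become ambiguous. The snippet below is a hypothetical, self-contained illustration of that failure mode using stand-in enums rather than the project's real headers; the fully qualified form at the end mirrors what the updated shim.h does.

#include <cstdint>

namespace c10 {                              // stand-in for a "real" top-level c10
enum class ScalarType : int8_t { Float, Int };
}
namespace executorch::backends::cuda::c10 {  // stand-in for the vendored copy
enum class ScalarType : int8_t { Float, Int };
}

using namespace c10;                         // imagine another header doing this
// using namespace executorch::backends::cuda::c10;  // the directive removed here;
//                                                   // with it, an unqualified
//                                                   // ScalarType below is ambiguous

int32_t dtype_of_float() {
  // Fully qualifying, as the updated shim.h does, compiles either way:
  return (int32_t)executorch::backends::cuda::c10::ScalarType::Float;
}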

backends/cuda/runtime/shims/aoti_torch/c/shim.h

Lines changed: 25 additions & 27 deletions
@@ -16,8 +16,6 @@
 
 using AtenTensorOpaque = executorch::backends::cuda::slim::SlimTensor;
 using AtenTensorHandle = executorch::backends::cuda::slim::SlimTensor *;
-using namespace executorch::backends::cuda::c10;
-
 
 // AOTIProxyExecutorHandle isn't supported in standalone mode.
 // Just defining it to void* to make the code compile
@@ -30,7 +28,7 @@ extern "C" {
 // DeviceType
 #define AOTI_TORCH_DEVICE_TYPE_IMPL(device_str, device_type) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_##device_str() { \
-    return (int32_t)DeviceType::device_type; \
+    return (int32_t) executorch::backends::cuda::c10::DeviceType::device_type; \
   }
 
 AOTI_TORCH_DEVICE_TYPE_IMPL(cpu, CPU)
@@ -42,7 +40,7 @@ AOTI_TORCH_DEVICE_TYPE_IMPL(xpu, XPU)
 // SclarType
 #define AOTI_TORCH_DTYPE_IMPL(dtype, stype) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_##dtype() { \
-    return (int32_t)ScalarType::stype; \
+    return (int32_t) executorch::backends::cuda::c10::ScalarType::stype; \
   }
 
 AOTI_TORCH_DTYPE_IMPL(float8_e5m2, Float8_e5m2)
@@ -69,7 +67,7 @@ AOTI_TORCH_DTYPE_IMPL(complex128, ComplexDouble)
 
 #define AOTI_TORCH_LAYOUT_IMPL(name, enum) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_layout_##name() { \
-    return (int32_t)Layout::enum; \
+    return (int32_t) executorch::backends::cuda::c10::Layout::enum; \
   }
 
 AOTI_TORCH_LAYOUT_IMPL(strided, Strided)
@@ -84,7 +82,7 @@ AOTI_TORCH_LAYOUT_IMPL(jagged, Jagged)
 
 #define AOTI_TORCH_MEMORY_FORMAT_IMPL(name, enum) \
   AOTI_TORCH_EXPORT int32_t aoti_torch_memory_format_##name() { \
-    return (int32_t)MemoryFormat::enum; \
+    return (int32_t) executorch::backends::cuda::c10::MemoryFormat::enum; \
   }
 
 AOTI_TORCH_MEMORY_FORMAT_IMPL(contiguous_format, Contiguous)
@@ -112,8 +110,8 @@ AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(int16, int16_t)
 AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(int32, int32_t)
 AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(int64, int64_t)
 AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(bool, bool)
-AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex64, complex<float>)
-AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex128, complex<double>)
+AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex64, executorch::backends::cuda::c10::complex<float>)
+AOTI_TORCH_SCALAR_TO_TENSOR_IMPL(complex128, executorch::backends::cuda::c10::complex<double>)
 #undef AOTI_TORCH_SCALAR_TO_TENSOR_IMPL
 
 AOTI_TORCH_EXPORT bool aoti_torch_grad_mode_is_enabled() { return false; }
@@ -201,13 +199,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
     const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype,
     int32_t device_type, int32_t device_index,
     AtenTensorHandle *ret_new_tensor) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   *ret_new_tensor =
       new executorch::backends::cuda::slim::SlimTensor(executorch::backends::cuda::slim::from_blob(
-          data, sizes, strides, static_cast<ScalarType>(dtype),
-          {static_cast<DeviceType>(device_type),
-           static_cast<DeviceIndex>(device_index)},
+          data, sizes, strides, static_cast<executorch::backends::cuda::c10::ScalarType>(dtype),
+          {static_cast<executorch::backends::cuda::c10::DeviceType>(device_type),
+           static_cast<executorch::backends::cuda::c10::DeviceIndex>(device_index)},
           storage_offset));
   return AOTI_TORCH_SUCCESS;
 }
@@ -218,13 +216,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     int32_t device_type, int32_t device_index, AtenTensorHandle *ret_new_tensor,
     int32_t layout, const uint8_t *opaque_metadata,
     int64_t opaque_metadata_size) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   *ret_new_tensor =
       new executorch::backends::cuda::slim::SlimTensor(executorch::backends::cuda::slim::from_blob(
-          data, sizes, strides, static_cast<ScalarType>(dtype),
-          {static_cast<DeviceType>(device_type),
-           static_cast<DeviceIndex>(device_index)},
+          data, sizes, strides, static_cast<executorch::backends::cuda::c10::ScalarType>(dtype),
+          {static_cast<executorch::backends::cuda::c10::DeviceType>(device_type),
+           static_cast<executorch::backends::cuda::c10::DeviceIndex>(device_index)},
          storage_offset));
   return AOTI_TORCH_SUCCESS;
 }
@@ -233,12 +231,12 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided(
     int64_t ndim, const int64_t *sizes_ptr, const int64_t *strides_ptr,
     int32_t dtype, int32_t device_type, int32_t device_index,
     AtenTensorHandle *ret_new_tensor) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   auto empty_strided = executorch::backends::cuda::slim::empty_strided(
-      sizes, strides, static_cast<ScalarType>(dtype),
-      {static_cast<DeviceType>(device_type),
-       static_cast<DeviceIndex>(device_index)});
+      sizes, strides, static_cast<executorch::backends::cuda::c10::ScalarType>(dtype),
+      {static_cast<executorch::backends::cuda::c10::DeviceType>(device_type),
+       static_cast<executorch::backends::cuda::c10::DeviceIndex>(device_index)});
   *ret_new_tensor =
       new executorch::backends::cuda::slim::SlimTensor(empty_strided);
   return AOTI_TORCH_SUCCESS;
@@ -248,8 +246,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
     AtenTensorHandle self, int64_t ndim, const int64_t *sizes_ptr,
     const int64_t *strides_ptr, int64_t offset_increment,
     AtenTensorHandle *ret_new_tensor) {
-  IntArrayRef sizes(sizes_ptr, ndim);
-  IntArrayRef strides(strides_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, ndim);
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, ndim);
   *ret_new_tensor = new executorch::backends::cuda::slim::SlimTensor(
       self->storage(), sizes, strides, self->dtype(),
       self->storage_offset() + offset_increment);
@@ -259,8 +257,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
 AOTI_TORCH_EXPORT AOTITorchError
 aoti_torch_as_strided(AtenTensorHandle self, const int64_t *sizes_ptr,
                       const int64_t *strides_ptr, AtenTensorHandle *ret) {
-  IntArrayRef sizes(sizes_ptr, self->dim());
-  IntArrayRef strides(strides_ptr, self->dim());
+  executorch::backends::cuda::c10::IntArrayRef sizes(sizes_ptr, self->dim());
+  executorch::backends::cuda::c10::IntArrayRef strides(strides_ptr, self->dim());
   *ret = new executorch::backends::cuda::slim::SlimTensor(
       self->storage(), sizes, strides, self->dtype(), self->storage_offset());
   return AOTI_TORCH_SUCCESS;
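Since the header no longer relies on any using-directive, each macro bakes the fully qualified name into its expansion, so it resolves identically at every expansion site. Below is a compilable sketch of the updated AOTI_TORCH_DTYPE_IMPL pattern; AOTI_TORCH_EXPORT is stubbed to nothing and the vendored ScalarType is reduced to a single enumerator so the snippet stands alone (both stubs are assumptions, not the real definitions).

#include <cstdint>

namespace executorch::backends::cuda::c10 {
enum class ScalarType : int8_t { Float8_e5m2 };  // stub for illustration only
}

#define AOTI_TORCH_EXPORT  // stubbed out; the real macro adds export/visibility attributes
#define AOTI_TORCH_DTYPE_IMPL(dtype, stype) \
  AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_##dtype() { \
    return (int32_t)executorch::backends::cuda::c10::ScalarType::stype; \
  }

AOTI_TORCH_DTYPE_IMPL(float8_e5m2, Float8_e5m2)
// expands to roughly:
//   int32_t aoti_torch_dtype_float8_e5m2() {
//     return (int32_t)executorch::backends::cuda::c10::ScalarType::Float8_e5m2;
//   }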
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <executorch/backends/cuda/runtime/shims/aoti_torch/c/macros.h>
+#include <executorch/backends/cuda/runtime/slim/cuda/int4mm.h>
+#include <executorch/backends/cuda/runtime/slim/cuda/int4mm.cuh>
+
+namespace executorch::backends::cuda::slim::cuda {
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+AOTITorchError aoti_torch_cuda__weight_int4pack_mm(
+    AOTITensorHandle self,
+    AOTITensorHandle mat2,
+    int64_t qGroupSize,
+    AOTITensorHandle qScaleAndZeros,
+    AOTITensorHandle* ret0) {
+  // Validate input parameters first
+  // Only check for null pointers here, as the actual validation of tensor
+  // properties is done in _weight_int4pack_mm_cuda
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     self != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: self tensor is null");
+
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     mat2 != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: mat2 tensor is null");
+
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     qScaleAndZeros != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: qScaleAndZeros tensor is
+  //     null");
+
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     ret0 != nullptr,
+  //     InvalidArgument,
+  //     "aoti_torch_cuda__weight_int4pack_mm failed: ret0 is null");
+
+  *ret0 = _weight_int4pack_mm_cuda(*self, *mat2, qGroupSize, *qScaleAndZeros);
+  // ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR();
+  // return Error::Ok;
+  return AOTI_TORCH_SUCCESS;
+}
+
+#ifdef __cplusplus
+}
+#endif
+} // namespace executorch::backends::cuda::slim::cuda
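For context, a hypothetical caller-side sketch of the new entry point (not part of this commit). It assumes that AOTITensorHandle, AOTITorchError, AOTI_TORCH_SUCCESS, and the declaration of aoti_torch_cuda__weight_int4pack_mm are all made visible by the shim headers included below; only the call and return convention shown in the diff is exercised.

// Hypothetical usage sketch; the assumption is that these headers declare the
// handle types, the error codes, and the entry point used here.
#include <cstdint>
#include <executorch/backends/cuda/runtime/shims/aoti_torch/c/macros.h>
#include <executorch/backends/cuda/runtime/slim/cuda/int4mm.h>

AOTITorchError run_int4_mm(
    AOTITensorHandle activations,       // e.g. obtained earlier via aoti_torch_empty_strided
    AOTITensorHandle packed_weight,     // int4-packed weight tensor
    AOTITensorHandle scales_and_zeros,  // per-group quantization parameters
    int64_t group_size,
    AOTITensorHandle* out) {
  // The result tensor is returned through the out-parameter; any value other
  // than AOTI_TORCH_SUCCESS is propagated to the caller as a failure.
  return aoti_torch_cuda__weight_int4pack_mm(
      activations, packed_weight, group_size, scales_and_zeros, out);
}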
