Skip to content

Commit d7c581e

Browse files
committed
[slimtensor] Add CUDA Storage with DeviceTraits and memory allocation
This diff adds CUDA storage infrastructure to SlimTensor, enabling GPU memory
allocation and management.

**Key changes:**

1. **`cuda/Guard.h`** — CUDAGuard RAII class:
   - Saves the current CUDA device on construction, restores it on destruction
   - Exception-safe device context switching
   - Constructors accept a device index or a Device object

2. **`core/Storage.h`** — extended for CUDA support:
   - Added `DeviceTraits<DeviceType::CUDA>` specialization with:
     - `allocate()` — uses cudaMalloc with CUDAGuard for device selection
     - `free()` — uses cudaFree with a warning on error
     - `memcpy()` — supports Host↔Device and Device↔Device copies
   - Added `DEFAULT_CUDA_DEVICE` constant
   - Updated `MaybeOwningStorage` constructor to handle CUDA devices
   - Stub implementation when `CUDA_AVAILABLE` is not defined (throws an error)

Differential Revision: [D89826553](https://our.internmc.facebook.com/intern/diff/D89826553/)

[ghstack-poisoned]
1 parent 3bdd8a8 commit d7c581e

File tree

10 files changed

+575
-78
lines changed

10 files changed

+575
-78
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#ifdef CUDA_AVAILABLE

#include <cuda.h>
#include <cuda_runtime.h>

#include <executorch/backends/aoti/slim/c10/macros/Macros.h>
#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/log.h>

/// Checks a CUDA expression and aborts on error.
/// @param EXPR The CUDA expression to check.
//
// NOTE: the macro-local variable deliberately avoids a leading double
// underscore — identifiers containing `__` are reserved to the
// implementation ([lex.name]). EXPR is parenthesized so comma expressions
// and low-precedence operators expand safely.
#define ET_CUDA_CHECK(EXPR)                                       \
  do {                                                            \
    const cudaError_t et_cuda_check_err = (EXPR);                 \
    ET_CHECK_MSG(                                                 \
        et_cuda_check_err == cudaSuccess,                         \
        "CUDA error: %s",                                         \
        cudaGetErrorString(et_cuda_check_err));                   \
  } while (0)

/// Checks a CUDA expression and logs a warning on error (non-fatal).
/// @param EXPR The CUDA expression to check.
#define ET_CUDA_LOG_WARN(EXPR)                                    \
  do {                                                            \
    const cudaError_t et_cuda_warn_err = (EXPR);                  \
    if (SLIMTENSOR_UNLIKELY(et_cuda_warn_err != cudaSuccess)) {   \
      /* cudaGetLastError() clears CUDA's sticky error state so   \
       * subsequent runtime calls are not poisoned. */            \
      [[maybe_unused]] const cudaError_t et_cuda_cleared =        \
          cudaGetLastError();                                     \
      ET_LOG(                                                     \
          Error,                                                  \
          "CUDA warning: %s",                                     \
          cudaGetErrorString(et_cuda_warn_err));                  \
    }                                                             \
  } while (0)

#endif // CUDA_AVAILABLE
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
# BUCK file for the SlimTensor CUDA exception-handling module.
# All target definitions live in the adjacent targets.bzl; this file only
# wires them in, keeping the build entry point minimal.
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
2+
3+
def define_common_targets():
    """Define targets for SlimTensor CUDA exception handling module."""

    # Header-only library exposing the ET_CUDA_CHECK / ET_CUDA_LOG_WARN
    # macros; consumers inherit the macro and platform deps transitively.
    runtime.cxx_library(
        name = "exception",
        exported_headers = ["Exception.h"],
        exported_deps = [
            "//executorch/backends/aoti/slim/c10/macros:macros",
            "//executorch/runtime/platform:platform",
        ],
        visibility = ["@EXECUTORCH_CLIENTS"],
    )

backends/aoti/slim/core/Storage.h

Lines changed: 107 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,18 @@
1010

1111
#include <cstring>
1212

13+
#ifdef CUDA_AVAILABLE
14+
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
15+
#include <executorch/backends/aoti/slim/cuda/Guard.h>
16+
#endif
17+
1318
#include <executorch/backends/aoti/slim/c10/core/Device.h>
1419
#include <executorch/backends/aoti/slim/c10/core/ScalarType.h>
1520
#include <executorch/backends/aoti/slim/util/ArrayRefUtil.h>
1621
#include <executorch/backends/aoti/slim/util/SharedPtr.h>
1722
#include <executorch/backends/aoti/slim/util/SizeUtil.h>
1823
#include <executorch/runtime/platform/assert.h>
24+
#include <executorch/runtime/platform/log.h>
1925

2026
namespace executorch::backends::aoti::slim {
2127

@@ -30,6 +36,10 @@ inline void noop(void*) {}
3036
/// Default CPU device constant.
3137
inline const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
3238

39+
/// Default CUDA device constant.
40+
inline const c10::Device DEFAULT_CUDA_DEVICE =
41+
c10::Device(c10::DeviceType::CUDA, 0);
42+
3343
/// DeviceTraits template for device-specific operations.
3444
/// Device-specific implementations provide allocate(), free(), and memcpy().
3545
template <c10::DeviceType D>
@@ -74,6 +84,93 @@ struct DeviceTraits<c10::DeviceType::CPU> {
7484
}
7585
};
7686

87+
#ifdef CUDA_AVAILABLE
88+
/// CUDA specialization of DeviceTraits.
89+
/// Provides CUDA memory allocation and copy operations using
90+
/// cudaMalloc/cudaFree.
91+
template <>
92+
struct DeviceTraits<c10::DeviceType::CUDA> {
93+
/// Allocates CUDA device memory.
94+
/// @param nbytes Number of bytes to allocate.
95+
/// @param device The target CUDA device.
96+
/// @return Pointer to allocated device memory.
97+
static void* allocate(size_t nbytes, const c10::Device& device) {
98+
cuda::CUDAGuard guard(device);
99+
void* data = nullptr;
100+
ET_CUDA_CHECK(cudaMalloc(&data, nbytes));
101+
return data;
102+
}
103+
104+
/// Frees CUDA device memory.
105+
/// @param ptr Pointer to device memory to free.
106+
static void free(void* ptr) {
107+
ET_CUDA_LOG_WARN(cudaFree(ptr));
108+
}
109+
110+
/// Copies memory between CPU and CUDA or CUDA and CUDA.
111+
/// @param dst Destination pointer.
112+
/// @param src Source pointer.
113+
/// @param nbytes Number of bytes to copy.
114+
/// @param dst_device Destination device.
115+
/// @param src_device Source device.
116+
static void memcpy(
117+
void* dst,
118+
const void* src,
119+
size_t nbytes,
120+
const c10::Device& dst_device,
121+
const c10::Device& src_device) {
122+
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
123+
c10::Device cuda_device = dst_device;
124+
125+
if (src_device.is_cpu()) {
126+
direction = cudaMemcpyHostToDevice;
127+
} else if (dst_device.is_cpu()) {
128+
direction = cudaMemcpyDeviceToHost;
129+
cuda_device = src_device;
130+
} else {
131+
ET_CHECK_MSG(
132+
src_device.index() == dst_device.index(),
133+
"CUDA memcpy across different device indices not supported: %d != %d",
134+
static_cast<int>(src_device.index()),
135+
static_cast<int>(dst_device.index()));
136+
}
137+
138+
cuda::CUDAGuard guard(cuda_device);
139+
ET_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, direction));
140+
}
141+
};
142+
#else
143+
/// CUDA stub when CUDA_AVAILABLE is not defined.
144+
/// All operations abort with an error message.
145+
template <>
146+
struct DeviceTraits<c10::DeviceType::CUDA> {
147+
static void* allocate(size_t nbytes, const c10::Device& device) {
148+
(void)nbytes;
149+
(void)device;
150+
ET_CHECK_MSG(false, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
151+
}
152+
153+
static void free(void* ptr) {
154+
(void)ptr;
155+
ET_LOG(Error, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
156+
}
157+
158+
static void memcpy(
159+
void* dst,
160+
const void* src,
161+
size_t nbytes,
162+
const c10::Device& dst_device,
163+
const c10::Device& src_device) {
164+
(void)dst;
165+
(void)src;
166+
(void)nbytes;
167+
(void)dst_device;
168+
(void)src_device;
169+
ET_CHECK_MSG(false, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
170+
}
171+
};
172+
#endif // CUDA_AVAILABLE
173+
77174
/**
78175
* MaybeOwningStorage - A storage class that manages tensor data memory.
79176
*
@@ -93,17 +190,19 @@ struct DeviceTraits<c10::DeviceType::CPU> {
93190
class MaybeOwningStorage {
94191
public:
95192
/// Constructs owning storage with allocated memory.
96-
/// @param device The device for storage (must be CPU).
193+
/// @param device The device for storage (CPU or CUDA).
97194
/// @param nbytes Number of bytes to allocate.
98195
MaybeOwningStorage(const c10::Device& device, size_t nbytes)
99196
: device_(device), capacity_(nbytes), is_owning_(true) {
100-
ET_CHECK_MSG(
101-
device.is_cpu(),
102-
"Only CPU device is currently supported, got: %s",
103-
device.str().c_str());
104-
105-
data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
106-
deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
197+
if (device.is_cpu()) {
198+
data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
199+
deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
200+
} else if (device.is_cuda()) {
201+
data_ = DeviceTraits<c10::DeviceType::CUDA>::allocate(nbytes, device);
202+
deleter_ = DeviceTraits<c10::DeviceType::CUDA>::free;
203+
} else {
204+
ET_CHECK_MSG(false, "Unsupported device type: %s", device.str().c_str());
205+
}
107206
}
108207

109208
/// Default constructor is deleted - storage must have a device.

backends/aoti/slim/core/targets.bzl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ def define_common_targets():
1717
"//executorch/backends/aoti/slim/util:shared_ptr",
1818
"//executorch/backends/aoti/slim/util:size_util",
1919
"//executorch/runtime/platform:platform",
20+
"//executorch/backends/aoti/slim/c10/cuda:exception",
21+
"//executorch/backends/aoti/slim/cuda:guard",
2022
],
2123
)
2224

23-
# Header-only library for SlimTensor
25+
# Header-only library for SlimTensor (CPU-only for now)
2426
runtime.cxx_library(
2527
name = "slimtensor",
2628
headers = [

backends/aoti/slim/core/test/targets.bzl

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,36 @@
1+
load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
12
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
23

4+
def get_backend_mode():
    """Return the list of backend modes slimtensor tests are built for.

    "cuda" builds the test with -DCUDA_AVAILABLE=1 and GPU remote
    execution; "cpu" builds the plain host-only variant.
    """
    return ["cuda", "cpu"]
7+
38
def define_common_targets():
49
"""Define test targets for SlimTensor core module."""
510

6-
runtime.cxx_test(
7-
name = "test_storage",
8-
srcs = [
9-
"test_storage.cpp",
10-
],
11-
deps = [
12-
"//executorch/backends/aoti/slim/core:storage",
13-
],
14-
)
11+
# GPU storage test with CUDA support
12+
for backend_mode in get_backend_mode():
13+
backend_suffix = "_" + backend_mode if backend_mode == "cuda" else ""
14+
15+
backend_kwargs = {
16+
"external_deps": [("cuda", None, "cuda-lazy")],
17+
"preprocessor_flags": ["-DCUDA_AVAILABLE=1"],
18+
"keep_gpu_sections": True,
19+
"remote_execution": re_test_utils.remote_execution(
20+
platform = "gpu-remote-execution",
21+
),
22+
} if backend_mode == "cuda" else {}
23+
24+
runtime.cxx_test(
25+
name = "test_storage" + backend_suffix,
26+
srcs = [
27+
"test_storage.cpp",
28+
],
29+
deps = [
30+
"//executorch/backends/aoti/slim/core:storage",
31+
],
32+
**backend_kwargs
33+
)
1534

1635
runtime.cxx_test(
1736
name = "test_slimtensor_basic",

0 commit comments

Comments
 (0)