Skip to content

Commit 7dd5801

Browse files
authored
Merge branch 'main' into new-audio-api
2 parents 8a0b403 + caa35f6 commit 7dd5801

File tree

6 files changed

+195
-30
lines changed

6 files changed

+195
-30
lines changed

backends/aoti/utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <executorch/runtime/platform/log.h>
1616
#include <cstddef>
1717
#include <cstdint>
18+
#include <vector>
1819

1920
namespace executorch {
2021
namespace backends {

backends/cuda/CMakeLists.txt

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,13 @@ find_package_torch()
3636

3737
# CUDA-specific AOTI functionality
3838
set(_aoti_cuda_sources
39-
runtime/cuda_backend.cpp runtime/shims/memory.cpp
40-
runtime/shims/tensor_attribute.cpp runtime/guard.cpp
41-
runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
39+
runtime/cuda_backend.cpp
40+
runtime/shims/memory.cpp
41+
runtime/shims/tensor_attribute.cpp
42+
runtime/guard.cpp
43+
runtime/shims/cuda_guard.cpp
44+
runtime/shims/int4mm.cu
45+
runtime/platform/platform.cpp
4246
)
4347
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
4448
target_include_directories(

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,10 @@
77
*/
88

99
#include <cuda_runtime.h>
10-
#include <dlfcn.h>
1110
#include <executorch/runtime/backend/interface.h>
1211
#include <executorch/runtime/core/error.h>
1312
#include <executorch/runtime/core/evalue.h>
1413
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
15-
#include <unistd.h>
1614
#include <cstdio>
1715

1816
#include <filesystem>
@@ -23,16 +21,19 @@
2321
// Include our shim layer headers
2422
#include <executorch/backends/aoti/aoti_delegate_handle.h>
2523
#include <executorch/backends/aoti/common_shims.h>
24+
#include <executorch/backends/cuda/runtime/platform/platform.h>
2625
#include <executorch/backends/cuda/runtime/shims/memory.h>
2726
#include <executorch/backends/cuda/runtime/utils.h>
2827

2928
namespace executorch::backends::cuda {
3029

31-
#define LOAD_SYMBOL(handle, member, name, so_handle) \
32-
do { \
33-
handle->member = reinterpret_cast<name##Func>(dlsym(so_handle, #name)); \
34-
ET_CHECK_OR_RETURN_ERROR( \
35-
handle->member != nullptr, AccessFailed, "Failed to load " #name); \
30+
#define LOAD_SYMBOL(handle, member, name, so_handle) \
31+
do { \
32+
auto symbol_res = get_function(so_handle, #name); \
33+
if (!symbol_res.ok()) { \
34+
return symbol_res.error(); \
35+
} \
36+
handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
3637
} while (0)
3738

3839
using namespace std;
@@ -122,10 +123,10 @@ class ET_EXPERIMENTAL CudaBackend final
122123
// Generate dynamic temporary file path
123124
filesystem::path temp_dir = filesystem::temp_directory_path();
124125
filesystem::path so_path =
125-
temp_dir / (so_blob_key + to_string(getpid()) + ".so");
126+
temp_dir / (so_blob_key + to_string(get_process_id()) + ".so");
126127

127128
// Create a temporary file
128-
ofstream outfile(so_path.c_str(), ios::binary);
129+
ofstream outfile(so_path, ios::binary);
129130

130131
// Write the ELF buffer to the temporary file
131132
ET_LOG(
@@ -144,24 +145,23 @@ class ET_EXPERIMENTAL CudaBackend final
144145
// Finish writing the file to disk
145146
outfile.close();
146147

147-
// Load the ELF using dlopen
148-
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
149-
ET_CHECK_OR_RETURN_ERROR(
150-
so_handle != nullptr,
151-
AccessFailed,
152-
"Failed to load shared library: %s",
153-
dlerror());
148+
// Load the lib
149+
Result<void*> lib_handle_res = load_library(so_path);
150+
if (!lib_handle_res.ok()) {
151+
return lib_handle_res.error();
152+
}
153+
void* lib_handle = lib_handle_res.get();
154154

155155
processed->Free();
156156

157157
// Create handle and load function pointers into it
158158
AOTIDelegateHandle* handle = new AOTIDelegateHandle();
159-
handle->so_handle = so_handle;
159+
handle->so_handle = lib_handle;
160160
handle->so_path = so_path.string();
161161

162162
// Load function pointers specific to this handle's shared library
163163
ET_CHECK_OK_OR_RETURN_ERROR(
164-
load_function_pointers_into_handle(so_handle, handle));
164+
load_function_pointers_into_handle(lib_handle, handle));
165165

166166
AOTInductorModelContainerHandle container_handle = nullptr;
167167

@@ -332,8 +332,9 @@ class ET_EXPERIMENTAL CudaBackend final
332332
// AOTInductorModelContainerDelete(handle->container_handle);
333333

334334
// Now close the shared library
335+
auto err = Error::Ok;
335336
if (handle->so_handle != nullptr) {
336-
dlclose(handle->so_handle);
337+
err = close_library(handle->so_handle);
337338
}
338339

339340
// Remove the temporary shared library file
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
2+
/*
3+
* Copyright (c) Meta Platforms, Inc. and affiliates.
4+
* All rights reserved.
5+
*
6+
* This source code is licensed under the BSD-style license found in the
7+
* LICENSE file in the root directory of this source tree.
8+
*/
9+
10+
#include <executorch/backends/cuda/runtime/platform/platform.h>
11+
#include <executorch/runtime/core/error.h>
12+
#include <executorch/runtime/core/result.h>
13+
#include <filesystem>
14+
#include <string>
15+
16+
#ifdef _WIN32
17+
#include <malloc.h>
18+
#include <windows.h>
19+
#else // Posix
20+
#include <dlfcn.h>
21+
#include <unistd.h>
22+
#include <cstdlib>
23+
#endif
24+
25+
namespace executorch {
26+
namespace backends {
27+
namespace cuda {
28+
29+
// Loads the shared library at `path` and returns an opaque handle to it,
// or Error::AccessFailed if the library cannot be loaded. Release the
// handle with close_library().
executorch::runtime::Result<void*> load_library(
    const std::filesystem::path& path) {
#ifdef _WIN32
  // Use the explicit wide-character API: std::filesystem::path is natively
  // wchar_t on Windows, so path.c_str() is already a wide string. This
  // avoids (a) the LoadLibrary macro resolving to LoadLibraryA or
  // LoadLibraryW depending on the UNICODE build flag, (b) lossy ANSI
  // codepage conversion for non-ASCII paths, and (c) path.u8string()
  // returning std::u8string under C++20, which does not convert to
  // std::string.
  HMODULE lib_handle = LoadLibraryW(path.c_str());
  if (lib_handle == NULL) {
    ET_LOG(
        Error,
        "Failed to load %s with error: %lu",
        path.string().c_str(),
        GetLastError());
    return executorch::runtime::Error::AccessFailed;
  }
#else
  std::string path_str = path.string();
  void* lib_handle = dlopen(path_str.c_str(), RTLD_LAZY | RTLD_LOCAL);
  if (lib_handle == nullptr) {
    ET_LOG(
        Error, "Failed to load %s with error: %s", path_str.c_str(), dlerror());
    return executorch::runtime::Error::AccessFailed;
  }
#endif
  return (void*)lib_handle;
}
54+
55+
// Closes a library handle returned by load_library(). Returns
// Error::Internal if the platform unload call fails.
executorch::runtime::Error close_library(void* lib_handle) {
#ifdef _WIN32
  if (!FreeLibrary((HMODULE)lib_handle)) {
    // Report through ET_LOG like every other error path in this file;
    // the previous printf wrote to stdout and bypassed the runtime logger.
    ET_LOG(Error, "FreeLibrary failed with error %lu", GetLastError());
    return executorch::runtime::Error::Internal;
  }
#else
  if (dlclose(lib_handle) != 0) {
    // ET_LOG terminates the line itself, so no trailing '\n' in the format.
    ET_LOG(Error, "dlclose failed: %s", dlerror());
    return executorch::runtime::Error::Internal;
  }
#endif
  return executorch::runtime::Error::Ok;
}
69+
70+
// Looks up symbol `fn_name` in `lib_handle` (GetProcAddress on Windows,
// dlsym on POSIX). Returns the symbol's address as void*, or
// Error::Internal if the symbol is not found.
executorch::runtime::Result<void*> get_function(
    void* lib_handle,
    const std::string& fn_name) {
#ifdef _WIN32
  FARPROC fn = GetProcAddress((HMODULE)lib_handle, fn_name.c_str());
  if (fn == NULL) {
    ET_LOG(
        Error,
        "Failed loading symbol %s with error %lu",
        fn_name.c_str(),
        GetLastError());
    return executorch::runtime::Error::Internal;
  }
  // NOTE(review): converting a function pointer to void* is not sanctioned
  // by the C++ standard, but it is guaranteed by the Win32 and POSIX ABIs.
  // Callers cast back to the proper function-pointer type before invoking.
  return reinterpret_cast<void*>(fn);
#else
  void* fn = dlsym(lib_handle, fn_name.c_str());
  if (fn == nullptr) {
    ET_LOG(
        Error,
        "Failed loading symbol %s with error %s",
        fn_name.c_str(),
        dlerror());
    return executorch::runtime::Error::Internal;
  }
  return fn;
#endif
}
98+
99+
// Returns the id of the calling process in a platform-independent way
// (GetCurrentProcessId on Windows, getpid on POSIX).
int32_t get_process_id() {
#ifdef _WIN32
  const auto pid = GetCurrentProcessId();
#else
  const auto pid = getpid();
#endif
  return static_cast<int32_t>(pid);
}
106+
107+
void* aligned_alloc(size_t alignment, size_t size) {
108+
#ifdef _WIN32
109+
return _aligned_malloc(size, alignment);
110+
#else
111+
return std::aligned_alloc(alignment, size);
112+
#endif
113+
}
114+
115+
// Frees a pointer obtained from aligned_alloc() above. The split is
// required on Windows: memory from _aligned_malloc must be released with
// _aligned_free — passing it to plain free() is undefined behavior. On
// POSIX, std::aligned_alloc memory is ordinary heap memory and is freed
// with std::free. Passing nullptr is a no-op on both platforms.
void aligned_free(void* ptr) {
#ifdef _WIN32
  _aligned_free(ptr);
#else
  std::free(ptr);
#endif
}
122+
123+
} // namespace cuda
124+
} // namespace backends
125+
} // namespace executorch
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
2+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>
#include <cstddef> // size_t — previously pulled in only transitively
#include <cstdint> // int32_t — previously pulled in only transitively
#include <filesystem>
#include <string>

namespace executorch {
namespace backends {
namespace cuda {

/// Loads the shared library at `path` (dlopen on POSIX, LoadLibrary on
/// Windows). Returns an opaque handle, or Error::AccessFailed on failure.
executorch::runtime::Result<void*> load_library(
    const std::filesystem::path& path);

/// Closes a handle returned by load_library(). Returns Error::Internal if
/// the platform unload call fails.
executorch::runtime::Error close_library(void* lib_handle);

/// Looks up symbol `fn_name` in `lib_handle` (dlsym / GetProcAddress).
/// Returns the symbol's address, or Error::Internal if not found.
executorch::runtime::Result<void*> get_function(
    void* lib_handle,
    const std::string& fn_name);

/// Returns the id of the calling process (getpid / GetCurrentProcessId).
int32_t get_process_id();

/// Allocates `size` bytes aligned to `alignment` (a power of two).
/// Release with aligned_free(), never plain free().
void* aligned_alloc(size_t alignment, size_t size);

/// Frees memory obtained from aligned_alloc().
void aligned_free(void* ptr);

} // namespace cuda
} // namespace backends
} // namespace executorch

backends/cuda/runtime/shims/memory.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88

99
#include <executorch/backends/aoti/common_shims.h>
1010
#include <executorch/backends/aoti/utils.h>
11+
#include <executorch/backends/cuda/runtime/platform/platform.h>
1112
#include <executorch/backends/cuda/runtime/shims/memory.h>
1213
#include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
1314
#include <executorch/backends/cuda/runtime/utils.h>
1415
#include <executorch/runtime/platform/log.h>
1516
#include <cstdint>
16-
#include <cstdlib> // For posix_memalign
1717
#include <memory>
1818
#include <unordered_map>
1919
#include <unordered_set>
@@ -230,15 +230,11 @@ AOTITorchError aoti_torch_empty_strided(
230230
cudaMallocAsync(&ptr, static_cast<size_t>(nbytes), cudaStreamDefault));
231231
} else if (device_type == static_cast<int32_t>(SupportedDevices::CPU)) {
232232
// Ensure 16-byte alignment for CPU memory to match CUDA requirements
233-
int result = posix_memalign(&ptr, 16, nbytes);
234-
ET_CHECK_OR_RETURN_ERROR(
235-
result == 0,
236-
MemoryAllocationFailed,
237-
"Failed to allocate aligned CPU memory");
233+
ptr = aligned_alloc(16, nbytes);
238234
ET_CHECK_OR_RETURN_ERROR(
239235
ptr != nullptr,
240236
MemoryAllocationFailed,
241-
"Failed to call posix_memalign");
237+
"Failed to allocate aligned CPU memory");
242238
} else {
243239
ET_CHECK_OR_RETURN_ERROR(
244240
false,
@@ -339,7 +335,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
339335
Internal,
340336
"Expected host memory but got managed!")
341337
// This is CPU memory - free immediately
342-
free(data_ptr);
338+
aligned_free(data_ptr);
343339
data_ptr = nullptr;
344340
}
345341

0 commit comments

Comments
 (0)