diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h index b24fcaac864..80abe663fa2 100644 --- a/backends/aoti/utils.h +++ b/backends/aoti/utils.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace executorch { namespace backends { diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 8285fc0d582..8b94351d469 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -36,9 +36,13 @@ find_package_torch() # CUDA-specific AOTI functionality set(_aoti_cuda_sources - runtime/cuda_backend.cpp runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp runtime/guard.cpp - runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu + runtime/cuda_backend.cpp + runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp + runtime/guard.cpp + runtime/shims/cuda_guard.cpp + runtime/shims/int4mm.cu + runtime/platform/platform.cpp ) add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) target_include_directories( diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 3fcd25a3d1d..e61b03ee8e6 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -7,12 +7,10 @@ */ #include -#include #include #include #include #include -#include #include #include @@ -23,16 +21,19 @@ // Include our shim layer headers #include #include +#include #include #include namespace executorch::backends::cuda { -#define LOAD_SYMBOL(handle, member, name, so_handle) \ - do { \ - handle->member = reinterpret_cast(dlsym(so_handle, #name)); \ - ET_CHECK_OR_RETURN_ERROR( \ - handle->member != nullptr, AccessFailed, "Failed to load " #name); \ +#define LOAD_SYMBOL(handle, member, name, so_handle) \ + do { \ + auto symbol_res = get_function(so_handle, #name); \ + if (!symbol_res.ok()) { \ + return symbol_res.error(); \ + } \ + handle->member = reinterpret_cast(symbol_res.get()); \ } while (0) using namespace std; @@ -122,10 +123,10 @@ class ET_EXPERIMENTAL CudaBackend final // Generate dynamic temporary file path filesystem::path temp_dir = filesystem::temp_directory_path(); filesystem::path so_path = - temp_dir / (so_blob_key + to_string(getpid()) + ".so"); + temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); // Create a temporary file - ofstream outfile(so_path.c_str(), ios::binary); + ofstream outfile(so_path, ios::binary); // Write the ELF buffer to the temporary file ET_LOG( @@ -144,24 +145,23 @@ class ET_EXPERIMENTAL CudaBackend final // Finish writing the file to disk outfile.close(); - // Load the ELF using dlopen - void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); - ET_CHECK_OR_RETURN_ERROR( - so_handle != nullptr, - AccessFailed, - "Failed to load shared library: %s", - dlerror()); + // Load the lib + Result lib_handle_res = load_library(so_path); + if (!lib_handle_res.ok()) { + return lib_handle_res.error(); + } + void* lib_handle = lib_handle_res.get(); processed->Free(); // Create handle and load function pointers into it AOTIDelegateHandle* handle = new AOTIDelegateHandle(); - handle->so_handle = so_handle; + handle->so_handle = lib_handle; handle->so_path = so_path.string(); // Load function pointers specific to this handle's shared library ET_CHECK_OK_OR_RETURN_ERROR( - load_function_pointers_into_handle(so_handle, handle)); + load_function_pointers_into_handle(lib_handle, handle)); AOTInductorModelContainerHandle container_handle = nullptr; @@ -332,8 +332,9 @@ class ET_EXPERIMENTAL CudaBackend final // AOTInductorModelContainerDelete(handle->container_handle); // Now close the shared library + auto err = Error::Ok; if (handle->so_handle != nullptr) { - dlclose(handle->so_handle); + err = close_library(handle->so_handle); } // Remove the temporary shared library file diff --git a/backends/cuda/runtime/platform/platform.cpp b/backends/cuda/runtime/platform/platform.cpp new file mode 100644 index 00000000000..5264dcbd03a --- /dev/null +++ b/backends/cuda/runtime/platform/platform.cpp @@ -0,0 +1,125 @@ + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#else // Posix +#include +#include +#include +#endif + +namespace executorch { +namespace backends { +namespace cuda { + +executorch::runtime::Result load_library( + const std::filesystem::path& path) { +#ifdef _WIN32 + std::string utf8 = path.u8string(); + auto lib_handle = LoadLibrary(utf8.c_str()); + if (lib_handle == NULL) { + ET_LOG( + Error, + "Failed to load %s with error: %lu", + utf8.c_str(), + GetLastError()); + return executorch::runtime::Error::AccessFailed; + } + +#else + std::string path_str = path.string(); + void* lib_handle = dlopen(path_str.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (lib_handle == nullptr) { + ET_LOG( + Error, "Failed to load %s with error: %s", path_str.c_str(), dlerror()); + return executorch::runtime::Error::AccessFailed; + } +#endif + return (void*)lib_handle; +} + +executorch::runtime::Error close_library(void* lib_handle) { +#ifdef _WIN32 + if (!FreeLibrary((HMODULE)lib_handle)) { + printf("FreeLibrary failed with error %lu\n", GetLastError()); + return executorch::runtime::Error::Internal; + } +#else + if (dlclose(lib_handle) != 0) { + ET_LOG(Error, "dlclose failed: %s\n", dlerror()); + return executorch::runtime::Error::Internal; + } +#endif + return executorch::runtime::Error::Ok; +} + +executorch::runtime::Result get_function( + void* lib_handle, + const std::string& fn_name) { +#ifdef _WIN32 + auto fn = GetProcAddress((HMODULE)lib_handle, fn_name.c_str()); + if (!fn) { + ET_LOG( + Error, + "Failed loading symbol %s with error %lu\n", + fn_name.c_str(), + GetLastError()); + return executorch::runtime::Error::Internal; + } +#else + auto fn = dlsym(lib_handle, fn_name.c_str()); + if (fn == nullptr) { + ET_LOG( + Error, + "Failed loading symbol %s with error %s\n", + fn_name.c_str(), + dlerror()); + return executorch::runtime::Error::Internal; + } +#endif + + return (void*)fn; // This I think is technically ub on windows. We should + // probably explicitly pack the bytes. +} + +int32_t get_process_id() { +#ifdef _WIN32 + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + +void* aligned_alloc(size_t alignment, size_t size) { +#ifdef _WIN32 + return _aligned_malloc(size, alignment); +#else + return std::aligned_alloc(alignment, size); +#endif +} + +void aligned_free(void* ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + std::free(ptr); +#endif +} + +} // namespace cuda +} // namespace backends +} // namespace executorch diff --git a/backends/cuda/runtime/platform/platform.h b/backends/cuda/runtime/platform/platform.h new file mode 100644 index 00000000000..00f278ef85e --- /dev/null +++ b/backends/cuda/runtime/platform/platform.h @@ -0,0 +1,38 @@ + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace cuda { + +executorch::runtime::Result load_library( + const std::filesystem::path& path); + +executorch::runtime::Error close_library(void* lib_handle); + +executorch::runtime::Result get_function( + void* lib_handle, + const std::string& fn_name); + +int32_t get_process_id(); + +void* aligned_alloc(size_t alignment, size_t size); + +void aligned_free(void* ptr); + +} // namespace cuda +} // namespace backends +} // namespace executorch diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index fe8ccf07281..5d30d3124d9 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -8,12 +8,12 @@ #include #include +#include #include #include #include #include #include -#include // For posix_memalign #include #include #include @@ -230,15 +230,11 @@ AOTITorchError aoti_torch_empty_strided( cudaMallocAsync(&ptr, static_cast(nbytes), cudaStreamDefault)); } else if (device_type == static_cast(SupportedDevices::CPU)) { // Ensure 16-byte alignment for CPU memory to match CUDA requirements - int result = posix_memalign(&ptr, 16, nbytes); - ET_CHECK_OR_RETURN_ERROR( - result == 0, - MemoryAllocationFailed, - "Failed to allocate aligned CPU memory"); + ptr = aligned_alloc(16, nbytes); ET_CHECK_OR_RETURN_ERROR( ptr != nullptr, MemoryAllocationFailed, - "Failed to call posix_memalign"); + "Failed to allocate aligned CPU memory"); } else { ET_CHECK_OR_RETURN_ERROR( false, @@ -339,7 +335,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { Internal, "Expected host memory but got managed!") // This is CPU memory - free immediately - free(data_ptr); + aligned_free(data_ptr); data_ptr = nullptr; }