Skip to content

Commit 5023151

Browse files
committed
[LLVM] Port 'llvm-gpu-loader' to use LLVMOffload
Summary: This patch rewrites the `llvm-gpu-loader` utility to use the LLVMOffload interface. This greatly simplifies the tool by reusing support that already exists there. Another benefit is that the GPU runtime dependencies can now be loaded dynamically, so this utility can always be built without needing to find non-standard packages. One known issue, described in #159636, is that the loader will now take extra time if both CUDA and ROCm are installed on the same machine. This is only slightly annoying, since most people don't have both CUDA and ROCm at the same time, so I don't consider it a blocker; I will address it in follow-up work. The use of the `LIBOMPTARGET_STACK_SIZE` environment variable for the NVPTX tests is also slightly unfortunate, and I will expose that setting in a better way in the future.
1 parent 44be079 commit 5023151

File tree

10 files changed

+393
-1302
lines changed

10 files changed

+393
-1302
lines changed

libc/cmake/modules/LLVMLibCTestRules.cmake

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ function(_get_hermetic_test_compile_options output_var)
8282
-mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
8383
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
8484
list(APPEND compile_options
85-
"SHELL:-mllvm -nvptx-emit-init-fini-kernel=false"
8685
-Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT}
8786
-nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit)
8887
endif()
@@ -632,6 +631,7 @@ function(add_integration_test test_name)
632631
# makes `add_custom_target` construct the correct command and execute it.
633632
set(test_cmd
634633
${INTEGRATION_TEST_ENV}
634+
$<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
635635
$<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
636636
${CMAKE_CROSSCOMPILING_EMULATOR}
637637
${INTEGRATION_TEST_LOADER_ARGS}
@@ -785,8 +785,7 @@ function(add_libc_hermetic test_name)
785785
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
786786
target_link_options(${fq_build_target_name} PRIVATE
787787
${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
788-
-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
789-
"-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
788+
-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -nostdlib -static
790789
"-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
791790
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
792791
target_link_options(${fq_build_target_name} PRIVATE
@@ -854,6 +853,7 @@ function(add_libc_hermetic test_name)
854853
string(REPLACE " " ";" test_cmd "${test_cmd_parsed}")
855854
else()
856855
set(test_cmd ${HERMETIC_TEST_ENV}
856+
$<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
857857
$<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
858858
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
859859
endif()

libc/startup/gpu/amdgpu/start.cpp

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
#include "src/stdlib/atexit.h"
1414
#include "src/stdlib/exit.h"
1515

16+
// TODO: Merge this and the NVPTX start files once the common `device_kernel`
17+
// attribute correctly implies `amdgpu_kernel`.
18+
1619
extern "C" int main(int argc, char **argv, char **envp);
1720
extern "C" void __cxa_finalize(void *dso);
1821

@@ -21,45 +24,18 @@ namespace LIBC_NAMESPACE_DECL {
2124
// FIXME: Factor this out into common logic so we don't need to stub it here.
2225
void teardown_main_tls() {}
2326

24-
// FIXME: Touch this symbol to force this to be linked in statically.
25-
volatile void *dummy = &LIBC_NAMESPACE::rpc::client;
26-
2727
DataEnvironment app;
2828

29-
extern "C" uintptr_t __init_array_start[];
30-
extern "C" uintptr_t __init_array_end[];
31-
extern "C" uintptr_t __fini_array_start[];
32-
extern "C" uintptr_t __fini_array_end[];
33-
34-
using InitCallback = void(int, char **, char **);
35-
using FiniCallback = void(void);
36-
37-
static void call_init_array_callbacks(int argc, char **argv, char **env) {
38-
size_t init_array_size = __init_array_end - __init_array_start;
39-
for (size_t i = 0; i < init_array_size; ++i)
40-
reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
41-
}
42-
43-
static void call_fini_array_callbacks() {
44-
size_t fini_array_size = __fini_array_end - __fini_array_start;
45-
for (size_t i = fini_array_size; i > 0; --i)
46-
reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
47-
}
48-
4929
} // namespace LIBC_NAMESPACE_DECL
5030

5131
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
5232
clang::amdgpu_flat_work_group_size(1, 1),
5333
clang::amdgpu_max_num_work_groups(1)]] void
54-
_begin(int argc, char **argv, char **env) {
34+
_begin(int, char **, char **env) {
35+
// The LLVM offloading runtime will automatically call any present global
36+
// constructors and destructors so we defer that handling.
5537
__atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
5638
reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED);
57-
// We want the fini array callbacks to be run after other atexit
58-
// callbacks are run. So, we register them before running the init
59-
// array callbacks as they can potentially register their own atexit
60-
// callbacks.
61-
LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
62-
LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
6339
}
6440

6541
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void

libc/startup/gpu/nvptx/start.cpp

Lines changed: 3 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -23,48 +23,14 @@ DataEnvironment app;
2323
// FIXME: Factor this out into common logic so we don't need to stub it here.
2424
void teardown_main_tls() {}
2525

26-
// FIXME: Touch this symbol to force this to be linked in statically.
27-
volatile void *dummy = &LIBC_NAMESPACE::rpc::client;
28-
29-
extern "C" {
30-
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
31-
// to manually create them and update the globals in the loader implememtation.
32-
uintptr_t *__init_array_start [[gnu::visibility("protected")]];
33-
uintptr_t *__init_array_end [[gnu::visibility("protected")]];
34-
uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
35-
uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
36-
}
37-
38-
// Nvidia requires that the signature of the function pointers match. This means
39-
// we cannot support the extended constructor arguments.
40-
using InitCallback = void(void);
41-
using FiniCallback = void(void);
42-
43-
static void call_init_array_callbacks(int, char **, char **) {
44-
size_t init_array_size = __init_array_end - __init_array_start;
45-
for (size_t i = 0; i < init_array_size; ++i)
46-
reinterpret_cast<InitCallback *>(__init_array_start[i])();
47-
}
48-
49-
static void call_fini_array_callbacks() {
50-
size_t fini_array_size = __fini_array_end - __fini_array_start;
51-
for (size_t i = fini_array_size; i > 0; --i)
52-
reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
53-
}
54-
5526
} // namespace LIBC_NAMESPACE_DECL
5627

5728
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
58-
_begin(int argc, char **argv, char **env) {
29+
_begin(int, char **, char **env) {
30+
// The LLVM offloading runtime will automatically call any present global
31+
// constructors and destructors so we defer that handling.
5932
__atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
6033
reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED);
61-
62-
// We want the fini array callbacks to be run after other atexit
63-
// callbacks are run. So, we register them before running the init
64-
// array callbacks as they can potentially register their own atexit
65-
// callbacks.
66-
LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
67-
LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
6834
}
6935

7036
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void

llvm/tools/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,6 @@
99
# traversing each directory.
1010
create_llvm_tool_options()
1111

12-
if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE OR NOT LLVM_LIBC_GPU_BUILD)
13-
set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF)
14-
endif()
15-
1612
if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB)
1713
set(LLVM_TOOL_LLVM_SHLIB_BUILD Off)
1814
endif()
llvm/tools/llvm-gpu-loader/CMakeLists.txt

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,12 @@
11
set(LLVM_LINK_COMPONENTS
2-
BinaryFormat
3-
Object
42
Option
53
Support
6-
FrontendOffloading
74
TargetParser
85
)
96

107
add_llvm_tool(llvm-gpu-loader
118
llvm-gpu-loader.cpp
129

13-
# TODO: We intentionally split this currently due to statically linking the
14-
# GPU runtimes. Dynamically load the dependencies, possibly using the
15-
# LLVM offloading API when it is complete.
16-
PARTIAL_SOURCES_INTENDED
17-
1810
DEPENDS
1911
intrinsics_gen
2012
)
21-
22-
# Locate the RPC server handling interface.
23-
include(FindLibcCommonUtils)
24-
target_link_libraries(llvm-gpu-loader PUBLIC llvm-libc-common-utilities)
25-
26-
# Check for HSA support for targeting AMD GPUs.
27-
find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
28-
if(hsa-runtime64_FOUND)
29-
target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp)
30-
target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT)
31-
target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64)
32-
33-
# Compatibility with the old amdhsa-loader name.
34-
add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader)
35-
endif()
36-
37-
# Check for CUDA support for targeting NVIDIA GPUs.
38-
find_package(CUDAToolkit 11.2 QUIET)
39-
if(CUDAToolkit_FOUND)
40-
target_sources(llvm-gpu-loader PRIVATE nvptx.cpp)
41-
target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT)
42-
target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver)
43-
44-
# Compatibility with the old nvptx-loader name.
45-
add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader)
46-
endif()

0 commit comments

Comments
 (0)