Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions libc/cmake/modules/LLVMLibCTestRules.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ function(_get_hermetic_test_compile_options output_var)
-mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
list(APPEND compile_options
"SHELL:-mllvm -nvptx-emit-init-fini-kernel=false"
-Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT}
-nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit)
endif()
Expand Down Expand Up @@ -637,6 +636,7 @@ function(add_integration_test test_name)
# makes `add_custom_target` construct the correct command and execute it.
set(test_cmd
${INTEGRATION_TEST_ENV}
$<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
$<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
${CMAKE_CROSSCOMPILING_EMULATOR}
${INTEGRATION_TEST_LOADER_ARGS}
Expand Down Expand Up @@ -790,8 +790,7 @@ function(add_libc_hermetic test_name)
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
"-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -nostdlib -static
"-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
target_link_options(${fq_build_target_name} PRIVATE
Expand Down Expand Up @@ -859,6 +858,7 @@ function(add_libc_hermetic test_name)
string(REPLACE " " ";" test_cmd "${test_cmd_parsed}")
else()
set(test_cmd ${HERMETIC_TEST_ENV}
$<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
$<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
endif()
Expand Down
36 changes: 6 additions & 30 deletions libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
#include "src/stdlib/atexit.h"
#include "src/stdlib/exit.h"

// TODO: Merge this and the NVPTX start files once the common `device_kernel`
// attribute correctly implies `amdgpu_kernel`.

extern "C" int main(int argc, char **argv, char **envp);
extern "C" void __cxa_finalize(void *dso);

Expand All @@ -21,45 +24,18 @@ namespace LIBC_NAMESPACE_DECL {
// FIXME: Factor this out into common logic so we don't need to stub it here.
void teardown_main_tls() {}

// FIXME: Touch this symbol to force this to be linked in statically.
volatile void *dummy = &LIBC_NAMESPACE::rpc::client;

DataEnvironment app;

extern "C" uintptr_t __init_array_start[];
extern "C" uintptr_t __init_array_end[];
extern "C" uintptr_t __fini_array_start[];
extern "C" uintptr_t __fini_array_end[];

using InitCallback = void(int, char **, char **);
using FiniCallback = void(void);

static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
}

static void call_fini_array_callbacks() {
size_t fini_array_size = __fini_array_end - __fini_array_start;
for (size_t i = fini_array_size; i > 0; --i)
reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
}

} // namespace LIBC_NAMESPACE_DECL

extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
clang::amdgpu_flat_work_group_size(1, 1),
clang::amdgpu_max_num_work_groups(1)]] void
_begin(int argc, char **argv, char **env) {
_begin(int, char **, char **env) {
// The LLVM offloading runtime will automatically call any present global
// constructors and destructors so we defer that handling.
__atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED);
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
// callbacks.
LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
}

extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
Expand Down
40 changes: 3 additions & 37 deletions libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,48 +23,14 @@ DataEnvironment app;
// FIXME: Factor this out into common logic so we don't need to stub it here.
void teardown_main_tls() {}

// FIXME: Touch this symbol to force this to be linked in statically.
volatile void *dummy = &LIBC_NAMESPACE::rpc::client;

extern "C" {
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
// to manually create them and update the globals in the loader implementation.
uintptr_t *__init_array_start [[gnu::visibility("protected")]];
uintptr_t *__init_array_end [[gnu::visibility("protected")]];
uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
}

// Nvidia requires that the signature of the function pointers match. This means
// we cannot support the extended constructor arguments.
using InitCallback = void(void);
using FiniCallback = void(void);

static void call_init_array_callbacks(int, char **, char **) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
reinterpret_cast<InitCallback *>(__init_array_start[i])();
}

static void call_fini_array_callbacks() {
size_t fini_array_size = __fini_array_end - __fini_array_start;
for (size_t i = fini_array_size; i > 0; --i)
reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
}

} // namespace LIBC_NAMESPACE_DECL

extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_begin(int argc, char **argv, char **env) {
_begin(int, char **, char **env) {
// The LLVM offloading runtime will automatically call any present global
// constructors and destructors so we defer that handling.
__atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED);

// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
// callbacks.
LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
}

extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
Expand Down
4 changes: 0 additions & 4 deletions llvm/tools/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
# traversing each directory.
create_llvm_tool_options()

# Force LLVM_TOOL_LLVM_GPU_LOADER_BUILD to OFF unless the compiler is
# GCC-compatible and LLVM_LIBC_GPU_BUILD is set.
# NOTE(review): presumably this excludes the llvm-gpu-loader tool from the
# build; confirm against how LLVM_TOOL_*_BUILD variables are consumed.
if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE OR NOT LLVM_LIBC_GPU_BUILD)
set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF)
endif()

# Force LLVM_TOOL_LLVM_SHLIB_BUILD to Off when neither LLVM_BUILD_LLVM_DYLIB
# nor LLVM_BUILD_LLVM_C_DYLIB is set.
if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB)
set(LLVM_TOOL_LLVM_SHLIB_BUILD Off)
endif()
Expand Down
34 changes: 0 additions & 34 deletions llvm/tools/llvm-gpu-loader/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,46 +1,12 @@
# Build rules for the llvm-gpu-loader tool. The common driver source is always
# compiled; the AMD (HSA) and NVIDIA (CUDA) sources are added only when the
# corresponding package is found on the build machine.

# LLVM component libraries requested for this tool.
set(LLVM_LINK_COMPONENTS
BinaryFormat
Object
Option
Support
FrontendOffloading
TargetParser
)

# Declare the tool with only the common source file. The vendor-specific
# sources are attached further down via target_sources().
add_llvm_tool(llvm-gpu-loader
llvm-gpu-loader.cpp

# TODO: We intentionally split this currently due to statically linking the
# GPU runtimes. Dynamically load the dependencies, possibly using the
# LLVM offloading API when it is complete.
PARTIAL_SOURCES_INTENDED

DEPENDS
intrinsics_gen
)

# Locate the RPC server handling interface.
include(FindLibcCommonUtils)
target_link_libraries(llvm-gpu-loader PUBLIC llvm-libc-common-utilities)

# Check for HSA support for targeting AMD GPUs.
# The lookup is QUIET and not REQUIRED, so a missing HSA runtime just skips
# this section. The install prefix is given as a hint and /opt/rocm as an
# extra search path.
find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
if(hsa-runtime64_FOUND)
# Add the AMD source file and define AMDHSA_SUPPORT for the tool's sources.
target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp)
target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT)
target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64)

# Compatibility with the old amdhsa-loader name.
add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader)
endif()

# Check for CUDA support for targeting NVIDIA GPUs.
# As above, the lookup is QUIET and optional; version 11.2 is requested.
find_package(CUDAToolkit 11.2 QUIET)
if(CUDAToolkit_FOUND)
# Add the NVIDIA source file and define NVPTX_SUPPORT for the tool's sources.
target_sources(llvm-gpu-loader PRIVATE nvptx.cpp)
target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT)
target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver)

# Compatibility with the old nvptx-loader name.
add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader)
endif()
Loading
Loading