From 3c2b2e66267f3902fd009ec7f0176af44aec6a3b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 26 Nov 2024 14:35:09 -0600 Subject: [PATCH 1/3] [libc] Handle differing wavefront sizes correctly in the AMDHSA loader Summary: The AMDGPU backend can handle wavefront sizes of 32 and 64, with the native hardware preferring one or the other. The user can override the hardware with `-mwavefrontsize64` or `-mno-wavefrontsize64` which previously wasn't handled. We need to know the wavefront size to know how much memory to allocate and how to index the RPC buffer. There isn't a good way to do this with ROCm so we just use the LLVM support for offloading to check this from the image. --- libc/utils/gpu/loader/amdgpu/CMakeLists.txt | 1 + .../utils/gpu/loader/amdgpu/amdhsa-loader.cpp | 46 +++++++++++-------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt index 46c5631046ce2..80c5ae357416a 100644 --- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt +++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt @@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS Object Option Support + FrontendOffloading ) add_llvm_executable(amdhsa-loader amdhsa-loader.cpp) diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp index 13a1366833547..5a9fe87077328 100644 --- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp @@ -28,6 +28,8 @@ #include "hsa/hsa_ext_amd.h" #endif +#include "llvm/Frontend/Offloading/Utility.h" + #include #include #include @@ -163,17 +165,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, hsa_queue_t *queue, rpc::Server &server, const LaunchParameters &params, const char *kernel_name, args_t kernel_args, - bool print_resource_usage) { + uint32_t wavefront_size, bool print_resource_usage) { // Look up the kernel in the loaded executable. 
hsa_executable_symbol_t symbol; if (hsa_status_t err = hsa_executable_get_symbol_by_name( executable, kernel_name, &dev_agent, &symbol)) return err; - uint32_t wavefront_size = 0; - if (hsa_status_t err = hsa_agent_get_info( - dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) - handle_error(err); // Retrieve different properties of the kernel symbol used for launch. uint64_t kernel; uint32_t args_size; @@ -419,6 +417,16 @@ int load(int argc, const char **argv, const char **envp, void *image, dev_agent, &coarsegrained_pool)) handle_error(err); + // The AMDGPU target can change its wavefront size. There currently isn't a + // good way to look this up through the HSA API so we use the LLVM interface. + uint16_t abi_version; + llvm::StringRef image_ref(reinterpret_cast(image), size); + llvm::StringMap info_map; + if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage( + llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) { + handle_error(llvm::toString(std::move(err)).c_str()); + } + // Allocate fine-grained memory on the host to hold the pointer array for the // copied argv and allow the GPU agent to access it. auto allocator = [&](uint64_t size) -> void * { @@ -448,10 +456,10 @@ int load(int argc, const char **argv, const char **envp, void *image, hsa_amd_memory_fill(dev_ret, 0, /*count=*/1); // Allocate finegrained memory for the RPC server and client to share. - uint32_t wavefront_size = 0; - if (hsa_status_t err = hsa_agent_get_info( - dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) - handle_error(err); + uint32_t wavefront_size = + llvm::max_element(info_map, [](auto &&x, auto &&y) { + return x.second.WavefrontSize < y.second.WavefrontSize; + })->second.WavefrontSize; // Set up the RPC server. 
void *rpc_buffer; @@ -513,7 +521,6 @@ int load(int argc, const char **argv, const char **envp, void *image, if (HSA_STATUS_SUCCESS == hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq", &dev_agent, &freq_sym)) { - void *host_clock_freq; if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t), @@ -553,16 +560,17 @@ int load(int argc, const char **argv, const char **envp, void *image, LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, - coarsegrained_pool, queue, server, - single_threaded_params, "_begin.kd", - init_args, print_resource_usage)) + if (hsa_status_t err = launch_kernel( + dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, + server, single_threaded_params, "_begin.kd", init_args, + info_map["_begin"].WavefrontSize, print_resource_usage)) handle_error(err); start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; if (hsa_status_t err = launch_kernel( dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, params, "_start.kd", args, print_resource_usage)) + server, params, "_start.kd", args, info_map["_start"].WavefrontSize, + print_resource_usage)) handle_error(err); void *host_ret; @@ -580,10 +588,10 @@ int load(int argc, const char **argv, const char **envp, void *image, int ret = *static_cast<int *>(host_ret); end_args_t fini_args = {ret}; - if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, - coarsegrained_pool, queue, server, - single_threaded_params, "_end.kd", - fini_args, print_resource_usage)) + if (hsa_status_t err = launch_kernel( + dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, + server, single_threaded_params, "_end.kd", fini_args, + info_map["_end"].WavefrontSize, print_resource_usage)) handle_error(err); if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer)) From 
5b2289315befa861ec38d2a5fed8b0901fa4a210 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 26 Nov 2024 16:24:31 -0600 Subject: [PATCH 2/3] test --- libc/cmake/modules/LLVMLibCTestRules.cmake | 8 ++--- .../integration/startup/gpu/CMakeLists.txt | 33 +++++++++++++++++ .../integration/startup/gpu/rpc_lane_test.cpp | 36 +++++++++++++++++++ 3 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 libc/test/integration/startup/gpu/rpc_lane_test.cpp diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index c3a0f371cd620..36f871920c3c3 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -476,14 +476,14 @@ function(add_integration_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + ${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS} + -Wno-multi-gpu -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu - "-Wl,--suppress-stack-size-warning" + ${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS} + "-Wl,--suppress-stack-size-warning" -Wno-multi-gpu "-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1" "-Wl,-mllvm,-nvptx-emit-init-fini-kernel" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt index 7555986b16df4..1eee7bcc3d18a 100644 --- a/libc/test/integration/startup/gpu/CMakeLists.txt +++ b/libc/test/integration/startup/gpu/CMakeLists.txt @@ -53,3 +53,36 @@ add_integration_test( --threads 32 --blocks 
8 ) + +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + add_integration_test( + startup_rpc_lane_test_w32 + SUITE libc-startup-tests + SRCS + rpc_lane_test.cpp + LOADER_ARGS + --threads 32 + COMPILE_OPTIONS + -mno-wavefrontsize64 + ) + + add_integration_test( + startup_rpc_lane_test_w64 + SUITE libc-startup-tests + SRCS + rpc_lane_test.cpp + LOADER_ARGS + --threads 64 + COMPILE_OPTIONS + -mwavefrontsize64 + ) +else() + add_integration_test( + startup_rpc_lane_test_w32 + SUITE libc-startup-tests + SRCS + rpc_lane_test.cpp + LOADER_ARGS + --threads 32 + ) +endif() diff --git a/libc/test/integration/startup/gpu/rpc_lane_test.cpp b/libc/test/integration/startup/gpu/rpc_lane_test.cpp new file mode 100644 index 0000000000000..923c4decc8b96 --- /dev/null +++ b/libc/test/integration/startup/gpu/rpc_lane_test.cpp @@ -0,0 +1,36 @@ +//===-- Loader test to check the RPC interface with the loader ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "src/__support/GPU/utils.h" +#include "src/__support/RPC/rpc_client.h" +#include "test/IntegrationTest/test.h" + +using namespace LIBC_NAMESPACE; + +static void test_add() { + uint64_t cnt = gpu::get_lane_id(); + LIBC_NAMESPACE::rpc::Client::Port port = + LIBC_NAMESPACE::rpc::client.open(); + port.send_and_recv( + [=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) { + reinterpret_cast(buffer->data)[0] = cnt; + }, + [&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) { + cnt = reinterpret_cast(buffer->data)[0]; + }); + port.close(); + ASSERT_TRUE(cnt == gpu::get_lane_id() + 1 && "Incorrect sum"); + ASSERT_TRUE(gpu::get_thread_id() == gpu::get_lane_id() && "Not in same lane"); +} + +TEST_MAIN(int argc, char **argv, char **envp) { + test_add(); + + return 0; +} From 641143cbf774ec2cb0ab2f47ca5ff590f0863ec6 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 26 Nov 2024 19:34:13 -0600 Subject: [PATCH 3/3] test eq --- libc/test/integration/startup/gpu/rpc_lane_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/integration/startup/gpu/rpc_lane_test.cpp b/libc/test/integration/startup/gpu/rpc_lane_test.cpp index 923c4decc8b96..72c7109eecfd6 100644 --- a/libc/test/integration/startup/gpu/rpc_lane_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_lane_test.cpp @@ -25,8 +25,8 @@ static void test_add() { cnt = reinterpret_cast(buffer->data)[0]; }); port.close(); - ASSERT_TRUE(cnt == gpu::get_lane_id() + 1 && "Incorrect sum"); - ASSERT_TRUE(gpu::get_thread_id() == gpu::get_lane_id() && "Not in same lane"); + EXPECT_EQ(cnt, gpu::get_lane_id() + 1); + EXPECT_EQ(gpu::get_thread_id(), gpu::get_lane_id()); } TEST_MAIN(int argc, char **argv, char **envp) {