-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[libc] Handle differing wavefront sizes correctly in the AMDHSA loader #117788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Summary: The AMDGPU backend can handle wavefront sizes of 32 and 64, with the native hardware preferring one or the other. The user can override the hardware with `-mwavefrontsize64` or `-mwavefrontsize32` which previously wasn't handled. We need to know the wavefront size to know how much memory to allocate and how to index the RPC buffer. There isn't a good way to do this with ROCm so we just use the LLVM support for offloading to check this from the image.
|
@llvm/pr-subscribers-libc @llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/117788.diff 2 Files Affected:
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index 46c5631046ce20..80c5ae357416af 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
Object
Option
Support
+ FrontendOffloading
)
add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index 13a13668335471..5a9fe87077328e 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -28,6 +28,8 @@
#include "hsa/hsa_ext_amd.h"
#endif
+#include "llvm/Frontend/Offloading/Utility.h"
+
#include <atomic>
#include <cstdio>
#include <cstdlib>
@@ -163,17 +165,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_queue_t *queue, rpc::Server &server,
const LaunchParameters ¶ms,
const char *kernel_name, args_t kernel_args,
- bool print_resource_usage) {
+ uint32_t wavefront_size, bool print_resource_usage) {
// Look up the kernel in the loaded executable.
hsa_executable_symbol_t symbol;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, kernel_name, &dev_agent, &symbol))
return err;
- uint32_t wavefront_size = 0;
- if (hsa_status_t err = hsa_agent_get_info(
- dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
- handle_error(err);
// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;
uint32_t args_size;
@@ -419,6 +417,16 @@ int load(int argc, const char **argv, const char **envp, void *image,
dev_agent, &coarsegrained_pool))
handle_error(err);
+ // The AMDGPU target can change its wavefront size. There currently isn't a
+ // good way to look this up through the HSA API so we use the LLVM interface.
+ uint16_t abi_version;
+ llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
+ llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
+ if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
+ llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
+ handle_error(llvm::toString(std::move(err)).c_str());
+ }
+
// Allocate fine-grained memory on the host to hold the pointer array for the
// copied argv and allow the GPU agent to access it.
auto allocator = [&](uint64_t size) -> void * {
@@ -448,10 +456,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
// Allocate finegrained memory for the RPC server and client to share.
- uint32_t wavefront_size = 0;
- if (hsa_status_t err = hsa_agent_get_info(
- dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
- handle_error(err);
+ uint32_t wavefront_size =
+ llvm::max_element(info_map, [](auto &&x, auto &&y) {
+ return x.second.WavefrontSize < y.second.WavefrontSize;
+ })->second.WavefrontSize;
// Set up the RPC server.
void *rpc_buffer;
@@ -513,7 +521,6 @@ int load(int argc, const char **argv, const char **envp, void *image,
if (HSA_STATUS_SUCCESS ==
hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
&dev_agent, &freq_sym)) {
-
void *host_clock_freq;
if (hsa_status_t err =
hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
@@ -553,16 +560,17 @@ int load(int argc, const char **argv, const char **envp, void *image,
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp};
- if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, server,
- single_threaded_params, "_begin.kd",
- init_args, print_resource_usage))
+ if (hsa_status_t err = launch_kernel(
+ dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+ server, single_threaded_params, "_begin.kd", init_args,
+ info_map["_begin"].WavefrontSize, print_resource_usage))
handle_error(err);
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
- server, params, "_start.kd", args, print_resource_usage))
+ server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
+ print_resource_usage))
handle_error(err);
void *host_ret;
@@ -580,10 +588,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
int ret = *static_cast<int *>(host_ret);
end_args_t fini_args = {ret};
- if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, server,
- single_threaded_params, "_end.kd",
- fini_args, print_resource_usage))
+ if (hsa_status_t err = launch_kernel(
+ dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+ server, single_threaded_params, "_end.kd", fini_args,
+ info_map["_end"].WavefrontSize, print_resource_usage))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
|
shiltian
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no test?
Was kind of annoying, but it should work. |
| ASSERT_TRUE(cnt == gpu::get_lane_id() + 1 && "Incorrect sum"); | ||
| ASSERT_TRUE(gpu::get_thread_id() == gpu::get_lane_id() && "Not in same lane"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this just gtest? Should be EXPECT_EQ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a little confusing, the libc has integration tests that simply run main with a really small subset of gtest. However, EXPECT_EQ does exist there, so I can use that instead.
| }); | ||
| port.close(); | ||
| EXPECT_EQ(cnt, gpu::get_lane_id() + 1); | ||
| EXPECT_EQ(gpu::get_thread_id(), gpu::get_lane_id()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| EXPECT_EQ(gpu::get_thread_id(), gpu::get_lane_id()); | |
| EXPECT_EQ(gpu::get_thread_id(), gpu::get_lane_id()) << "Not in same lane"; |
Preserve the messages from before?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The macro doesn't take a message and since it's not a boolean I can't use the && trick where a const char pointer is evaluated as true.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it's a << after the failure
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That macro expands to if (x == y) { ... } which won't accept an operator.
Summary:
The AMDGPU backend can handle wavefront sizes of 32 and 64, with the
native hardware preferring one or the other. The user can override the
hardware with
`-mwavefrontsize64` or `-mwavefrontsize32`, which previously wasn't handled. We need to know the wavefront size to know
how much memory to allocate and how to index the RPC buffer. There isn't
a good way to do this with ROCm so we just use the LLVM support for
offloading to check this from the image.