From 3c2b2e66267f3902fd009ec7f0176af44aec6a3b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Tue, 26 Nov 2024 14:35:09 -0600
Subject: [PATCH 1/3] [libc] Handle differing wavefront sizes correctly in the
 AMDHSA loader

Summary:
The AMDGPU backend can handle wavefront sizes of 32 and 64, with the
native hardware preferring one or the other. The user can override the
hardware default with `-mwavefrontsize64` or `-mno-wavefrontsize64`, which
previously wasn't handled. We need to know the wavefront size to know
how much memory to allocate and how to index the RPC buffer. There isn't
a good way to do this with ROCm so we just use the LLVM support for
offloading to check this from the image.
---
 libc/utils/gpu/loader/amdgpu/CMakeLists.txt   |  1 +
 .../utils/gpu/loader/amdgpu/amdhsa-loader.cpp | 46 +++++++++++--------
 2 files changed, 28 insertions(+), 19 deletions(-)
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index 46c5631046ce2..80c5ae357416a 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   Object
   Option
   Support
+  FrontendOffloading
   )
 
 add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index 13a1366833547..5a9fe87077328 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -28,6 +28,8 @@
 #include "hsa/hsa_ext_amd.h"
 #endif
 
+#include "llvm/Frontend/Offloading/Utility.h"
+
 #include <atomic>
 #include <cstdio>
 #include <cstdlib>
@@ -163,17 +165,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                            hsa_queue_t *queue, rpc::Server &server,
                            const LaunchParameters &params,
                            const char *kernel_name, args_t kernel_args,
-                           bool print_resource_usage) {
+                           uint32_t wavefront_size, bool print_resource_usage) {
   // Look up the kernel in the loaded executable.
   hsa_executable_symbol_t symbol;
   if (hsa_status_t err = hsa_executable_get_symbol_by_name(
           executable, kernel_name, &dev_agent, &symbol))
     return err;
 
-  uint32_t wavefront_size = 0;
-  if (hsa_status_t err = hsa_agent_get_info(
-          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
-    handle_error(err);
   // Retrieve different properties of the kernel symbol used for launch.
   uint64_t kernel;
   uint32_t args_size;
@@ -419,6 +417,16 @@ int load(int argc, const char **argv, const char **envp, void *image,
               dev_agent, &coarsegrained_pool))
     handle_error(err);
 
+  // The AMDGPU target can change its wavefront size. There currently isn't a
+  // good way to look this up through the HSA API so we use the LLVM interface.
+  uint16_t abi_version;
+  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
+  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
+  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
+          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
+    handle_error(llvm::toString(std::move(err)).c_str());
+  }
+
   // Allocate fine-grained memory on the host to hold the pointer array for the
   // copied argv and allow the GPU agent to access it.
   auto allocator = [&](uint64_t size) -> void * {
@@ -448,10 +456,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
   hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
 
   // Allocate finegrained memory for the RPC server and client to share.
-  uint32_t wavefront_size = 0;
-  if (hsa_status_t err = hsa_agent_get_info(
-          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
-    handle_error(err);
+  uint32_t wavefront_size =
+      llvm::max_element(info_map, [](auto &&x, auto &&y) {
+        return x.second.WavefrontSize < y.second.WavefrontSize;
+      })->second.WavefrontSize;
 
   // Set up the RPC server.
   void *rpc_buffer;
@@ -513,7 +521,6 @@ int load(int argc, const char **argv, const char **envp, void *image,
   if (HSA_STATUS_SUCCESS ==
       hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                         &dev_agent, &freq_sym)) {
-
     void *host_clock_freq;
     if (hsa_status_t err =
             hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
@@ -553,16 +560,17 @@ int load(int argc, const char **argv, const char **envp, void *image,
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp};
-  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       coarsegrained_pool, queue, server,
-                                       single_threaded_params, "_begin.kd",
-                                       init_args, print_resource_usage))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          server, single_threaded_params, "_begin.kd", init_args,
+          info_map["_begin"].WavefrontSize, print_resource_usage))
     handle_error(err);
 
   start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
   if (hsa_status_t err = launch_kernel(
           dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
-          server, params, "_start.kd", args, print_resource_usage))
+          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
+          print_resource_usage))
     handle_error(err);
 
   void *host_ret;
@@ -580,10 +588,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
   int ret = *static_cast<int *>(host_ret);
 
   end_args_t fini_args = {ret};
-  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       coarsegrained_pool, queue, server,
-                                       single_threaded_params, "_end.kd",
-                                       fini_args, print_resource_usage))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          server, single_threaded_params, "_end.kd", fini_args,
+          info_map["_end"].WavefrontSize, print_resource_usage))
     handle_error(err);
 
   if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))

From 5b2289315befa861ec38d2a5fed8b0901fa4a210 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Tue, 26 Nov 2024 16:24:31 -0600
Subject: [PATCH 2/3] test

---
 libc/cmake/modules/LLVMLibCTestRules.cmake    |  8 ++---
 .../integration/startup/gpu/CMakeLists.txt    | 33 +++++++++++++++++
 .../integration/startup/gpu/rpc_lane_test.cpp | 36 +++++++++++++++++++
 3 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 libc/test/integration/startup/gpu/rpc_lane_test.cpp

diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index c3a0f371cd620..36f871920c3c3 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -476,14 +476,14 @@ function(add_integration_test test_name)
 
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS}
+      -Wno-multi-gpu -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-      "-Wl,--suppress-stack-size-warning"
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS}
+      "-Wl,--suppress-stack-size-warning" -Wno-multi-gpu
       "-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1"
       "-Wl,-mllvm,-nvptx-emit-init-fini-kernel"
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
index 7555986b16df4..1eee7bcc3d18a 100644
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -53,3 +53,36 @@ add_integration_test(
    --threads 32
    --blocks 8
 )
+
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  add_integration_test(
+    startup_rpc_lane_test_w32
+    SUITE libc-startup-tests
+    SRCS
+      rpc_lane_test.cpp
+    LOADER_ARGS
+      --threads 32
+    COMPILE_OPTIONS
+      -mno-wavefrontsize64
+  )
+
+  add_integration_test(
+    startup_rpc_lane_test_w64
+    SUITE libc-startup-tests
+    SRCS
+      rpc_lane_test.cpp
+    LOADER_ARGS
+      --threads 64
+    COMPILE_OPTIONS
+      -mwavefrontsize64
+  )
+else()
+  add_integration_test(
+    startup_rpc_lane_test_w32
+    SUITE libc-startup-tests
+    SRCS
+      rpc_lane_test.cpp
+    LOADER_ARGS
+      --threads 32
+  )
+endif()
diff --git a/libc/test/integration/startup/gpu/rpc_lane_test.cpp b/libc/test/integration/startup/gpu/rpc_lane_test.cpp
new file mode 100644
index 0000000000000..923c4decc8b96
--- /dev/null
+++ b/libc/test/integration/startup/gpu/rpc_lane_test.cpp
@@ -0,0 +1,36 @@
+//===-- Loader test to check the RPC interface with the loader ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "src/__support/GPU/utils.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+static void test_add() {
+  uint64_t cnt = gpu::get_lane_id();
+  LIBC_NAMESPACE::rpc::Client::Port port =
+      LIBC_NAMESPACE::rpc::client.open<RPC_TEST_INCREMENT>();
+  port.send_and_recv(
+      [=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+        reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
+      },
+      [&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+        cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
+      });
+  port.close();
+  ASSERT_TRUE(cnt == gpu::get_lane_id() + 1 && "Incorrect sum");
+  ASSERT_TRUE(gpu::get_thread_id() == gpu::get_lane_id() && "Not in same lane");
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  test_add();
+
+  return 0;
+}

From 641143cbf774ec2cb0ab2f47ca5ff590f0863ec6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Tue, 26 Nov 2024 19:34:13 -0600
Subject: [PATCH 3/3] test eq

---
 libc/test/integration/startup/gpu/rpc_lane_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/test/integration/startup/gpu/rpc_lane_test.cpp b/libc/test/integration/startup/gpu/rpc_lane_test.cpp
index 923c4decc8b96..72c7109eecfd6 100644
--- a/libc/test/integration/startup/gpu/rpc_lane_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_lane_test.cpp
@@ -25,8 +25,8 @@ static void test_add() {
         cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
       });
   port.close();
-  ASSERT_TRUE(cnt == gpu::get_lane_id() + 1 && "Incorrect sum");
-  ASSERT_TRUE(gpu::get_thread_id() == gpu::get_lane_id() && "Not in same lane");
+  EXPECT_EQ(cnt, gpu::get_lane_id() + 1);
+  EXPECT_EQ(gpu::get_thread_id(), gpu::get_lane_id());
 }
 
 TEST_MAIN(int argc, char **argv, char **envp) {