From 3694816bf1b44d024699c72c131852104f96cfa2 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Thu, 9 Oct 2025 16:23:46 -0500
Subject: [PATCH] [LLVM] Port 'llvm-gpu-loader' to use LLVMOffload

Summary:
This patch rewrites the `llvm-gpu-loader` utility to use the LLVMOffload
interface. This greatly simplifies the tool while reusing the existing
support. Another benefit is that the runtime can now be loaded dynamically,
so we can always build this utility without needing to find non-standard
packages.

One issue is mentioned in https://github.com/llvm/llvm-project/issues/159636:
this will now take extra time if you have both runtimes installed on the same
machine. That is only slightly annoying, and since most people do not have
both CUDA and ROCm installed at the same time I do not consider it a blocker.
I will address it later.

The environment variable usage is slightly unfortunate; I will also expose
that more cleanly in the future.
---
 libc/cmake/modules/LLVMLibCTestRules.cmake    |   6 +-
 libc/startup/gpu/amdgpu/start.cpp             |  36 +-
 libc/startup/gpu/nvptx/start.cpp              |  40 +-
 llvm/tools/CMakeLists.txt                     |   4 -
 llvm/tools/llvm-gpu-loader/CMakeLists.txt     |  34 -
 llvm/tools/llvm-gpu-loader/amdhsa.cpp         | 594 ------------------
 .../tools/llvm-gpu-loader/llvm-gpu-loader.cpp | 300 ++++++---
 llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h  | 259 +++++---
 llvm/tools/llvm-gpu-loader/nvptx.cpp          | 367 -----------
 llvm/tools/llvm-gpu-loader/server.h           |  55 --
 10 files changed, 393 insertions(+), 1302 deletions(-)
 delete mode 100644 llvm/tools/llvm-gpu-loader/amdhsa.cpp
 delete mode 100644 llvm/tools/llvm-gpu-loader/nvptx.cpp
 delete mode 100644 llvm/tools/llvm-gpu-loader/server.h

diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 19da0ad29cd84..dfc5ab904fe9d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -87,7 +87,6 @@ function(_get_hermetic_test_compile_options output_var)
       -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     list(APPEND compile_options
-      "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false"
       -Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT}
      -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit)
  endif()
@@ -637,6 +636,7 @@ function(add_integration_test test_name)
  # makes `add_custom_target` construct the correct command and execute it.
  set(test_cmd ${INTEGRATION_TEST_ENV}
+    $<$:LIBOMPTARGET_STACK_SIZE=3072>
     $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR}
     ${INTEGRATION_TEST_LOADER_ARGS}
@@ -790,8 +790,7 @@ function(add_libc_hermetic test_name)
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     target_link_options(${fq_build_target_name} PRIVATE
@@ -859,6 +858,7 @@ function(add_libc_hermetic test_name)
     string(REPLACE " " ";" test_cmd "${test_cmd_parsed}")
   else()
     set(test_cmd ${HERMETIC_TEST_ENV}
+      $<$:LIBOMPTARGET_STACK_SIZE=3072>
       $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR}
       ${HERMETIC_TEST_LOADER_ARGS} $ ${HERMETIC_TEST_ARGS})
   endif()
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 48f095d924931..e876629e6d0e1 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -13,6 +13,9 @@
 #include "src/stdlib/atexit.h"
 #include "src/stdlib/exit.h"
 
+// TODO: Merge this and the NVPTX start files once the common `device_kernel`
+// attribute correctly implies `amdgpu_kernel`.
+
 extern "C" int main(int argc, char **argv, char **envp);
 
 extern "C" void __cxa_finalize(void *dso);
@@ -21,45 +24,18 @@ namespace LIBC_NAMESPACE_DECL {
 // FIXME: Factor this out into common logic so we don't need to stub it here.
 void teardown_main_tls() {}
 
-// FIXME: Touch this symbol to force this to be linked in statically.
-volatile void *dummy = &LIBC_NAMESPACE::rpc::client;
-
 DataEnvironment app;
 
-extern "C" uintptr_t __init_array_start[];
-extern "C" uintptr_t __init_array_end[];
-extern "C" uintptr_t __fini_array_start[];
-extern "C" uintptr_t __fini_array_end[];
-
-using InitCallback = void(int, char **, char **);
-using FiniCallback = void(void);
-
-static void call_init_array_callbacks(int argc, char **argv, char **env) {
-  size_t init_array_size = __init_array_end - __init_array_start;
-  for (size_t i = 0; i < init_array_size; ++i)
-    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
-}
-
-static void call_fini_array_callbacks() {
-  size_t fini_array_size = __fini_array_end - __fini_array_start;
-  for (size_t i = fini_array_size; i > 0; --i)
-    reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
-}
-
 } // namespace LIBC_NAMESPACE_DECL
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
              clang::amdgpu_flat_work_group_size(1, 1),
             clang::amdgpu_max_num_work_groups(1)]] void
-_begin(int argc, char **argv, char **env) {
+_begin(int, char **, char **env) {
+  // The LLVM offloading runtime automatically runs any global constructors
+  // and destructors that are present, so we leave that handling to it.
   __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
                    reinterpret_cast<uintptr_t>(env), __ATOMIC_RELAXED);
-  // We want the fini array callbacks to be run after other atexit
-  // callbacks are run. So, we register them before running the init
-  // array callbacks as they can potentially register their own atexit
-  // callbacks.
-  LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
-  LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
 }
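
Because the offloading runtime now walks the init/fini arrays itself, a
device-side test can rely on ordinary C++ dynamic initialization. A minimal
hypothetical test image (not part of this patch) illustrating the behavior the
new comment describes:

// Hypothetical device-side test: with this patch the offloading runtime runs
// this dynamic initializer before '_begin' is launched, even though the
// startup code above no longer walks __init_array itself.
static int value = []() { return 42; }();

int main() { return value == 42 ? 0 : 1; } // exit code 0 if the ctor ran
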
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index ce8f5bbb6d4eb..822930e22bc26 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -23,48 +23,14 @@ DataEnvironment app;
 // FIXME: Factor this out into common logic so we don't need to stub it here.
 void teardown_main_tls() {}
 
-// FIXME: Touch this symbol to force this to be linked in statically.
-volatile void *dummy = &LIBC_NAMESPACE::rpc::client;
-
-extern "C" {
-// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
-// to manually create them and update the globals in the loader implememtation.
-uintptr_t *__init_array_start [[gnu::visibility("protected")]];
-uintptr_t *__init_array_end [[gnu::visibility("protected")]];
-uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
-uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
-}
-
-// Nvidia requires that the signature of the function pointers match. This means
-// we cannot support the extended constructor arguments.
-using InitCallback = void(void);
-using FiniCallback = void(void);
-
-static void call_init_array_callbacks(int, char **, char **) {
-  size_t init_array_size = __init_array_end - __init_array_start;
-  for (size_t i = 0; i < init_array_size; ++i)
-    reinterpret_cast<InitCallback *>(__init_array_start[i])();
-}
-
-static void call_fini_array_callbacks() {
-  size_t fini_array_size = __fini_array_end - __fini_array_start;
-  for (size_t i = fini_array_size; i > 0; --i)
-    reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
-}
-
 } // namespace LIBC_NAMESPACE_DECL
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_begin(int argc, char **argv, char **env) {
+_begin(int, char **, char **env) {
+  // The LLVM offloading runtime automatically runs any global constructors
+  // and destructors that are present, so we leave that handling to it.
   __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr,
                    reinterpret_cast<uintptr_t>(env), __ATOMIC_RELAXED);
-
-  // We want the fini array callbacks to be run after other atexit
-  // callbacks are run. So, we register them before running the init
-  // array callbacks as they can potentially register their own atexit
-  // callbacks.
-  LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
-  LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
 }
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt
index 729797aa43f0b..b9c5a79849ec8 100644
--- a/llvm/tools/CMakeLists.txt
+++ b/llvm/tools/CMakeLists.txt
@@ -9,10 +9,6 @@
 # traversing each directory.
create_llvm_tool_options() -if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE OR NOT LLVM_LIBC_GPU_BUILD) - set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF) -endif() - if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB) set(LLVM_TOOL_LLVM_SHLIB_BUILD Off) endif() diff --git a/llvm/tools/llvm-gpu-loader/CMakeLists.txt b/llvm/tools/llvm-gpu-loader/CMakeLists.txt index b35a702476ada..de276635e2713 100644 --- a/llvm/tools/llvm-gpu-loader/CMakeLists.txt +++ b/llvm/tools/llvm-gpu-loader/CMakeLists.txt @@ -1,46 +1,12 @@ set(LLVM_LINK_COMPONENTS - BinaryFormat - Object Option Support - FrontendOffloading TargetParser ) add_llvm_tool(llvm-gpu-loader llvm-gpu-loader.cpp - # TODO: We intentionally split this currently due to statically linking the - # GPU runtimes. Dynamically load the dependencies, possibly using the - # LLVM offloading API when it is complete. - PARTIAL_SOURCES_INTENDED - DEPENDS intrinsics_gen ) - -# Locate the RPC server handling interface. -include(FindLibcCommonUtils) -target_link_libraries(llvm-gpu-loader PUBLIC llvm-libc-common-utilities) - -# Check for HSA support for targeting AMD GPUs. -find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) -if(hsa-runtime64_FOUND) - target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp) - target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT) - target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64) - - # Compatibility with the old amdhsa-loader name. - add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader) -endif() - -# Check for CUDA support for targeting NVIDIA GPUs. -find_package(CUDAToolkit 11.2 QUIET) -if(CUDAToolkit_FOUND) - target_sources(llvm-gpu-loader PRIVATE nvptx.cpp) - target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT) - target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver) - - # Compatibility with the old nvptx-loader name. - add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader) -endif() diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp deleted file mode 100644 index 5715058d8cfac..0000000000000 --- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp +++ /dev/null @@ -1,594 +0,0 @@ -//===-- Loader Implementation for AMDHSA devices --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file impelements a simple loader to run images supporting the AMDHSA -// architecture. The file launches the '_start' kernel which should be provided -// by the device application start code and call ultimately call the 'main' -// function. -// -//===----------------------------------------------------------------------===// - -#include "llvm-gpu-loader.h" -#include "server.h" - -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" - -#include "llvm/Frontend/Offloading/Utility.h" - -#include -#include -#include -#include -#include -#include -#include - -// The implicit arguments of COV5 AMDGPU kernels. -struct implicit_args_t { - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint8_t Unused0[46]; - uint16_t grid_dims; - uint8_t Unused1[190]; -}; - -/// Print the error code and exit if \p code indicates an error. 
-static void handle_error_impl(const char *file, int32_t line, - hsa_status_t code) { - if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) - return; - - const char *desc; - if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS) - desc = "Unknown error"; - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc); - exit(EXIT_FAILURE); -} - -/// Generic interface for iterating using the HSA callbacks. -template -hsa_status_t iterate(func_ty func, callback_ty cb) { - auto l = [](elem_ty elem, void *data) -> hsa_status_t { - callback_ty *unwrapped = static_cast(data); - return (*unwrapped)(elem); - }; - return func(l, static_cast(&cb)); -} - -/// Generic interface for iterating using the HSA callbacks. -template -hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) { - auto l = [](elem_ty elem, void *data) -> hsa_status_t { - callback_ty *unwrapped = static_cast(data); - return (*unwrapped)(elem); - }; - return func(func_arg, l, static_cast(&cb)); -} - -/// Iterate through all availible agents. -template -hsa_status_t iterate_agents(callback_ty callback) { - return iterate(hsa_iterate_agents, callback); -} - -/// Iterate through all availible memory pools. -template -hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) { - return iterate(hsa_amd_agent_iterate_memory_pools, - agent, cb); -} - -template -hsa_status_t get_agent(hsa_agent_t *output_agent) { - // Find the first agent with a matching device type. - auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t { - hsa_device_type_t type; - hsa_status_t status = - hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type); - if (status != HSA_STATUS_SUCCESS) - return status; - - if (type == flag) { - // Ensure that a GPU agent supports kernel dispatch packets. - if (type == HSA_DEVICE_TYPE_GPU) { - hsa_agent_feature_t features; - status = - hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features); - if (status != HSA_STATUS_SUCCESS) - return status; - if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) - *output_agent = hsa_agent; - } else { - *output_agent = hsa_agent; - } - return HSA_STATUS_INFO_BREAK; - } - return HSA_STATUS_SUCCESS; - }; - - return iterate_agents(cb); -} - -void print_kernel_resources(const char *kernel_name) { - fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n"); -} - -/// Retrieve a global memory pool with a \p flag from the agent. -template -hsa_status_t get_agent_memory_pool(hsa_agent_t agent, - hsa_amd_memory_pool_t *output_pool) { - auto cb = [&](hsa_amd_memory_pool_t memory_pool) { - uint32_t flags; - hsa_amd_segment_t segment; - if (auto err = hsa_amd_memory_pool_get_info( - memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment)) - return err; - if (auto err = hsa_amd_memory_pool_get_info( - memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags)) - return err; - - if (segment != HSA_AMD_SEGMENT_GLOBAL) - return HSA_STATUS_SUCCESS; - - if (flags & flag) - *output_pool = memory_pool; - - return HSA_STATUS_SUCCESS; - }; - return iterate_agent_memory_pools(agent, cb); -} - -template -hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, - hsa_amd_memory_pool_t kernargs_pool, - hsa_amd_memory_pool_t coarsegrained_pool, - hsa_queue_t *queue, rpc::Server &server, - const LaunchParameters ¶ms, - const char *kernel_name, args_t kernel_args, - uint32_t wavefront_size, bool print_resource_usage) { - // Look up the kernel in the loaded executable. 
- hsa_executable_symbol_t symbol; - if (hsa_status_t err = hsa_executable_get_symbol_by_name( - executable, kernel_name, &dev_agent, &symbol)) - return err; - - // Retrieve different properties of the kernel symbol used for launch. - uint64_t kernel; - uint32_t args_size; - uint32_t group_size; - uint32_t private_size; - bool dynamic_stack; - - std::pair symbol_infos[] = { - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}}; - - for (auto &[info, value] : symbol_infos) - if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value)) - return err; - - // Allocate space for the kernel arguments on the host and allow the GPU agent - // to access it. - void *args; - if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size, - /*flags=*/0, &args)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args); - - // Initialize all the arguments (explicit and implicit) to zero, then set the - // explicit arguments to the values created above. - std::memset(args, 0, args_size); - std::memcpy(args, &kernel_args, std::is_empty_v ? 0 : sizeof(args_t)); - - // Initialize the necessary implicit arguments to the proper values. - int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) + - (params.num_blocks_z * params.num_threads_z != 1); - implicit_args_t *implicit_args = reinterpret_cast( - reinterpret_cast(args) + sizeof(args_t)); - implicit_args->grid_dims = dims; - implicit_args->grid_size_x = params.num_blocks_x; - implicit_args->grid_size_y = params.num_blocks_y; - implicit_args->grid_size_z = params.num_blocks_z; - implicit_args->workgroup_size_x = params.num_threads_x; - implicit_args->workgroup_size_y = params.num_threads_y; - implicit_args->workgroup_size_z = params.num_threads_z; - - // Obtain a packet from the queue. - uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); - while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size) - ; - - const uint32_t mask = queue->size - 1; - hsa_kernel_dispatch_packet_t *packet = - static_cast(queue->base_address) + - (packet_id & mask); - - // Set up the packet for exeuction on the device. We currently only launch - // with one thread on the device, forcing the rest of the wavefront to be - // masked off. - uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = params.num_threads_x; - packet->workgroup_size_y = params.num_threads_y; - packet->workgroup_size_z = params.num_threads_z; - packet->reserved0 = 0; - packet->grid_size_x = params.num_blocks_x * params.num_threads_x; - packet->grid_size_y = params.num_blocks_y * params.num_threads_y; - packet->grid_size_z = params.num_blocks_z * params.num_threads_z; - packet->private_segment_size = - dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size; - packet->group_segment_size = group_size; - packet->kernel_object = kernel; - packet->kernarg_address = args; - packet->reserved2 = 0; - // Create a signal to indicate when this packet has been completed. 
- if (hsa_status_t err = - hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) - handle_error(err); - - if (print_resource_usage) - print_kernel_resources(kernel_name); - - // Initialize the packet header and set the doorbell signal to begin execution - // by the HSA runtime. - uint16_t header = - 1u << HSA_PACKET_HEADER_BARRIER | - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - uint32_t header_word = header | (setup << 16u); - __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE); - hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - - std::atomic finished = false; - std::thread server_thread( - [](std::atomic *finished, rpc::Server *server, - uint32_t wavefront_size, hsa_agent_t dev_agent, - hsa_amd_memory_pool_t coarsegrained_pool) { - // Register RPC callbacks for the malloc and free functions on HSA. - auto malloc_handler = [&](size_t size) -> void * { - void *dev_ptr = nullptr; - if (hsa_amd_memory_pool_allocate(coarsegrained_pool, size, - /*flags=*/0, &dev_ptr)) - dev_ptr = nullptr; - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); - return dev_ptr; - }; - - auto free_handler = [](void *ptr) -> void { - if (hsa_status_t err = - hsa_amd_memory_pool_free(reinterpret_cast(ptr))) - handle_error(err); - }; - - uint32_t index = 0; - while (!*finished) { - if (wavefront_size == 32) - index = - handle_server<32>(*server, index, malloc_handler, free_handler); - else - index = - handle_server<64>(*server, index, malloc_handler, free_handler); - } - }, - &finished, &server, wavefront_size, dev_agent, coarsegrained_pool); - - // Wait until the kernel has completed execution on the device. Periodically - // check the RPC client for work to be performed on the server. - while (hsa_signal_wait_scacquire(packet->completion_signal, - HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; - - finished = true; - if (server_thread.joinable()) - server_thread.join(); - - // Destroy the resources acquired to launch the kernel and return. - if (hsa_status_t err = hsa_amd_memory_pool_free(args)) - handle_error(err); - if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal)) - handle_error(err); - - return HSA_STATUS_SUCCESS; -} - -/// Copies data from the source agent to the destination agent. The source -/// memory must first be pinned explicitly or allocated via HSA. -static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, - const void *src, hsa_agent_t src_agent, - uint64_t size) { - // Create a memory signal to copy information between the host and device. - hsa_signal_t memory_signal; - if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) - return err; - - if (hsa_status_t err = hsa_amd_memory_async_copy( - dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal)) - return err; - - while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) - ; - - if (hsa_status_t err = hsa_signal_destroy(memory_signal)) - return err; - - return HSA_STATUS_SUCCESS; -} - -int load_amdhsa(int argc, const char **argv, const char **envp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage) { - // Initialize the HSA runtime used to communicate with the device. 
- if (hsa_status_t err = hsa_init()) - handle_error(err); - - // Register a callback when the device encounters a memory fault. - if (hsa_status_t err = hsa_amd_register_system_event_handler( - [](const hsa_amd_event_t *event, void *) -> hsa_status_t { - if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) - return HSA_STATUS_ERROR; - return HSA_STATUS_SUCCESS; - }, - nullptr)) - handle_error(err); - - // Obtain a single agent for the device and host to use the HSA memory model. - hsa_agent_t dev_agent; - hsa_agent_t host_agent; - if (hsa_status_t err = get_agent(&dev_agent)) - handle_error(err); - if (hsa_status_t err = get_agent(&host_agent)) - handle_error(err); - - // Load the code object's ISA information and executable data segments. - hsa_code_object_reader_t reader; - if (hsa_status_t err = - hsa_code_object_reader_create_from_memory(image, size, &reader)) - handle_error(err); - - hsa_executable_t executable; - if (hsa_status_t err = hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", - &executable)) - handle_error(err); - - hsa_loaded_code_object_t object; - if (hsa_status_t err = hsa_executable_load_agent_code_object( - executable, dev_agent, reader, "", &object)) - handle_error(err); - - // No modifications to the executable are allowed after this point. - if (hsa_status_t err = hsa_executable_freeze(executable, "")) - handle_error(err); - - // Check the validity of the loaded executable. If the agents ISA features do - // not match the executable's code object it will fail here. - uint32_t result; - if (hsa_status_t err = hsa_executable_validate(executable, &result)) - handle_error(err); - if (result) - handle_error(HSA_STATUS_ERROR); - - if (hsa_status_t err = hsa_code_object_reader_destroy(reader)) - handle_error(err); - - // Obtain memory pools to exchange data between the host and the device. The - // fine-grained pool acts as pinned memory on the host for DMA transfers to - // the device, the coarse-grained pool is for allocations directly on the - // device, and the kernerl-argument pool is for executing the kernel. - hsa_amd_memory_pool_t kernargs_pool; - hsa_amd_memory_pool_t finegrained_pool; - hsa_amd_memory_pool_t coarsegrained_pool; - if (hsa_status_t err = - get_agent_memory_pool( - host_agent, &kernargs_pool)) - handle_error(err); - if (hsa_status_t err = - get_agent_memory_pool( - host_agent, &finegrained_pool)) - handle_error(err); - if (hsa_status_t err = - get_agent_memory_pool( - dev_agent, &coarsegrained_pool)) - handle_error(err); - - // The AMDGPU target can change its wavefront size. There currently isn't a - // good way to look this up through the HSA API so we use the LLVM interface. - uint16_t abi_version; - llvm::StringRef image_ref(reinterpret_cast(image), size); - llvm::StringMap info_map; - if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage( - llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) { - handle_error(llvm::toString(std::move(err)).c_str()); - } - - // Allocate fine-grained memory on the host to hold the pointer array for the - // copied argv and allow the GPU agent to access it. 
- auto allocator = [&](uint64_t size) -> void * { - void *dev_ptr = nullptr; - if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, - /*flags=*/0, &dev_ptr)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); - return dev_ptr; - }; - void *dev_argv = copy_argument_vector(argc, argv, allocator); - if (!dev_argv) - handle_error("Failed to allocate device argv"); - - // Allocate fine-grained memory on the host to hold the pointer array for the - // copied environment array and allow the GPU agent to access it. - void *dev_envp = copy_environment(envp, allocator); - if (!dev_envp) - handle_error("Failed to allocate device environment"); - - // Allocate space for the return pointer and initialize it to zero. - void *dev_ret; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int), - /*flags=*/0, &dev_ret)) - handle_error(err); - hsa_amd_memory_fill(dev_ret, 0, /*count=*/1); - - // Allocate finegrained memory for the RPC server and client to share. - uint32_t wavefront_size = - llvm::max_element(info_map, [](auto &&x, auto &&y) { - return x.second.WavefrontSize < y.second.WavefrontSize; - })->second.WavefrontSize; - - // Set up the RPC server. - void *rpc_buffer; - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, - rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT), - /*flags=*/0, &rpc_buffer)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer); - - rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer); - rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer); - - // Initialize the RPC client on the device by copying the local data to the - // device's internal pointer. - hsa_executable_symbol_t rpc_client_sym; - if (hsa_status_t err = hsa_executable_get_symbol_by_name( - executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym)) - handle_error(err); - - void *rpc_client_dev; - if (hsa_status_t err = hsa_executable_symbol_get_info( - rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &rpc_client_dev)) - handle_error(err); - - void *rpc_client_buffer; - if (hsa_status_t err = - hsa_amd_memory_lock(&client, sizeof(rpc::Client), - /*agents=*/nullptr, 0, &rpc_client_buffer)) - handle_error(err); - - // Copy the RPC client buffer to the address pointed to by the symbol. - if (hsa_status_t err = - hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent, - sizeof(rpc::Client))) - handle_error(err); - - if (hsa_status_t err = hsa_amd_memory_unlock(&client)) - handle_error(err); - - // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. - // If the clock_freq symbol is missing, no work to do. 
- hsa_executable_symbol_t freq_sym; - if (HSA_STATUS_SUCCESS == - hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq", - &dev_agent, &freq_sym)) { - void *host_clock_freq; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t), - /*flags=*/0, &host_clock_freq)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq); - - if (HSA_STATUS_SUCCESS == - hsa_agent_get_info(dev_agent, - static_cast( - HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY), - host_clock_freq)) { - - void *freq_addr; - if (hsa_status_t err = hsa_executable_symbol_get_info( - freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &freq_addr)) - handle_error(err); - - if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq, - host_agent, sizeof(uint64_t))) - handle_error(err); - } - } - - // Obtain a queue with the maximum (power of two) size, used to send commands - // to the HSA runtime and launch execution on the device. - uint64_t queue_size; - if (hsa_status_t err = hsa_agent_get_info( - dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size)) - handle_error(err); - hsa_queue_t *queue = nullptr; - if (hsa_status_t err = - hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, - nullptr, UINT32_MAX, UINT32_MAX, &queue)) - handle_error(err); - - LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; - begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, single_threaded_params, "_begin.kd", init_args, - info_map["_begin"].WavefrontSize, print_resource_usage)) - handle_error(err); - - start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, params, "_start.kd", args, info_map["_start"].WavefrontSize, - print_resource_usage)) - handle_error(err); - - void *host_ret; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int), - /*flags=*/0, &host_ret)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret); - - if (hsa_status_t err = - hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int))) - handle_error(err); - - // Save the return value and perform basic clean-up. - int ret = *static_cast(host_ret); - - end_args_t fini_args = {}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, single_threaded_params, "_end.kd", fini_args, - info_map["_end"].WavefrontSize, print_resource_usage)) - handle_error(err); - - if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer)) - handle_error(err); - - // Free the memory allocated for the device. 
-  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
-    handle_error(err);
-
-  if (hsa_status_t err = hsa_queue_destroy(queue))
-    handle_error(err);
-
-  if (hsa_status_t err = hsa_executable_destroy(executable))
-    handle_error(err);
-
-  if (hsa_status_t err = hsa_shut_down())
-    handle_error(err);
-
-  return ret;
-}
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
index a8204664e85eb..d66f2b8d69b62 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
@@ -35,121 +35,255 @@
 
 using namespace llvm;
 
-static cl::OptionCategory loader_category("loader options");
+static cl::OptionCategory LoaderCategory("loader options");
 
-static cl::opt<bool> help("h", cl::desc("Alias for -help"), cl::Hidden,
-                          cl::cat(loader_category));
+static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden,
+                          cl::cat(LoaderCategory));
 
 static cl::opt<unsigned>
-    threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
-              cl::init(1), cl::cat(loader_category));
+    Threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
+              cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
-              cl::init(1), cl::cat(loader_category));
+    Threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
+              cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
-              cl::init(1), cl::cat(loader_category));
-static cl::alias threads("threads", cl::aliasopt(threads_x),
+    Threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
+              cl::init(1), cl::cat(LoaderCategory));
+static cl::alias threads("threads", cl::aliasopt(Threads_x),
                          cl::desc("Alias for --threads-x"),
-                         cl::cat(loader_category));
+                         cl::cat(LoaderCategory));
 
 static cl::opt<unsigned>
-    blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
-             cl::init(1), cl::cat(loader_category));
+    Blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
+             cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
-             cl::init(1), cl::cat(loader_category));
+    Blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
+             cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
-             cl::init(1), cl::cat(loader_category));
-static cl::alias blocks("blocks", cl::aliasopt(blocks_x),
+    Blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
+             cl::init(1), cl::cat(LoaderCategory));
+static cl::alias Blocks("blocks", cl::aliasopt(Blocks_x),
                         cl::desc("Alias for --blocks-x"),
-                        cl::cat(loader_category));
+                        cl::cat(LoaderCategory));
 
-static cl::opt<bool>
-    print_resource_usage("print-resource-usage",
-                         cl::desc("Output resource usage of launched kernels"),
-                         cl::init(false), cl::cat(loader_category));
-
-static cl::opt<std::string> file(cl::Positional, cl::Required,
+static cl::opt<std::string> File(cl::Positional, cl::Required,
                                  cl::desc(""),
-                                 cl::cat(loader_category));
-static cl::list<std::string> args(cl::ConsumeAfter,
+                                 cl::cat(LoaderCategory));
+static cl::list<std::string> Args(cl::ConsumeAfter,
                                   cl::desc("..."),
-                                  cl::cat(loader_category));
+                                  cl::cat(LoaderCategory));
+
+// The arguments to the '_begin' kernel.
+struct BeginArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+};
+
+// The arguments to the '_start' kernel.
+struct StartArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+  void *Ret;
+};
 
-[[noreturn]] void report_error(Error E) {
+// The arguments to the '_end' kernel.
+struct EndArgs {};
+
+[[noreturn]] static void handleError(Error E) {
   outs().flush();
   logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
   exit(EXIT_FAILURE);
 }
 
-std::string get_main_executable(const char *name) {
-  void *ptr = (void *)(intptr_t)&get_main_executable;
-  auto cow_path = sys::fs::getMainExecutable(name, ptr);
-  return sys::path::parent_path(cow_path).str();
+[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
+  fprintf(stderr, "%s:%d %s\n", __FILE__, Line, Err->Details);
+  exit(EXIT_FAILURE);
+}
+
+#define OFFLOAD_ERR(X)                                                         \
+  if (ol_result_t Err = X)                                                     \
+    handleError(Err, __LINE__);
+
+static void *copyArgumentVector(int Argc, const char **Argv,
+                                ol_device_handle_t Device) {
+  size_t ArgSize = sizeof(char *) * (Argc + 1);
+  size_t StringLen = 0;
+  for (int i = 0; i < Argc; ++i)
+    StringLen += strlen(Argv[i]) + 1;
+
+  // We allocate enough space for a null terminated array and all the strings.
+  void *DevArgv;
+  OFFLOAD_ERR(
+      olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
+  if (!DevArgv)
+    handleError(
+        createStringError("Failed to allocate memory for the argument vector."));
+
+  // Store the strings linearly in the same memory buffer.
+  void *DevString = reinterpret_cast<uint8_t *>(DevArgv) + ArgSize;
+  for (int i = 0; i < Argc; ++i) {
+    size_t size = strlen(Argv[i]) + 1;
+    std::memcpy(DevString, Argv[i], size);
+    static_cast<void **>(DevArgv)[i] = DevString;
+    DevString = reinterpret_cast<uint8_t *>(DevString) + size;
+  }
+
+  // Ensure the vector is null terminated.
+  reinterpret_cast<void **>(DevArgv)[Argc] = nullptr;
+  return DevArgv;
+}
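
For illustration, the packing scheme above can be exercised host-side with a
plain `malloc` standing in for `olMemAlloc` (the `packArgv` helper here is
hypothetical, not part of the patch): one allocation holds the `(Argc + 1)`
entry pointer table followed by the strings back to back, so a single free
later releases everything.

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Same layout as copyArgumentVector: pointer table first, strings after it.
static void *packArgv(int Argc, const char **Argv) {
  size_t TableSize = sizeof(char *) * (Argc + 1);
  size_t StringLen = 0;
  for (int i = 0; i < Argc; ++i)
    StringLen += strlen(Argv[i]) + 1;

  void *Buffer = malloc(TableSize + StringLen);
  char *Cursor = static_cast<char *>(Buffer) + TableSize;
  for (int i = 0; i < Argc; ++i) {
    size_t Size = strlen(Argv[i]) + 1;
    memcpy(Cursor, Argv[i], Size);
    static_cast<char **>(Buffer)[i] = Cursor;
    Cursor += Size;
  }
  static_cast<char **>(Buffer)[Argc] = nullptr; // null-terminate the table
  return Buffer;
}

int main() {
  const char *Argv[] = {"a.out", "--flag"};
  char **Packed = static_cast<char **>(packArgv(2, Argv));
  printf("%s %s\n", Packed[0], Packed[1]); // prints "a.out --flag"
  free(Packed);
}
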
+
+void *copyEnvironment(const char **Envp, ol_device_handle_t Device) {
+  int Envc = 0;
+  for (const char **Env = Envp; *Env != 0; ++Env)
+    ++Envc;
+
+  return copyArgumentVector(Envc, Envp, Device);
+}
+
+ol_device_handle_t findDevice(MemoryBufferRef Binary) {
+  ol_device_handle_t Device;
+  std::tuple Data = std::make_tuple(&Device, &Binary);
+  OFFLOAD_ERR(olIterateDevices(
+      [](ol_device_handle_t Device, void *UserData) {
+        auto &[Output, Binary] = *reinterpret_cast<decltype(Data) *>(UserData);
+        bool IsValid = false;
+        OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(),
+                                    Binary->getBufferSize(), &IsValid));
+        if (!IsValid)
+          return true;
+
+        *Output = Device;
+        return false;
+      },
+      &Data));
+  return Device;
+}
+
+ol_device_handle_t getHostDevice() {
+  ol_device_handle_t Device;
+  OFFLOAD_ERR(olIterateDevices(
+      [](ol_device_handle_t Device, void *UserData) {
+        ol_platform_handle_t Platform;
+        olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
+                        &Platform);
+        ol_platform_backend_t Backend;
+        olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
+                          &Backend);
+
+        auto &Output = *reinterpret_cast<ol_device_handle_t *>(UserData);
+        if (Backend == OL_PLATFORM_BACKEND_HOST) {
+          Output = Device;
+          return false;
+        }
+        return true;
+      },
+      &Device));
+  return Device;
+}
+
+ol_program_handle_t loadBinary(std::vector<uint8_t> &Binary,
+                               std::vector<ol_device_handle_t> &Devices) {
+  for (ol_device_handle_t &Device : Devices) {
+    bool IsValid = false;
+    OFFLOAD_ERR(
+        olIsValidBinary(Device, Binary.data(), Binary.size(), &IsValid));
+    if (!IsValid)
+      continue;
+
+    ol_program_handle_t Program;
+    OFFLOAD_ERR(
+        olCreateProgram(Device, Binary.data(), Binary.size(), &Program));
+    return Program;
+  }
+  handleError(
+      createStringError("No valid device found for '%s'", File.c_str()));
+}
+
+template <typename Args>
+void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device,
+                  ol_program_handle_t Program, const char *Name,
+                  ol_kernel_launch_size_args_t LaunchArgs, Args KernelArgs) {
+  ol_symbol_handle_t Kernel;
+  OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel));
+
+  OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &KernelArgs,
+                             std::is_empty_v<Args> ? 0 : sizeof(Args),
+                             &LaunchArgs));
+}
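
The `std::is_empty_v<Args> ? 0 : sizeof(Args)` guard in `launchKernel` exists
because C++ gives even an empty struct a size of one byte; without it the
argument-less `_end` kernel would be reported to `olLaunchKernel` as taking
one byte of arguments. A compile-time check of that reasoning:

#include <type_traits>

struct Empty {}; // stands in for EndArgs above
static_assert(sizeof(Empty) == 1, "empty classes still occupy one byte");
static_assert((std::is_empty_v<Empty> ? 0 : sizeof(Empty)) == 0,
              "the guard reports an empty argument struct as zero bytes");
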
 
 int main(int argc, const char **argv, const char **envp) {
   sys::PrintStackTraceOnErrorSignal(argv[0]);
 
-  cl::HideUnrelatedOptions(loader_category);
+  cl::HideUnrelatedOptions(LoaderCategory);
   cl::ParseCommandLineOptions(
       argc, argv,
       "A utility used to launch unit tests built for a GPU target. This is\n"
       "intended to provide an interface similar to cross-compiling emulators\n");
 
-  if (help) {
+  if (Help) {
     cl::PrintHelpMessage();
     return EXIT_SUCCESS;
   }
 
-  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
-      MemoryBuffer::getFileOrSTDIN(file);
-  if (std::error_code ec = image_or_err.getError())
-    report_error(errorCodeToError(ec));
-  MemoryBufferRef image = **image_or_err;
-
-  SmallVector<const char *> new_argv = {file.c_str()};
-  llvm::transform(args, std::back_inserter(new_argv),
-                  [](const std::string &arg) { return arg.c_str(); });
-
-  Expected<llvm::object::ELF64LEObjectFile> elf_or_err =
-      llvm::object::ELF64LEObjectFile::create(image);
-  if (!elf_or_err)
-    report_error(elf_or_err.takeError());
-
-  int ret = 1;
-  if (elf_or_err->getArch() == Triple::amdgcn) {
-#ifdef AMDHSA_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_amdhsa(new_argv.size(), new_argv.data(), envp,
-                      const_cast<char *>(image.getBufferStart()),
-                      image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else if (elf_or_err->getArch() == Triple::nvptx64) {
-#ifdef NVPTX_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_nvptx(new_argv.size(), new_argv.data(), envp,
-                     const_cast<char *>(image.getBufferStart()),
-                     image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else {
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-  }
+  if (Error Err = loadLLVMOffload())
+    handleError(std::move(Err));
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
+      MemoryBuffer::getFileOrSTDIN(File);
+  if (std::error_code EC = ImageOrErr.getError())
+    handleError(errorCodeToError(EC));
+  MemoryBufferRef Image = **ImageOrErr;
+
+  SmallVector<const char *> NewArgv = {File.c_str()};
+  llvm::transform(Args, std::back_inserter(NewArgv),
+                  [](const std::string &Arg) { return Arg.c_str(); });
+
+  OFFLOAD_ERR(olInit());
+  ol_device_handle_t Device = findDevice(Image);
+  ol_device_handle_t Host = getHostDevice();
+
+  ol_program_handle_t Program;
+  OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
+                              Image.getBufferSize(), &Program));
+
+  ol_queue_handle_t Queue;
+  OFFLOAD_ERR(olCreateQueue(Device, &Queue));
+
+  int DevArgc = static_cast<int>(NewArgv.size());
+  void *DevArgv = copyArgumentVector(NewArgv.size(), NewArgv.begin(), Device);
+  void *DevEnvp = copyEnvironment(envp, Device);
+
+  void *DevRet;
+  OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
+
+  ol_kernel_launch_size_args_t BeginLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
+  BeginArgs BeginArgs = {DevArgc, DevArgv, DevEnvp};
+  launchKernel(Queue, Device, Program, "_begin", BeginLaunch, BeginArgs);
+  OFFLOAD_ERR(olSyncQueue(Queue));
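
As a worked example of the launch-size struct defined in the header below, a
hypothetical `llvm-gpu-loader --blocks-x=4 --blocks-y=2 --threads-x=32 a.out`
invocation would describe the `_start` launch that follows as:

ol_kernel_launch_size_args_t Example{
    /*Dimensions=*/2,         // Blocks_y > 1 and Blocks_z == 1
    /*NumGroups=*/{4, 2, 1},  // Blocks_x, Blocks_y, Blocks_z
    /*GroupSize=*/{32, 1, 1}, // Threads_x, Threads_y, Threads_z
    /*DynSharedMemory=*/0};   // the loader never requests dynamic shared memory
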
+
+  uint32_t Dims = (Blocks_z > 1) ? 3 : (Blocks_y > 1) ? 2 : 1;
+  ol_kernel_launch_size_args_t StartLaunch{Dims,
+                                           {Blocks_x, Blocks_y, Blocks_z},
+                                           {Threads_x, Threads_y, Threads_z},
+                                           /*SharedMemBytes=*/0};
+  StartArgs StartArgs = {DevArgc, DevArgv, DevEnvp, DevRet};
+  launchKernel(Queue, Device, Program, "_start", StartLaunch, StartArgs);
+
+  ol_kernel_launch_size_args_t EndLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
+  EndArgs EndArgs = {};
+  launchKernel(Queue, Device, Program, "_end", EndLaunch, EndArgs);
+
+  int Ret;
+  OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int)));
+  OFFLOAD_ERR(olSyncQueue(Queue));
+
+  OFFLOAD_ERR(olMemFree(DevArgv));
+  OFFLOAD_ERR(olMemFree(DevEnvp));
+  OFFLOAD_ERR(olDestroyQueue(Queue));
+  OFFLOAD_ERR(olDestroyProgram(Program));
+  OFFLOAD_ERR(olShutDown());
 
-  return ret;
+  return Ret;
 }
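
Note how `olMemcpy` names a device handle for each side of the copy; the host
itself is addressed through the handle returned by `getHostDevice`. A hedged
sketch of a round trip using the same handles `main` sets up above:

// Assumes Device, Host, and Queue were obtained as in main() above.
int Value = 42, Back = 0;
void *DevPtr;
OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevPtr));
OFFLOAD_ERR(olMemcpy(Queue, DevPtr, Device, &Value, Host, sizeof(int)));
OFFLOAD_ERR(olMemcpy(Queue, &Back, Host, DevPtr, Device, sizeof(int)));
OFFLOAD_ERR(olSyncQueue(Queue)); // Back == 42 once the queue drains
OFFLOAD_ERR(olMemFree(DevPtr));
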
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
index 08861c29b4fa4..3990cb3911e30 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
@@ -1,108 +1,177 @@
-//===-- Generic device loader interface -----------------------------------===//
+//===-- Dynamically loaded offload API ------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// Dynamically loads the API provided by the LLVMOffload library. We need to
+// do this dynamically because this tool is used before the offload runtime is
+// actually built, and it should still be provided even when the user did not
+// enable the offload runtime.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H
 #define LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H
 
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-
-/// Generic launch parameters for configuration the number of blocks / threads.
-struct LaunchParameters {
-  uint32_t num_threads_x;
-  uint32_t num_threads_y;
-  uint32_t num_threads_z;
-  uint32_t num_blocks_x;
-  uint32_t num_blocks_y;
-  uint32_t num_blocks_z;
-};
-
-/// The arguments to the '_begin' kernel.
-struct begin_args_t {
-  int argc;
-  void *argv;
-  void *envp;
-};
-
-/// The arguments to the '_start' kernel.
-struct start_args_t {
-  int argc;
-  void *argv;
-  void *envp;
-  void *ret;
-};
-
-/// The arguments to the '_end' kernel.
-struct end_args_t {};
-
-/// Generic interface to load the \p image and launch execution of the _start
-/// kernel on the target device. Copies \p argc and \p argv to the device.
-/// Returns the final value of the `main` function on the device.
-#ifdef AMDHSA_SUPPORT
-int load_amdhsa(int argc, const char **argv, const char **evnp, void *image,
-                size_t size, const LaunchParameters &params,
-                bool print_resource_usage);
-#endif
-#ifdef NVPTX_SUPPORT
-int load_nvptx(int argc, const char **argv, const char **evnp, void *image,
-               size_t size, const LaunchParameters &params,
-               bool print_resource_usage);
-#endif
-
-/// Return \p V aligned "upwards" according to \p Align.
-template <typename V, typename A> inline V align_up(V val, A align) {
-  return ((val + V(align) - 1) / V(align)) * V(align);
-}
-
-/// Copy the system's argument vector to GPU memory allocated using \p alloc.
-template <typename Allocator>
-void *copy_argument_vector(int argc, const char **argv, Allocator alloc) {
-  size_t argv_size = sizeof(char *) * (argc + 1);
-  size_t str_size = 0;
-  for (int i = 0; i < argc; ++i)
-    str_size += strlen(argv[i]) + 1;
-
-  // We allocate enough space for a null terminated array and all the strings.
-  void *dev_argv = alloc(argv_size + str_size);
-  if (!dev_argv)
-    return nullptr;
-
-  // Store the strings linerally in the same memory buffer.
-  void *dev_str = reinterpret_cast<uint8_t *>(dev_argv) + argv_size;
-  for (int i = 0; i < argc; ++i) {
-    size_t size = strlen(argv[i]) + 1;
-    std::memcpy(dev_str, argv[i], size);
-    static_cast<void **>(dev_argv)[i] = dev_str;
-    dev_str = reinterpret_cast<uint8_t *>(dev_str) + size;
-  }
-
-  // Ensure the vector is null terminated.
-  reinterpret_cast<void **>(dev_argv)[argc] = nullptr;
-  return dev_argv;
-}
-
-/// Copy the system's environment to GPU memory allocated using \p alloc.
-template <typename Allocator>
-void *copy_environment(const char **envp, Allocator alloc) {
-  int envc = 0;
-  for (const char **env = envp; *env != 0; ++env)
-    ++envc;
-
-  return copy_argument_vector(envc, envp, alloc);
-}
-
-inline void handle_error_impl(const char *file, int32_t line, const char *msg) {
-  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg);
-  exit(EXIT_FAILURE);
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/Error.h"
+
+typedef enum ol_alloc_type_t {
+  OL_ALLOC_TYPE_HOST = 0,
+  OL_ALLOC_TYPE_DEVICE = 1,
+  OL_ALLOC_TYPE_FORCE_UINT32 = 0x7fffffff
+} ol_alloc_type_t;
+
+typedef enum ol_device_info_t {
+  OL_DEVICE_INFO_TYPE = 0,
+  OL_DEVICE_INFO_PLATFORM = 1,
+  OL_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff
+} ol_device_info_t;
+
+typedef enum ol_platform_info_t {
+  OL_PLATFORM_INFO_NAME = 0,
+  OL_PLATFORM_INFO_BACKEND = 3,
+  OL_PLATFORM_INFO_FORCE_UINT32 = 0x7fffffff
+} ol_platform_info_t;
+
+typedef enum ol_symbol_kind_t {
+  OL_SYMBOL_KIND_KERNEL = 0,
+  OL_SYMBOL_KIND_GLOBAL_VARIABLE = 1,
+  OL_SYMBOL_KIND_FORCE_UINT32 = 0x7fffffff
+} ol_symbol_kind_t;
+
+typedef enum ol_errc_t {
+  OL_ERRC_SUCCESS = 0,
+  OL_ERRC_FORCE_UINT32 = 0x7fffffff
+} ol_errc_t;
+
+typedef struct ol_error_struct_t {
+  ol_errc_t Code;
+  const char *Details;
+} ol_error_struct_t;
+
+typedef struct ol_dimensions_t {
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+} ol_dimensions_t;
+
+typedef struct ol_kernel_launch_size_args_t {
+  size_t Dimensions;
+  struct ol_dimensions_t NumGroups;
+  struct ol_dimensions_t GroupSize;
+  size_t DynSharedMemory;
+} ol_kernel_launch_size_args_t;
+
+typedef enum ol_platform_backend_t {
+  OL_PLATFORM_BACKEND_UNKNOWN = 0,
+  OL_PLATFORM_BACKEND_CUDA = 1,
+  OL_PLATFORM_BACKEND_AMDGPU = 2,
+  OL_PLATFORM_BACKEND_HOST = 3,
+  OL_PLATFORM_BACKEND_LAST = 4,
+  OL_PLATFORM_BACKEND_FORCE_UINT32 = 0x7fffffff
+} ol_platform_backend_t;
+
+typedef struct ol_device_impl_t *ol_device_handle_t;
+typedef struct ol_platform_impl_t *ol_platform_handle_t;
+typedef struct ol_program_impl_t *ol_program_handle_t;
+typedef struct ol_queue_impl_t *ol_queue_handle_t;
+typedef struct ol_symbol_impl_t *ol_symbol_handle_t;
+typedef const struct ol_error_struct_t *ol_result_t;
+
+typedef bool (*ol_device_iterate_cb_t)(ol_device_handle_t Device,
+                                       void *UserData);
+
+ol_result_t (*olInit)();
+ol_result_t (*olShutDown)();
+
+ol_result_t (*olIterateDevices)(ol_device_iterate_cb_t Callback,
+                                void *UserData);
+
+ol_result_t (*olIsValidBinary)(ol_device_handle_t Device, const void *ProgData,
+                               size_t ProgDataSize, bool *Valid);
+
+ol_result_t (*olCreateProgram)(ol_device_handle_t Device, const void *ProgData,
+                               size_t ProgDataSize,
+                               ol_program_handle_t *Program);
+
+ol_result_t (*olDestroyProgram)(ol_program_handle_t Program);
+
+ol_result_t (*olGetSymbol)(ol_program_handle_t Program, const char *Name,
+                           ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol);
+
+ol_result_t (*olLaunchKernel)(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_symbol_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_args_t *LaunchSizeArgs);
+
+ol_result_t (*olCreateQueue)(ol_device_handle_t Device,
+                             ol_queue_handle_t *Queue);
+
+ol_result_t (*olDestroyQueue)(ol_queue_handle_t Queue);
+
+ol_result_t (*olSyncQueue)(ol_queue_handle_t Queue);
+
+ol_result_t (*olMemAlloc)(ol_device_handle_t Device, ol_alloc_type_t Type,
+                          size_t Size, void **AllocationOut);
+
+ol_result_t (*olMemFree)(void *Address);
+
+ol_result_t (*olMemcpy)(ol_queue_handle_t Queue, void *DstPtr,
+                        ol_device_handle_t DstDevice, const void *SrcPtr,
+                        ol_device_handle_t SrcDevice, size_t Size);
+
+ol_result_t (*olGetDeviceInfo)(ol_device_handle_t Device,
+                               ol_device_info_t PropName, size_t PropSize,
+                               void *PropValue);
+
+ol_result_t (*olGetPlatformInfo)(ol_platform_handle_t Platform,
+                                 ol_platform_info_t PropName, size_t PropSize,
+                                 void *PropValue);
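
As a usage sketch (not part of the patch): once `loadLLVMOffload` below has
populated these pointers, the iteration callback's convention of returning
`true` to continue and `false` to stop makes counting devices a one-line
callback. The `countDevices` helper here is hypothetical.

// Hypothetical helper built on the table above; assumes olInit() succeeded.
static size_t countDevices() {
  size_t Count = 0;
  olIterateDevices(
      [](ol_device_handle_t, void *UserData) {
        ++*static_cast<size_t *>(UserData);
        return true; // keep iterating over every device
      },
      &Count);
  return Count;
}
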
+
+llvm::Error loadLLVMOffload() {
+  constexpr const char *OffloadLibrary = "libLLVMOffload.so";
+
+  std::string ErrMsg;
+  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
+      llvm::sys::DynamicLibrary::getPermanentLibrary(OffloadLibrary, &ErrMsg));
+
+  if (!DynlibHandle->isValid())
+    return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                   "Failed to dlopen %s: %s", OffloadLibrary,
+                                   ErrMsg.c_str());
+
+#define DYNAMIC_INIT(SYM)                                                      \
+  do {                                                                         \
+    void *Ptr = DynlibHandle->getAddressOfSymbol(#SYM);                        \
+    if (!Ptr)                                                                  \
+      return llvm::createStringError(                                          \
+          llvm::inconvertibleErrorCode(), "Missing symbol '%s' in %s",         \
+          reinterpret_cast<const char *>(#SYM), OffloadLibrary);               \
+    SYM = reinterpret_cast<decltype(SYM)>(Ptr);                                \
+  } while (0)
+
+  DYNAMIC_INIT(olInit);
+  DYNAMIC_INIT(olShutDown);
+  DYNAMIC_INIT(olIterateDevices);
+  DYNAMIC_INIT(olIsValidBinary);
+  DYNAMIC_INIT(olCreateProgram);
+  DYNAMIC_INIT(olDestroyProgram);
+  DYNAMIC_INIT(olGetSymbol);
+  DYNAMIC_INIT(olLaunchKernel);
+  DYNAMIC_INIT(olCreateQueue);
+  DYNAMIC_INIT(olDestroyQueue);
+  DYNAMIC_INIT(olSyncQueue);
+  DYNAMIC_INIT(olMemAlloc);
+  DYNAMIC_INIT(olMemFree);
+  DYNAMIC_INIT(olMemcpy);
+  DYNAMIC_INIT(olGetDeviceInfo);
+  DYNAMIC_INIT(olGetPlatformInfo);
+#undef DYNAMIC_INIT
+
+  return llvm::Error::success();
 }
 
-#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)
 
 #endif // LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H
diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp
deleted file mode 100644
index 82b455249ad24..0000000000000
--- a/llvm/tools/llvm-gpu-loader/nvptx.cpp
+++ /dev/null
@@ -1,367 +0,0 @@
-//===-- Loader Implementation for NVPTX devices --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file impelements a simple loader to run images supporting the NVPTX
-// architecture. The file launches the '_start' kernel which should be provided
-// by the device application start code and call ultimately call the 'main'
-// function.
-// -//===----------------------------------------------------------------------===// - -#include "llvm-gpu-loader.h" -#include "server.h" - -#include "cuda.h" - -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; -using namespace object; - -static void handle_error_impl(const char *file, int32_t line, CUresult err) { - if (err == CUDA_SUCCESS) - return; - - const char *err_str = nullptr; - CUresult result = cuGetErrorString(err, &err_str); - if (result != CUDA_SUCCESS) - fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line); - else - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str); - exit(1); -} - -// Gets the names of all the globals that contain functions to initialize or -// deinitialize. We need to do this manually because the NVPTX toolchain does -// not contain the necessary binary manipulation tools. -template -Expected get_ctor_dtor_array(const void *image, const size_t size, - Alloc allocator, CUmodule binary) { - auto mem_buffer = MemoryBuffer::getMemBuffer( - StringRef(reinterpret_cast(image), size), "image", - /*RequiresNullTerminator=*/false); - Expected elf_or_err = - ELF64LEObjectFile::create(*mem_buffer); - if (!elf_or_err) - handle_error(toString(elf_or_err.takeError()).c_str()); - - std::vector> ctors; - std::vector> dtors; - // CUDA has no way to iterate over all the symbols so we need to inspect the - // ELF directly using the LLVM libraries. - for (const auto &symbol : elf_or_err->symbols()) { - auto name_or_err = symbol.getName(); - if (!name_or_err) - handle_error(toString(name_or_err.takeError()).c_str()); - - // Search for all symbols that contain a constructor or destructor. - if (!name_or_err->starts_with("__init_array_object_") && - !name_or_err->starts_with("__fini_array_object_")) - continue; - - uint16_t priority; - if (name_or_err->rsplit('_').second.getAsInteger(10, priority)) - handle_error("Invalid priority for constructor or destructor"); - - if (name_or_err->starts_with("__init")) - ctors.emplace_back(std::make_pair(name_or_err->data(), priority)); - else - dtors.emplace_back(std::make_pair(name_or_err->data(), priority)); - } - // Lower priority constructors are run before higher ones. The reverse is true - // for destructors. - llvm::sort(ctors, llvm::less_second()); - llvm::sort(dtors, llvm::less_second()); - - // Allocate host pinned memory to make these arrays visible to the GPU. - CUdeviceptr *dev_memory = reinterpret_cast(allocator( - ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); - uint64_t global_size = 0; - - // Get the address of the global and then store the address of the constructor - // function to call in the constructor array. - CUdeviceptr *dev_ctors_start = dev_memory; - CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); - for (uint64_t i = 0; i < ctors.size(); ++i) { - CUdeviceptr dev_ptr; - if (CUresult err = - cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) - handle_error(err); - if (CUresult err = - cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) - handle_error(err); - } - - // Get the address of the global and then store the address of the destructor - // function to call in the destructor array. 
- CUdeviceptr *dev_dtors_start = dev_ctors_end; - CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); - for (uint64_t i = 0; i < dtors.size(); ++i) { - CUdeviceptr dev_ptr; - if (CUresult err = - cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) - handle_error(err); - if (CUresult err = - cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) - handle_error(err); - } - - // Obtain the address of the pointers the startup implementation uses to - // iterate the constructors and destructors. - CUdeviceptr init_start; - if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, - "__init_array_start")) - handle_error(err); - CUdeviceptr init_end; - if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, - "__init_array_end")) - handle_error(err); - CUdeviceptr fini_start; - if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, - "__fini_array_start")) - handle_error(err); - CUdeviceptr fini_end; - if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, - "__fini_array_end")) - handle_error(err); - - // Copy the pointers to the newly written array to the symbols so the startup - // implementation can iterate them. - if (CUresult err = - cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = - cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) - handle_error(err); - - return dev_memory; -} - -void print_kernel_resources(CUmodule binary, const char *kernel_name) { - CUfunction function; - if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) - handle_error(err); - int num_regs; - if (CUresult err = - cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) - handle_error(err); - printf("Executing kernel %s:\n", kernel_name); - printf("%6s registers: %d\n", kernel_name, num_regs); -} - -template -CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, - const LaunchParameters ¶ms, const char *kernel_name, - args_t kernel_args, bool print_resource_usage) { - // look up the '_start' kernel in the loaded module. - CUfunction function; - if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) - handle_error(err); - - // Set up the arguments to the '_start' kernel on the GPU. - uint64_t args_size = std::is_empty_v ? 0 : sizeof(args_t); - void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, - CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, - CU_LAUNCH_PARAM_END}; - if (print_resource_usage) - print_kernel_resources(binary, kernel_name); - - // Initialize a non-blocking CUDA stream to allocate memory if needed. - // This needs to be done on a separate stream or else it will deadlock - // with the executing kernel. - CUstream memory_stream; - if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) - handle_error(err); - - std::atomic finished = false; - std::thread server_thread( - [](std::atomic *finished, rpc::Server *server, - CUstream memory_stream) { - auto malloc_handler = [&](size_t size) -> void * { - CUdeviceptr dev_ptr; - if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) - dev_ptr = 0UL; - - // Wait until the memory allocation is complete. 
-
-int load_nvptx(int argc, const char **argv, const char **envp, void *image,
-               size_t size, const LaunchParameters &params,
-               bool print_resource_usage) {
-  if (CUresult err = cuInit(0))
-    handle_error(err);
-  // Obtain the first device found on the system.
-  uint32_t device_id = 0;
-  CUdevice device;
-  if (CUresult err = cuDeviceGet(&device, device_id))
-    handle_error(err);
-
-  // Initialize the CUDA context and claim it for this execution.
-  CUcontext context;
-  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
-    handle_error(err);
-  if (CUresult err = cuCtxSetCurrent(context))
-    handle_error(err);
-
-  // Increase the stack size per thread.
-  // TODO: We should allow this to be passed in so only the tests that
-  // require a larger stack can specify it to save on memory usage.
-  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
-    handle_error(err);
-
-  // Initialize a non-blocking CUDA stream to execute the kernel.
-  CUstream stream;
-  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
-    handle_error(err);
-
-  // Load the image into a CUDA module.
-  CUmodule binary;
-  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
-    handle_error(err);
-
-  // Allocate pinned memory on the host to hold the pointer array for the
-  // copied argv and allow the GPU device to access it.
-  auto allocator = [&](uint64_t size) -> void * {
-    void *dev_ptr;
-    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
-      handle_error(err);
-    return dev_ptr;
-  };
-
-  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
-  if (!memory_or_err)
-    handle_error(toString(memory_or_err.takeError()).c_str());
-
-  void *dev_argv = copy_argument_vector(argc, argv, allocator);
-  if (!dev_argv)
-    handle_error("Failed to allocate device argv");
-
-  // Allocate pinned memory on the host to hold the pointer array for the
-  // copied environment array and allow the GPU device to access it.
-  void *dev_envp = copy_environment(envp, allocator);
-  if (!dev_envp)
-    handle_error("Failed to allocate device environment");
-
-  // Allocate space for the return pointer and initialize it to zero.
-  CUdeviceptr dev_ret;
-  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
-    handle_error(err);
-  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
-    handle_error(err);
-
-  uint32_t warp_size = 32;
-  void *rpc_buffer = nullptr;
-  if (CUresult err = cuMemAllocHost(
-          &rpc_buffer,
-          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
-    handle_error(err);
-  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
-  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
-
-  // Initialize the RPC client on the device by copying the local data to the
-  // device's internal pointer.
-  CUdeviceptr rpc_client_dev = 0;
-  uint64_t client_ptr_size = sizeof(void *);
-  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
-                                       binary, "__llvm_rpc_client"))
-    handle_error(err);
-
-  if (CUresult err =
-          cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
-    handle_error(err);
-
-  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc, dev_argv, dev_envp};
-  if (CUresult err =
-          launch_kernel(binary, stream, server, single_threaded_params,
-                        "_begin", init_args, print_resource_usage))
-    handle_error(err);
-
-  start_args_t args = {argc, dev_argv, dev_envp,
-                       reinterpret_cast<void *>(dev_ret)};
-  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
-                                   args, print_resource_usage))
-    handle_error(err);
-
-  // Copy the return value back from the kernel and wait.
-  int host_ret = 0;
-  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
-    handle_error(err);
-
-  if (CUresult err = cuStreamSynchronize(stream))
-    handle_error(err);
-
-  end_args_t fini_args = {};
-  if (CUresult err =
-          launch_kernel(binary, stream, server, single_threaded_params, "_end",
-                        fini_args, print_resource_usage))
-    handle_error(err);
-
-  // Free the memory allocated for the device.
-  if (CUresult err = cuMemFreeHost(*memory_or_err))
-    handle_error(err);
-  if (CUresult err = cuMemFree(dev_ret))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(dev_argv))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(rpc_buffer))
-    handle_error(err);
-
-  // Unload the module and release the primary context.
-  if (CUresult err = cuModuleUnload(binary))
-    handle_error(err);
-  if (CUresult err = cuDevicePrimaryCtxRelease(device))
-    handle_error(err);
-  return host_ret;
-}
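For context, the generic entry point in llvm-gpu-loader.cpp (not part of this
hunk) dispatched to `load_nvptx` after detecting an NVPTX ELF image. A rough
sketch of that call, with `image` and `size` standing in for the
already-read binary buffer:

  LaunchParameters params = {/*blocks=*/1, 1, 1, /*threads=*/1, 1, 1};
  int ret = load_nvptx(argc, argv, envp, image, size, params,
                       /*print_resource_usage=*/false);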
diff --git a/llvm/tools/llvm-gpu-loader/server.h b/llvm/tools/llvm-gpu-loader/server.h
deleted file mode 100644
index da73cc007f5d5..0000000000000
--- a/llvm/tools/llvm-gpu-loader/server.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- Common RPC server handler -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
-#define LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
-
-#include <cstddef>
-#include <cstdint>
-
-#include "shared/rpc.h"
-#include "shared/rpc_opcodes.h"
-#include "shared/rpc_server.h"
-
-template <uint32_t num_lanes, typename Alloc, typename Free>
-inline uint32_t handle_server(rpc::Server &server, uint32_t index,
-                              Alloc &&alloc, Free &&free) {
-  auto port = server.try_open(num_lanes, index);
-  if (!port)
-    return 0;
-  index = port->get_index() + 1;
-
-  int status = rpc::RPC_SUCCESS;
-  switch (port->get_opcode()) {
-  case LIBC_MALLOC: {
-    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
-    });
-    break;
-  }
-  case LIBC_FREE: {
-    port->recv([&](rpc::Buffer *buffer, uint32_t) {
-      free(reinterpret_cast<void *>(buffer->data[0]));
-    });
-    break;
-  }
-  // Handle all of the `libc` specific opcodes.
-  default:
-    status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*port, num_lanes);
-    break;
-  }
-
-  if (status != rpc::RPC_SUCCESS)
-    handle_error("Error handling RPC server");
-
-  port->close();
-
-  return index;
-}
-
-#endif // LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
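For reference, `handle_server` was meant to be pumped in a loop on a dedicated
host thread until the kernel finished, exactly as the deleted nvptx.cpp did. A
minimal sketch, with plain `std::malloc`/`std::free` standing in for the
pinned-memory handlers the real loaders installed:

  // Assumes <atomic>, <cstdlib>, and an initialized rpc::Server `server`.
  std::atomic<bool> finished = false;
  uint32_t index = 0;
  while (!finished)
    index = handle_server<32>(
        server, index, [](uint64_t size) { return std::malloc(size); },
        [](void *ptr) { std::free(ptr); });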