Skip to content

Commit 8c9c91f

Browse files
authored
[libc] Make GPU _end kernel only call exit callbacks (#162371)
Summary: We use the infrastructure to stand up a pretend hosted environment on the GPU. Part of that is calling exit codes and handling the callback. Exiting from inside a GPU region is problematic as it actually relies on a lot of GPU magic behind the scenes. This is at least *correct* now as we use `quick_exit` on the CPU when the GPU calls `exit`. However, calling `quick_exit` will interfere with instrumentation or benchmarking that expects a nice teardown order. For normal execution we should do the friendly option and let the loader utility clean everything up manually.
1 parent 2690bb6 commit 8c9c91f

File tree

5 files changed

+15
-17
lines changed

5 files changed

+15
-17
lines changed

libc/startup/gpu/amdgpu/start.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "src/stdlib/exit.h"
1515

1616
extern "C" int main(int argc, char **argv, char **envp);
17+
extern "C" void __cxa_finalize(void *dso);
1718

1819
namespace LIBC_NAMESPACE_DECL {
1920

@@ -68,9 +69,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
6869
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
6970
clang::amdgpu_flat_work_group_size(1, 1),
7071
clang::amdgpu_max_num_work_groups(1)]] void
71-
_end(int retval) {
72-
// Only a single thread should call `exit` here, the rest should gracefully
73-
// return from the kernel. This is so only one thread calls the destructors
74-
// registered with 'atexit' above.
75-
LIBC_NAMESPACE::exit(retval);
72+
_end() {
73+
// Only a single thread should call the destructors registered with 'atexit'.
74+
// The loader utility will handle the actual exit and return code cleanly.
75+
__cxa_finalize(nullptr);
7676
}

libc/startup/gpu/nvptx/start.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "src/stdlib/exit.h"
1515

1616
extern "C" int main(int argc, char **argv, char **envp);
17+
extern "C" void __cxa_finalize(void *dso);
1718

1819
namespace LIBC_NAMESPACE_DECL {
1920

@@ -70,9 +71,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
7071
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
7172
}
7273

73-
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
74-
_end(int retval) {
75-
// To finish the execution we invoke all the callbacks registered via 'atexit'
76-
// and then exit with the appropriate return value.
77-
LIBC_NAMESPACE::exit(retval);
74+
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _end() {
75+
// Only a single thread should call the destructors registered with 'atexit'.
76+
// The loader utility will handle the actual exit and return code cleanly.
77+
__cxa_finalize(nullptr);
7878
}

llvm/tools/llvm-gpu-loader/amdhsa.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
192192
// Initialize all the arguments (explicit and implicit) to zero, then set the
193193
// explicit arguments to the values created above.
194194
std::memset(args, 0, args_size);
195-
std::memcpy(args, &kernel_args, sizeof(args_t));
195+
std::memcpy(args, &kernel_args, std::is_empty_v<args_t> ? 0 : sizeof(args_t));
196196

197197
// Initialize the necessary implicit arguments to the proper values.
198198
int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
@@ -563,7 +563,7 @@ int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
563563
// Save the return value and perform basic clean-up.
564564
int ret = *static_cast<int *>(host_ret);
565565

566-
end_args_t fini_args = {ret};
566+
end_args_t fini_args = {};
567567
if (hsa_status_t err = launch_kernel(
568568
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
569569
server, single_threaded_params, "_end.kd", fini_args,

llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ struct start_args_t {
4141
};
4242

4343
/// The arguments to the '_end' kernel.
44-
struct end_args_t {
45-
int argc;
46-
};
44+
struct end_args_t {};
4745

4846
/// Generic interface to load the \p image and launch execution of the _start
4947
/// kernel on the target device. Copies \p argc and \p argv to the device.

llvm/tools/llvm-gpu-loader/nvptx.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
177177
handle_error(err);
178178

179179
// Set up the arguments to the '_start' kernel on the GPU.
180-
uint64_t args_size = sizeof(args_t);
180+
uint64_t args_size = std::is_empty_v<args_t> ? 0 : sizeof(args_t);
181181
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
182182
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
183183
CU_LAUNCH_PARAM_END};
@@ -342,7 +342,7 @@ int load_nvptx(int argc, const char **argv, const char **envp, void *image,
342342
if (CUresult err = cuStreamSynchronize(stream))
343343
handle_error(err);
344344

345-
end_args_t fini_args = {host_ret};
345+
end_args_t fini_args = {};
346346
if (CUresult err =
347347
launch_kernel(binary, stream, server, single_threaded_params, "_end",
348348
fini_args, print_resource_usage))

0 commit comments

Comments
 (0)