diff --git a/libc/src/__support/OSUtil/gpu/exit.cpp b/libc/src/__support/OSUtil/gpu/exit.cpp index 8aaa41b4e3eef..0cb266a42d180 100644 --- a/libc/src/__support/OSUtil/gpu/exit.cpp +++ b/libc/src/__support/OSUtil/gpu/exit.cpp @@ -8,6 +8,7 @@ #include "src/__support/OSUtil/exit.h" +#include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index a257003a907de..45a7ce3109f57 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -20,7 +20,6 @@ #include "rpc_util.h" #include "src/__support/CPP/optional.h" -#include "src/__support/GPU/utils.h" #include "src/__support/macros/config.h" #include @@ -38,6 +37,9 @@ namespace rpc { #define __scoped_atomic_fetch_and(src, val, ord, scp) \ __atomic_fetch_and(src, val, ord) #endif +#if !__has_builtin(__scoped_atomic_thread_fence) +#define __scoped_atomic_thread_fence(ord, scp) __atomic_thread_fence(ord) +#endif /// A fixed size channel used to communicate between the RPC client and server. struct Buffer { @@ -110,14 +112,14 @@ template struct Process { /// Retrieve the inbox state from memory shared between processes. LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const { - return gpu::broadcast_value( + return rpc::broadcast_value( lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM)); } /// Retrieve the outbox state from memory shared between processes. LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const { - return gpu::broadcast_value( + return rpc::broadcast_value( lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM)); } @@ -128,7 +130,7 @@ template struct Process { /// cheaper than calling load_outbox to get the value to store. 
LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) { uint32_t inverted_outbox = !current_outbox; - __atomic_thread_fence(__ATOMIC_RELEASE); + __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM); __scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); return inverted_outbox; @@ -142,7 +144,7 @@ template struct Process { sleep_briefly(); in = load_inbox(lane_mask, index); } - __atomic_thread_fence(__ATOMIC_ACQUIRE); + __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM); } /// The packet is a linearly allocated array of buffers used to communicate @@ -162,9 +164,10 @@ template struct Process { /// Attempt to claim the lock at index. Return true on lock taken. /// lane_mask is a bitmap of the threads in the warp that would hold the - /// single lock on success, e.g. the result of gpu::get_lane_mask() + /// single lock on success, e.g. the result of rpc::get_lane_mask() /// The lock is held when the n-th bit of the lock bitfield is set. - LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) { + [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask, + uint32_t index) { // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice // On volta, need to handle differences between the threads running and // the threads that were detected in the previous call to get_lane_mask() @@ -173,12 +176,12 @@ template struct Process { // There may be threads active which are not in lane mask which must not // succeed in taking the lock, as otherwise it will leak. This is handled // by making threads which are not in lane_mask or with 0, a no-op. - uint32_t id = gpu::get_lane_id(); + uint32_t id = rpc::get_lane_id(); bool id_in_lane_mask = lane_mask & (1ul << id); // All threads in the warp call fetch_or. Possibly at the same time. 
bool before = set_nth(lock, index, id_in_lane_mask); - uint64_t packed = gpu::ballot(lane_mask, before); + uint64_t packed = rpc::ballot(lane_mask, before); // If every bit set in lane_mask is also set in packed, every single thread // in the warp failed to get the lock. Ballot returns unset for threads not @@ -198,22 +201,23 @@ template struct Process { // inlining the current function. bool holding_lock = lane_mask != packed; if (holding_lock) - __atomic_thread_fence(__ATOMIC_ACQUIRE); + __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE); return holding_lock; } /// Unlock the lock at index. We need a lane sync to keep this function /// convergent, otherwise the compiler will sink the store and deadlock. - LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) { + [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask, + uint32_t index) { // Do not move any writes past the unlock. - __atomic_thread_fence(__ATOMIC_RELEASE); + __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_DEVICE); // Use exactly one thread to clear the nth bit in the lock array Must // restrict to a single thread to avoid one thread dropping the lock, then // an unrelated warp claiming the lock, then a second thread in this warp // dropping the lock again. - clear_nth(lock, index, gpu::is_first_lane(lane_mask)); - gpu::sync_lane(lane_mask); + clear_nth(lock, index, rpc::is_first_lane(lane_mask)); + rpc::sync_lane(lane_mask); } /// Number of bytes to allocate for an inbox or outbox. 
@@ -276,9 +280,9 @@ template LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size, uint64_t lane_mask, Buffer *slot) { if constexpr (is_process_gpu()) { - fn(&slot[gpu::get_lane_id()], gpu::get_lane_id()); + fn(&slot[rpc::get_lane_id()], rpc::get_lane_id()); } else { - for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size()) + for (uint32_t i = 0; i < lane_size; i += rpc::get_num_lanes()) if (lane_mask & (1ul << i)) fn(&slot[i], i); } @@ -323,7 +327,7 @@ template struct Port { LIBC_INLINE void close() { // Wait for all lanes to finish using the port. - gpu::sync_lane(lane_mask); + rpc::sync_lane(lane_mask); // The server is passive, if it own the buffer when it closes we need to // give ownership back to the client. @@ -466,7 +470,7 @@ LIBC_INLINE void Port::send_n(const void *const *src, uint64_t *size) { }); uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t); uint64_t mask = process.header[index].mask; - while (gpu::ballot(mask, idx < num_sends)) { + while (rpc::ballot(mask, idx < num_sends)) { send([=](Buffer *buffer, uint32_t id) { uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data) ? sizeof(Buffer::data) @@ -499,7 +503,7 @@ LIBC_INLINE void Port::recv_n(void **dst, uint64_t *size, A &&alloc) { }); uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t); uint64_t mask = process.header[index].mask; - while (gpu::ballot(mask, idx < num_recvs)) { + while (rpc::ballot(mask, idx < num_recvs)) { recv([=](Buffer *buffer, uint32_t id) { uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data) ? sizeof(Buffer::data) @@ -517,16 +521,17 @@ LIBC_INLINE void Port::recv_n(void **dst, uint64_t *size, A &&alloc) { /// port. Each port instance uses an associated \p opcode to tell the server /// what to do. The Client interface provides the appropriate lane size to the /// port using the platform's returned value. 
-template LIBC_INLINE Client::Port Client::open() { +template +[[clang::convergent]] LIBC_INLINE Client::Port Client::open() { // Repeatedly perform a naive linear scan for a port that can be opened to // send data. - for (uint32_t index = gpu::get_cluster_id();; ++index) { + for (uint32_t index = 0;; ++index) { // Start from the beginning if we run out of ports to check. if (index >= process.port_count) index = 0; // Attempt to acquire the lock on this index. - uint64_t lane_mask = gpu::get_lane_mask(); + uint64_t lane_mask = rpc::get_lane_mask(); if (!process.try_lock(lane_mask, index)) continue; @@ -540,22 +545,22 @@ template LIBC_INLINE Client::Port Client::open() { continue; } - if (gpu::is_first_lane(lane_mask)) { + if (rpc::is_first_lane(lane_mask)) { process.header[index].opcode = opcode; process.header[index].mask = lane_mask; } - gpu::sync_lane(lane_mask); - return Port(process, lane_mask, gpu::get_lane_size(), index, out); + rpc::sync_lane(lane_mask); + return Port(process, lane_mask, rpc::get_num_lanes(), index, out); } } /// Attempts to open a port to use as the server. The server can only open a /// port if it has a pending receive operation -LIBC_INLINE cpp::optional +[[clang::convergent]] LIBC_INLINE cpp::optional Server::try_open(uint32_t lane_size, uint32_t start) { // Perform a naive linear scan for a port that has a pending request. 
 for (uint32_t index = start; index < process.port_count; ++index) { - uint64_t lane_mask = gpu::get_lane_mask(); + uint64_t lane_mask = rpc::get_lane_mask(); uint32_t in = process.load_inbox(lane_mask, index); uint32_t out = process.load_outbox(lane_mask, index); @@ -595,6 +600,9 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) { #undef __scoped_atomic_fetch_or #undef __scoped_atomic_fetch_and #endif +#if !__has_builtin(__scoped_atomic_thread_fence) +#undef __scoped_atomic_thread_fence +#endif } // namespace rpc } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h index 93b8289617484..39e5f30b84ac4 100644 --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -10,22 +10,87 @@ #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" #include "src/__support/threads/sleep.h" +#if defined(__NVPTX__) || defined(__AMDGPU__) +#include +#define RPC_TARGET_IS_GPU +#endif + namespace LIBC_NAMESPACE_DECL { namespace rpc { /// Conditional to indicate if this process is running on the GPU. LIBC_INLINE constexpr bool is_process_gpu() { -#if defined(__NVPTX__) || defined(__AMDGPU__) +#ifdef RPC_TARGET_IS_GPU return true; #else return false; #endif } +/// Wait for all lanes in the group to complete. +LIBC_INLINE void sync_lane(uint64_t lane_mask) { +#ifdef RPC_TARGET_IS_GPU + return __gpu_sync_lane(lane_mask); +#endif +} + +/// Copies the value from the first active thread to the rest. +LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) { +#ifdef RPC_TARGET_IS_GPU + return __gpu_read_first_lane_u32(lane_mask, x); +#else + return x; +#endif +} + +/// Returns the number of lanes that participate in the RPC interface. 
+LIBC_INLINE uint32_t get_num_lanes() { +#ifdef RPC_TARGET_IS_GPU + return __gpu_num_lanes(); +#else + return 1; +#endif +} + +/// Returns the bitmask of the threads currently executing in the warp. +LIBC_INLINE uint64_t get_lane_mask() { +#ifdef RPC_TARGET_IS_GPU + return __gpu_lane_mask(); +#else + return 1; +#endif +} + +/// Returns the id of the calling thread within its warp or wavefront. +LIBC_INLINE uint32_t get_lane_id() { +#ifdef RPC_TARGET_IS_GPU + return __gpu_lane_id(); +#else + return 0; +#endif +} + +/// Conditional that is only true for a single thread in \p lane_mask. +LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { +#ifdef RPC_TARGET_IS_GPU + return __gpu_is_first_in_lane(lane_mask); +#else + return true; +#endif +} + +/// Returns a bitmask of threads in \p lane_mask for which \p x is true. +LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { +#ifdef RPC_TARGET_IS_GPU + return __gpu_ballot(lane_mask, x); +#else + return x; +#endif +} + /// Return \p val aligned "upwards" according to \p align. 
template LIBC_INLINE constexpr V align_up(V val, A align) { diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h index 5010ee16d9607..409775f3f33cc 100644 --- a/libc/src/stdio/gpu/vfprintf_utils.h +++ b/libc/src/stdio/gpu/vfprintf_utils.h @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/types/FILE.h" +#include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" diff --git a/libc/src/stdlib/gpu/abort.cpp b/libc/src/stdlib/gpu/abort.cpp index cfc7e9b8e228b..3a06fb38c3f64 100644 --- a/libc/src/stdlib/gpu/abort.cpp +++ b/libc/src/stdlib/gpu/abort.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/common.h" #include "src/__support/macros/config.h"