1919#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
2020
2121#include " rpc_util.h"
22- #include " src/__support/CPP/algorithm.h" // max
23- #include " src/__support/CPP/atomic.h"
2422#include " src/__support/CPP/optional.h"
2523#include " src/__support/GPU/utils.h"
2624#include " src/__support/macros/config.h"
3028namespace LIBC_NAMESPACE_DECL {
3129namespace rpc {
3230
31+ /// Use scoped atomic variants if they are available for the target.
// When the compiler reports no __scoped_atomic_load_n builtin, the four
// __scoped_atomic_* names used in this header are mapped onto the unscoped
// __atomic_* builtins and the memory-scope argument (scp) is discarded.
// NOTE(review): as rendered here there is whitespace between each macro name
// and its parameter list. In a compiled header the '(' has to follow the name
// directly, otherwise the preprocessor defines an object-like macro; confirm
// against the upstream file.
32+ #if !__has_builtin(__scoped_atomic_load_n)
33+ #define __scoped_atomic_load_n (src, ord, scp ) __atomic_load_n(src, ord)
34+ #define __scoped_atomic_store_n (dst, src, ord, scp ) \
35+ __atomic_store_n (dst, src, ord)
36+ #define __scoped_atomic_fetch_or (src, val, ord, scp ) \
37+ __atomic_fetch_or (src, val, ord)
38+ #define __scoped_atomic_fetch_and (src, val, ord, scp ) \
39+ __atomic_fetch_and (src, val, ord)
40+ #endif
41+
3342// / A fixed size channel used to communicate between the RPC client and server.
3443struct Buffer {
3544 uint64_t data[8 ];
@@ -67,18 +76,18 @@ template <bool Invert> struct Process {
6776 LIBC_INLINE ~Process () = default ;
6877
// Number of ports; defaults to zero and is set by the constructor.
6978 uint32_t port_count = 0 ;
// Removed (-) lines: the mailboxes were typed as cpp::Atomic<uint32_t> pointers.
70- cpp::Atomic< uint32_t > *inbox = nullptr ;
71- cpp::Atomic< uint32_t > *outbox = nullptr ;
// Added (+) lines: the mailboxes become plain uint32_t pointers; the accessors
// in this struct read and write them through atomic builtins instead.
79+ uint32_t *inbox = nullptr ;
80+ uint32_t *outbox = nullptr ;
// Header and packet storage; like the mailboxes, these default to null.
7281 Header *header = nullptr ;
7382 Buffer *packet = nullptr ;
7483
// Width in bits of one lock word (sizeof(uint32_t) * 8).
7584 static constexpr uint64_t NUM_BITS_IN_WORD = sizeof (uint32_t ) * 8 ;
// Lock bitfield: MAX_PORT_COUNT bits packed into uint32_t words, all
// initially zero. The element type changes from cpp::Atomic<uint32_t> (-)
// to uint32_t (+).
76- cpp::Atomic< uint32_t > lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0 };
85+ uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0 };
7786
7887 LIBC_INLINE Process (uint32_t port_count, void *buffer)
79- : port_count(port_count), inbox(reinterpret_cast <cpp::Atomic< uint32_t > *>(
88+ : port_count(port_count), inbox(reinterpret_cast <uint32_t *>(
8089 advance (buffer, inbox_offset(port_count)))),
81- outbox(reinterpret_cast <cpp::Atomic< uint32_t > *>(
90+ outbox(reinterpret_cast <uint32_t *>(
8291 advance (buffer, outbox_offset(port_count)))),
8392 header(reinterpret_cast <Header *>(
8493 advance (buffer, header_offset(port_count)))),
@@ -102,15 +111,15 @@ template <bool Invert> struct Process {
102111 /// Retrieve the inbox state from memory shared between processes.
// Reads inbox[index] with relaxed ordering at system scope and passes the
// result through gpu::broadcast_value together with lane_mask.
// NOTE(review): presumably the broadcast makes every lane in lane_mask observe
// the same value; confirm against gpu::broadcast_value.
103112 LIBC_INLINE uint32_t load_inbox (uint64_t lane_mask, uint32_t index) const {
104113 return gpu::broadcast_value (
// Removed (-) lines: the load went through the cpp::Atomic member function.
105- lane_mask,
106- inbox[index]. load (cpp::MemoryOrder::RELAXED, cpp::MemoryScope::SYSTEM ));
// Added (+) lines: the same relaxed, system-scope load via the builtin.
114+ lane_mask, __scoped_atomic_load_n (&inbox[index], __ATOMIC_RELAXED,
115+ __MEMORY_SCOPE_SYSTEM ));
107116 }
108117
109118 /// Retrieve the outbox state from memory shared between processes.
// Reads outbox[index] with relaxed ordering at system scope and passes the
// result through gpu::broadcast_value together with lane_mask.
// NOTE(review): presumably the broadcast makes every lane in lane_mask observe
// the same value; confirm against gpu::broadcast_value.
110119 LIBC_INLINE uint32_t load_outbox (uint64_t lane_mask, uint32_t index) const {
// Removed (-) lines: the load went through the cpp::Atomic member function.
111- return gpu::broadcast_value (lane_mask,
112- outbox[index]. load (cpp::MemoryOrder::RELAXED ,
113- cpp::MemoryScope::SYSTEM ));
// Added (+) lines: the same relaxed, system-scope load via the builtin.
120+ return gpu::broadcast_value (
121+ lane_mask, __scoped_atomic_load_n (& outbox[index], __ATOMIC_RELAXED ,
122+ __MEMORY_SCOPE_SYSTEM ));
114123 }
115124
116125 // / Signal to the other process that this one is finished with the buffer.
@@ -119,9 +128,9 @@ template <bool Invert> struct Process {
119128 // / cheaper than calling load_outbox to get the value to store.
120129 LIBC_INLINE uint32_t invert_outbox (uint32_t index, uint32_t current_outbox) {
// Logical negation: yields 1 when current_outbox is 0 and 0 otherwise.
121130 uint32_t inverted_outbox = !current_outbox;
// A release fence is issued before the relaxed, system-scope store to
// outbox[index].
// NOTE(review): presumably this publishes earlier writes to the shared buffer
// before the flipped outbox value becomes visible; confirm with the protocol.
// Removed (-) lines: fence and store through the cpp::Atomic wrappers.
122- atomic_thread_fence (cpp::MemoryOrder::RELEASE );
123- outbox[index]. store ( inverted_outbox, cpp::MemoryOrder::RELAXED ,
124- cpp::MemoryScope::SYSTEM );
// Added (+) lines: the same fence and store expressed with compiler builtins.
131+ __atomic_thread_fence (__ATOMIC_RELEASE );
132+ __scoped_atomic_store_n (& outbox[index], inverted_outbox, __ATOMIC_RELAXED ,
133+ __MEMORY_SCOPE_SYSTEM );
// Hand the stored value back to the caller.
125134 return inverted_outbox;
126135 }
127136
@@ -133,7 +142,7 @@ template <bool Invert> struct Process {
133142 sleep_briefly ();
134143 in = load_inbox (lane_mask, index);
135144 }
136- atomic_thread_fence (cpp::MemoryOrder::ACQUIRE );
145+ __atomic_thread_fence (__ATOMIC_ACQUIRE );
137146 }
138147
139148 // / The packet is a linearly allocated array of buffers used to communicate
@@ -155,8 +164,7 @@ template <bool Invert> struct Process {
155164 // / lane_mask is a bitmap of the threads in the warp that would hold the
156165 // / single lock on success, e.g. the result of gpu::get_lane_mask()
157166 // / The lock is held when the n-th bit of the lock bitfield is set.
158- [[clang::convergent]] LIBC_INLINE bool try_lock (uint64_t lane_mask,
159- uint32_t index) {
167+ LIBC_INLINE bool try_lock (uint64_t lane_mask, uint32_t index) {
160168 // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
161169 // On volta, need to handle differences between the threads running and
162170 // the threads that were detected in the previous call to get_lane_mask()
@@ -190,16 +198,15 @@ template <bool Invert> struct Process {
190198 // inlining the current function.
191199 bool holding_lock = lane_mask != packed;
192200 if (holding_lock)
193- atomic_thread_fence (cpp::MemoryOrder::ACQUIRE );
201+ __atomic_thread_fence (__ATOMIC_ACQUIRE );
194202 return holding_lock;
195203 }
196204
197205 // / Unlock the lock at index. We need a lane sync to keep this function
198206 // / convergent, otherwise the compiler will sink the store and deadlock.
199- [[clang::convergent]] LIBC_INLINE void unlock (uint64_t lane_mask,
200- uint32_t index) {
207+ LIBC_INLINE void unlock (uint64_t lane_mask, uint32_t index) {
201208 // Do not move any writes past the unlock.
202- atomic_thread_fence (cpp::MemoryOrder::RELEASE );
209+ __atomic_thread_fence (__ATOMIC_RELEASE );
203210
204211 // Use exactly one thread to clear the nth bit in the lock array. Must
205212 // restrict to a single thread to avoid one thread dropping the lock, then
@@ -211,7 +218,7 @@ template <bool Invert> struct Process {
211218
212219 /// Number of bytes to allocate for an inbox or outbox.
// Computed as one mailbox element per port: port_count * sizeof(element).
213220 LIBC_INLINE static constexpr uint64_t mailbox_bytes (uint32_t port_count) {
// Removed (-) line sizes the element as cpp::Atomic<uint32_t>; the added (+)
// line sizes it as a plain uint32_t to match the new mailbox pointer type.
214- return port_count * sizeof (cpp::Atomic< uint32_t > );
221+ return port_count * sizeof (uint32_t );
215222 }
216223
217224 // / Number of bytes to allocate for the buffer containing the packets.
@@ -242,24 +249,24 @@ template <bool Invert> struct Process {
242249 }
243250
244251 /// Conditionally set the n-th bit in the atomic bitfield.
// Performs a relaxed, device-scope atomic OR of (cond << bit) into the word
// holding bit `index`. With cond == false the OR operand is zero, so the word
// is left unchanged. The result is the previous word value masked with that
// bit: non-zero if the bit was already set before the call, zero otherwise.
// Removed (-) lines: signature taking a cpp::Atomic<uint32_t> pointer.
245- LIBC_INLINE static constexpr uint32_t set_nth (cpp::Atomic< uint32_t > *bits,
246- uint32_t index, bool cond) {
// Added (+) lines: signature taking a plain uint32_t pointer.
252+ LIBC_INLINE static constexpr uint32_t set_nth (uint32_t *bits, uint32_t index ,
253+ bool cond) {
// slot selects the word, bit selects the position inside that word.
247254 uint32_t slot = index / NUM_BITS_IN_WORD;
248255 uint32_t bit = index % NUM_BITS_IN_WORD;
// Removed (-) lines: fetch_or through the cpp::Atomic member function.
249- return bits[slot]. fetch_or ( static_cast < uint32_t >(cond) << bit ,
250- cpp::MemoryOrder::RELAXED ,
251- cpp::MemoryScope::DEVICE ) &
// Added (+) lines: the same fetch-or via the builtin.
256+ return __scoped_atomic_fetch_or (& bits[slot],
257+ static_cast < uint32_t >(cond) << bit ,
258+ __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE ) &
// Keep only the bit of interest from the previous word value.
252259 (1u << bit);
253260 }
254261
255262 /// Conditionally clear the n-th bit in the atomic bitfield.
// Performs a relaxed, device-scope atomic AND of the word holding bit `index`
// with ~0u ^ (cond << bit): all ones, except that the target bit is zero when
// cond is true. With cond == false the mask is all ones, so the word is left
// unchanged. The result is the previous word value masked with that bit:
// non-zero if the bit was set before the call, zero otherwise.
// Removed (-) line: signature taking a cpp::Atomic<uint32_t> pointer.
256- LIBC_INLINE static constexpr uint32_t clear_nth (cpp::Atomic< uint32_t > *bits,
// Added (+) line: signature taking a plain uint32_t pointer.
263+ LIBC_INLINE static constexpr uint32_t clear_nth (uint32_t *bits,
257264 uint32_t index, bool cond) {
// slot selects the word, bit selects the position inside that word.
258265 uint32_t slot = index / NUM_BITS_IN_WORD;
259266 uint32_t bit = index % NUM_BITS_IN_WORD;
// Removed (-) lines: fetch_and through the cpp::Atomic member function.
260- return bits[slot]. fetch_and (~ 0u ^ ( static_cast < uint32_t >(cond) << bit) ,
261- cpp::MemoryOrder::RELAXED ,
262- cpp::MemoryScope::DEVICE ) &
// Added (+) lines: the same fetch-and via the builtin.
267+ return __scoped_atomic_fetch_and (& bits[slot],
268+ ~ 0u ^ ( static_cast < uint32_t >(cond) << bit) ,
269+ __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE ) &
// Keep only the bit of interest from the previous word value.
263270 (1u << bit);
264271 }
265272};
@@ -450,7 +457,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
450457 send ([&](Buffer *buffer, uint32_t id) {
451458 reinterpret_cast <uint64_t *>(buffer->data )[0 ] = lane_value (size, id);
452459 num_sends = is_process_gpu () ? lane_value (size, id)
453- : cpp ::max (lane_value (size, id), num_sends);
460+ : rpc ::max (lane_value (size, id), num_sends);
454461 uint64_t len =
455462 lane_value (size, id) > sizeof (Buffer::data) - sizeof (uint64_t )
456463 ? sizeof (Buffer::data) - sizeof (uint64_t )
@@ -483,7 +490,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
483490 lane_value (dst, id) =
484491 reinterpret_cast <uint8_t *>(alloc (lane_value (size, id)));
485492 num_recvs = is_process_gpu () ? lane_value (size, id)
486- : cpp ::max (lane_value (size, id), num_recvs);
493+ : rpc ::max (lane_value (size, id), num_recvs);
487494 uint64_t len =
488495 lane_value (size, id) > sizeof (Buffer::data) - sizeof (uint64_t )
489496 ? sizeof (Buffer::data) - sizeof (uint64_t )
@@ -510,8 +517,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
510517// / port. Each port instance uses an associated \p opcode to tell the server
511518// / what to do. The Client interface provides the appropriate lane size to the
512519// / port using the platform's returned value.
513- template <uint16_t opcode>
514- [[clang::convergent]] LIBC_INLINE Client::Port Client::open () {
520+ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open () {
515521 // Repeatedly perform a naive linear scan for a port that can be opened to
516522 // send data.
517523 for (uint32_t index = gpu::get_cluster_id ();; ++index) {
@@ -545,7 +551,7 @@ template <uint16_t opcode>
545551
546552// / Attempts to open a port to use as the server. The server can only open a
547553// / port if it has a pending receive operation
548- [[clang::convergent]] LIBC_INLINE cpp::optional<typename Server::Port>
554+ LIBC_INLINE cpp::optional<typename Server::Port>
549555Server::try_open (uint32_t lane_size, uint32_t start) {
550556 // Perform a naive linear scan for a port that has a pending request.
551557 for (uint32_t index = start; index < process.port_count ; ++index) {
@@ -583,6 +589,13 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
583589 }
584590}
585591
// Drop the four fallback __scoped_atomic_* macros again, under the same
// condition that guards their definition, so they are not left defined for
// code that follows this header.
592+ #if !__has_builtin(__scoped_atomic_load_n)
593+ #undef __scoped_atomic_load_n
594+ #undef __scoped_atomic_store_n
595+ #undef __scoped_atomic_fetch_or
596+ #undef __scoped_atomic_fetch_and
597+ #endif
598+
586599} // namespace rpc
587600} // namespace LIBC_NAMESPACE_DECL
588601
0 commit comments