1616
1717#include " allocator.h"
1818
19+ #include " src/__support/CPP/algorithm.h"
1920#include " src/__support/CPP/atomic.h"
2021#include " src/__support/CPP/bit.h"
2122#include " src/__support/CPP/new.h"
@@ -31,6 +32,7 @@ constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
3132constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
3233constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1 ;
3334constexpr static uint32_t BITS_IN_WORD = sizeof (uint32_t ) * 8 ;
35+ constexpr static uint32_t BITS_IN_DWORD = sizeof (uint64_t ) * 8 ;
3436constexpr static uint32_t MIN_SIZE = 16 ;
3537constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1 ;
3638
@@ -70,8 +72,8 @@ static void rpc_free(void *ptr) {
7072
7173// Convert a potentially disjoint bitmask into an increasing integer per-lane
7274// for use with indexing between gpu lanes.
73- static inline uint32_t lane_count (uint64_t lane_mask) {
74- return cpp::popcount (lane_mask & ((uint64_t (1 ) << gpu::get_lane_id () ) - 1 ));
75+ static inline uint32_t lane_count (uint64_t lane_mask, uint32_t id ) {
76+ return cpp::popcount (lane_mask & ((uint64_t (1 ) << id ) - 1 ));
7577}
7678
7779// Obtain an initial value to seed a random number generator. We use the rounded
@@ -133,7 +135,8 @@ static inline constexpr T round_up(const T x) {
133135void uniform_memset (uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
134136 uint64_t mask = gpu::get_lane_mask ();
135137 uint32_t workers = cpp::popcount (uniform);
136- for (uint32_t i = impl::lane_count (mask & uniform); i < n; i += workers)
138+ for (uint32_t i = impl::lane_count (mask & uniform, gpu::get_lane_id ()); i < n;
139+ i += workers)
137140 s[i] = c;
138141}
139142
@@ -154,6 +157,12 @@ static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
154157 return static_cast <uint32_t >(((ARRAY_SIZE - 1 ) * inv) >> 16 );
155158}
156159
160+ // Returns the id of the lane below this one that acts as its leader.
161+ static inline uint32_t get_leader_id (uint64_t ballot, uint32_t id) {
162+ uint64_t mask = id < BITS_IN_DWORD ? ~0ull << (id + 1 ) : 0 ;
163+ return BITS_IN_DWORD - cpp::countl_zero (ballot & ~mask) - 1 ;
164+ }
165+
157166} // namespace impl
158167
159168// / A slab allocator used to hand out identically sized slabs of memory.
@@ -275,23 +284,28 @@ struct Slab {
275284 ~after ? (old_index & ~(BITS_IN_WORD - 1 )) + cpp::countr_zero (~after)
276285 : __builtin_align_down (impl::xorshift32 (state), BITS_IN_WORD));
277286
278- uint32_t id = impl::lane_count (uniform & mask);
287+ // Each lane tries to claim one bit in a single contiguous mask.
288+ uint32_t id = impl::lane_count (uniform & mask, gpu::get_lane_id ());
279289 uint32_t index = (start + id) % usable_bits (chunk_size);
280290 uint32_t slot = index / BITS_IN_WORD;
281291 uint32_t bit = index % BITS_IN_WORD;
282292
283293 // Get the mask of bits destined for the same slot and coalesce it.
284- uint64_t match = uniform & gpu::match_any (mask, slot);
285- uint32_t length = cpp::popcount (match);
286- uint32_t bitmask = gpu::shuffle (
287- mask, cpp::countr_zero (match),
288- static_cast <uint32_t >((uint64_t (1 ) << length) - 1 ) << bit);
294+ uint32_t leader = impl::get_leader_id (
295+ uniform & gpu::ballot (mask, !id || index % BITS_IN_WORD == 0 ),
296+ gpu::get_lane_id ());
297+ uint32_t length = cpp::popcount (uniform & mask) -
298+ impl::lane_count (uniform & mask, leader);
299+ uint32_t bitmask =
300+ static_cast <uint32_t >(
301+ (uint64_t (1 ) << cpp::min (length, BITS_IN_WORD)) - 1 )
302+ << bit;
289303
290304 uint32_t before = 0 ;
291- if (gpu::get_lane_id () == static_cast < uint32_t >( cpp::countr_zero (match)) )
305+ if (gpu::get_lane_id () == leader )
292306 before = cpp::AtomicRef (get_bitfield ()[slot])
293307 .fetch_or (bitmask, cpp::MemoryOrder::RELAXED);
294- before = gpu::shuffle (mask, cpp::countr_zero (match) , before);
308+ before = gpu::shuffle (mask, leader , before);
295309 if (~before & (1 << bit))
296310 result = ptr_from_index (index, chunk_size);
297311 else
@@ -446,7 +460,8 @@ struct GuardPtr {
446460 }
447461
448462 if (count != cpp::numeric_limits<uint64_t >::max ())
449- count = count - cpp::popcount (uniform) + impl::lane_count (uniform) + 1 ;
463+ count = count - cpp::popcount (uniform) +
464+ impl::lane_count (uniform, gpu::get_lane_id ()) + 1 ;
450465
451466 return result;
452467 }
0 commit comments