@@ -16,6 +16,7 @@
 #include "allocator.h"
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
@@ -31,6 +32,7 @@ constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
 constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
 constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+constexpr static uint32_t BITS_IN_DWORD = sizeof(uint64_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
@@ -70,8 +72,8 @@ static void rpc_free(void *ptr) {
 
 // Convert a potentially disjoint bitmask into an increasing integer per-lane
 // for use with indexing between gpu lanes.
-static inline uint32_t lane_count(uint64_t lane_mask) {
-  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
+static inline uint32_t lane_count(uint64_t lane_mask, uint32_t id) {
+  return cpp::popcount(lane_mask & ((uint64_t(1) << id) - 1));
 }
 
 // Obtain an initial value to seed a random number generator. We use the rounded
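
lane_count now takes the lane id as an argument instead of reading gpu::get_lane_id() itself, so a caller can compute the rank of any lane in the mask, not just its own (the new leader logic below relies on this). A rough host-side illustration of the same computation, assuming a 64-lane wave and using std::popcount in place of cpp::popcount:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    // Host-side stand-in for the device helper: the rank of lane `id`
    // among the set bits of `lane_mask`.
    static uint32_t lane_count(uint64_t lane_mask, uint32_t id) {
      // (1 << id) - 1 keeps only the bits strictly below this lane.
      return std::popcount(lane_mask & ((uint64_t(1) << id) - 1));
    }

    int main() {
      // A disjoint mask where only lanes 1, 4, and 9 are active.
      uint64_t mask = (1ull << 1) | (1ull << 4) | (1ull << 9);
      // The active lanes receive the dense indices 0, 1, and 2.
      std::printf("%u %u %u\n", lane_count(mask, 1), lane_count(mask, 4),
                  lane_count(mask, 9));
      return 0;
    }
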
@@ -133,7 +135,8 @@ static inline constexpr T round_up(const T x) {
 void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
   uint64_t mask = gpu::get_lane_mask();
   uint32_t workers = cpp::popcount(uniform);
-  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+  for (uint32_t i = impl::lane_count(mask & uniform, gpu::get_lane_id()); i < n;
+       i += workers)
     s[i] = c;
 }
 
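
With the extra argument threaded through, the strided fill is unchanged in spirit: each active lane starts at its rank within the uniform mask and steps by the number of workers, so the n words are covered exactly once. A host-side simulation of the access pattern (the mask and buffer size here are made up):

    #include <bit>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      // Three workers, lanes 0, 2, and 5, fill an 8-word buffer.
      uint64_t uniform = (1ull << 0) | (1ull << 2) | (1ull << 5);
      uint32_t workers = std::popcount(uniform);
      std::vector<uint32_t> s(8, 0);
      // Simulate each active lane: start at your rank, stride by the
      // worker count, so every index is written by exactly one lane.
      for (uint32_t id = 0; id < 64; ++id) {
        if (!(uniform & (uint64_t(1) << id)))
          continue;
        uint32_t rank = std::popcount(uniform & ((uint64_t(1) << id) - 1));
        for (uint32_t i = rank; i < s.size(); i += workers)
          s[i] = 0xdeadbeef;
      }
      for (uint32_t v : s)
        std::printf("%x\n", v);
      return 0;
    }
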
@@ -154,6 +157,12 @@ static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
   return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
 }
 
+// Returns the id of the lane below this one that acts as its leader.
+static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
+  uint64_t mask = id < BITS_IN_DWORD ? ~0ull << (id + 1) : 0;
+  return BITS_IN_DWORD - cpp::countl_zero(ballot & ~mask) - 1;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
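
get_leader_id is the building block that replaces gpu::match_any below: given a ballot of elected leaders, a lane finds the highest ballot bit at or below its own id. A host-side stand-in with the 64-bit wave width hard-coded and invented ballot values; the shift guard is written to stay within defined behavior:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    // Host-side stand-in: index of the highest ballot bit at or below `id`.
    static uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
      // Clear every ballot bit strictly above `id` (keeping the shift
      // amount below 64), then find the highest bit that remains.
      uint64_t mask = id < 63 ? ~0ull << (id + 1) : 0;
      return 64 - std::countl_zero(ballot & ~mask) - 1;
    }

    int main() {
      // Leaders were elected at lanes 0 and 8.
      uint64_t ballot = (1ull << 0) | (1ull << 8);
      std::printf("%u\n", get_leader_id(ballot, 3));  // lane 3 follows lane 0
      std::printf("%u\n", get_leader_id(ballot, 10)); // lane 10 follows lane 8
      return 0;
    }
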
@@ -275,23 +284,28 @@ struct Slab {
           ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
                  : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
 
-      uint32_t id = impl::lane_count(uniform & mask);
+      // Each lane tries to claim one bit in a single contiguous mask.
+      uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
       uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
 
       // Get the mask of bits destined for the same slot and coalesce it.
-      uint64_t match = uniform & gpu::match_any(mask, slot);
-      uint32_t length = cpp::popcount(match);
-      uint32_t bitmask = gpu::shuffle(
-          mask, cpp::countr_zero(match),
-          static_cast<uint32_t>((uint64_t(1) << length) - 1) << bit);
+      uint32_t leader = impl::get_leader_id(
+          uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+          gpu::get_lane_id());
+      uint32_t length = cpp::popcount(uniform & mask) -
+                        impl::lane_count(uniform & mask, leader);
+      uint32_t bitmask =
+          static_cast<uint32_t>(
+              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
+          << bit;
 
       uint32_t before = 0;
-      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+      if (gpu::get_lane_id() == leader)
         before = cpp::AtomicRef(get_bitfield()[slot])
                      .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
-      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+      before = gpu::shuffle(mask, leader, before);
       if (~before & (1 << bit))
         result = ptr_from_index(index, chunk_size);
       else
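
Taken together, the hunk above elects one leader per 32-bit slot via gpu::ballot (a lane leads if it is first in the group or starts a new word), has that leader issue one coalesced fetch_or for the whole contiguous run of bits, and broadcasts the prior word so each lane can tell whether its own bit was free. A host-side sketch of that claim/retry decision, with std::atomic standing in for cpp::AtomicRef and one invented group:

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Three lanes (ranks 0..2) target the consecutive bits 5..7 of one
      // 32-bit bitfield word in which bit 6 has already been claimed.
      std::atomic<uint32_t> word{1u << 6};
      uint32_t bit = 5, length = 3;
      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
                         << bit;
      // The leader issues a single atomic OR on behalf of the group...
      uint32_t before = word.fetch_or(bitmask, std::memory_order_relaxed);
      // ...and each lane succeeds only if its own bit was previously clear.
      for (uint32_t b = bit; b < bit + length; ++b)
        std::printf("bit %u: %s\n", b,
                    (~before & (1u << b)) ? "claimed" : "taken, retry");
      return 0;
    }
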
@@ -446,7 +460,8 @@ struct GuardPtr {
     }
 
     if (count != cpp::numeric_limits<uint64_t>::max())
-      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+      count = count - cpp::popcount(uniform) +
+              impl::lane_count(uniform, gpu::get_lane_id()) + 1;
 
     return result;
   }
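
The reworked count update gives every lane in the group a distinct reference count, as if the lanes had incremented the counter one at a time: take the broadcast value, subtract the whole group's contribution, then add back this lane's rank plus one. A worked example under the assumption that the shared RMW already added popcount(uniform) references before broadcasting (the counter value and lane ids are invented):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Lanes 2, 7, and 9 shared one RMW; the broadcast count came back 12.
      uint64_t uniform = (1ull << 2) | (1ull << 7) | (1ull << 9);
      uint64_t count = 12;
      const uint32_t ids[] = {2, 7, 9};
      for (uint32_t id : ids) {
        uint32_t rank = std::popcount(uniform & ((uint64_t(1) << id) - 1));
        // Each lane recovers the value the counter held once its own
        // reference was applied: 10, 11, and 12 respectively.
        std::printf("lane %u sees %llu\n", id,
                    static_cast<unsigned long long>(
                        count - std::popcount(uniform) + rank + 1));
      }
      return 0;
    }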