@@ -266,38 +266,31 @@ struct Slab {

  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
  // to put lanes right next to each other for better caching and convergence.
-  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+  void *allocate(uint64_t uniform, uint32_t reserved) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t state = impl::entropy();

-    // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to each other.
-    void *result = nullptr;
-    uint32_t after = ~0u;
-    uint32_t old_index = 0;
-    for (uint64_t mask = lane_mask; mask;
-         mask = gpu::ballot(lane_mask, !result)) {
-      if (result)
-        continue;
-
-      // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(
-          mask, cpp::countr_zero(uniform & mask),
-          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
-                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+    // Try to find the empty bit in the bitfield to finish the allocation. We
+    // start at the number of allocations as this is guaranteed to be available
+    // until the user starts freeing memory.
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t start = gpu::shuffle(
+        lane_mask, cpp::countr_zero(uniform & lane_mask), reserved);
+    for (;;) {
+      uint64_t lane_mask = gpu::get_lane_mask();

      // Each lane tries to claim one bit in a single contiguous mask.
-      uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
+      uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
      uint32_t index = (start + id) % usable_bits(chunk_size);
      uint32_t slot = index / BITS_IN_WORD;
      uint32_t bit = index % BITS_IN_WORD;

      // Get the mask of bits destined for the same slot and coalesce it.
      uint32_t leader = impl::get_leader_id(
-          uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+          uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
          gpu::get_lane_id());
-      uint32_t length = cpp::popcount(uniform & mask) -
-                        impl::lane_count(uniform & mask, leader);
+      uint32_t length = cpp::popcount(uniform & lane_mask) -
+                        impl::lane_count(uniform & lane_mask, leader);
      uint32_t bitmask =
          static_cast<uint32_t>(
              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
@@ -307,18 +300,23 @@ struct Slab {
      if (gpu::get_lane_id() == leader)
        before = cpp::AtomicRef(get_bitfield()[slot])
                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
-      before = gpu::shuffle(mask, leader, before);
-      if (~before & (1 << bit))
-        result = ptr_from_index(index, chunk_size);
-      else
-        sleep_briefly();
+      before = gpu::shuffle(lane_mask, leader, before);
+      if (~before & (1 << bit)) {
+        cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+        return ptr_from_index(index, chunk_size);
+      }

-      after = before | bitmask;
-      old_index = index;
+      // If the previous operation found an empty bit we move there, otherwise
+      // we generate a new random index to start at.
+      uint32_t after = before | bitmask;
+      start = gpu::shuffle(
+          gpu::get_lane_mask(),
+          cpp::countr_zero(uniform & gpu::get_lane_mask()),
+          ~after ? __builtin_align_down(index, BITS_IN_WORD) +
+                       cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+      sleep_briefly();
    }
-
-    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-    return result;
  }

  // Deallocates memory by resetting its corresponding bit in the bitfield.
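The claiming step above is the heart of the rewrite: lanes whose target bits land in the same bitfield word elect a leader, the leader performs one fetch_or with a contiguous mask covering the whole group, and every lane then checks whether its own bit was previously clear. Below is a minimal CPU-side sketch of that idea, assuming a hypothetical 32-lane wavefront simulated with a plain loop; the bitfield array, leader election, and run-length computation are stand-ins for the gpu:: and impl:: helpers, not the library's API.

#include <atomic>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr uint32_t LANES = 32;
  constexpr uint32_t BITS_IN_WORD = 32;

  std::atomic<uint32_t> bitfield[4] = {}; // Stand-in for the slab's bitfield.
  uint32_t start = 5;                     // Shared starting index for the group.

  // Each active lane is assigned one bit in a contiguous run starting at
  // `start`; with every lane active, the lane id is also its offset.
  uint32_t index[LANES], slot[LANES], bit[LANES];
  for (uint32_t lane = 0; lane < LANES; ++lane) {
    index[lane] = start + lane;
    slot[lane] = index[lane] / BITS_IN_WORD;
    bit[lane] = index[lane] % BITS_IN_WORD;
  }

  // Lanes whose bits fall in the same word elect the first of them as leader;
  // the leader issues one fetch_or covering the whole contiguous run, and each
  // lane would then test `~before & (1 << bit)` to see if it won its bit.
  for (uint32_t lane = 0; lane < LANES; ++lane) {
    bool is_leader = lane == 0 || bit[lane] == 0;
    if (!is_leader)
      continue;
    uint32_t run = 0;
    for (uint32_t follower = lane;
         follower < LANES && slot[follower] == slot[lane]; ++follower)
      ++run;
    uint32_t mask =
        run == BITS_IN_WORD ? ~0u : ((1u << run) - 1) << bit[lane];
    uint32_t before = bitfield[slot[lane]].fetch_or(mask);
    std::printf("leader %u claimed %u bits in word %u (before=0x%x)\n",
                (unsigned)lane, (unsigned)run, (unsigned)slot[lane],
                (unsigned)before);
  }
  return 0;
}

Coalescing each run into a single atomic per word keeps contention low: a fully converged group touches a word once instead of once per lane, which is what the "better caching and convergence" comment is after.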
@@ -507,7 +505,8 @@ static cpp::Atomic<uint32_t> indices[] = {
#undef S

// Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
+                       uint32_t &reserved) {
  // We start at the index of the last successful allocation for this kind.
  uint32_t chunk_id = impl::get_chunk_id(chunk_size);
  uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
@@ -520,7 +519,6 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
    if (!offset ||
        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
      uint64_t lane_mask = gpu::get_lane_mask();
-      uint32_t reserved = 0;

      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                         reserved, chunk_size, index);
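find_slab now hands the reservation count back to the caller through the new reserved out-parameter instead of discarding it, and Slab::allocate uses it as the starting bit index, which the comment in allocate notes is guaranteed to be free until the user starts freeing memory. A small sketch of that reservation idea, assuming the reservation is an atomic fetch_add whose previous value doubles as the group's starting offset; SlabCounter and reserve_chunks are illustrative names, not the library's API.

#include <atomic>
#include <cstdint>
#include <cstdio>

struct SlabCounter {
  std::atomic<uint32_t> use_count{0};

  // Reserve `n` chunks for a group of lanes; the returned value is the index
  // of the first chunk in the reservation, unused until frees begin.
  uint32_t reserve_chunks(uint32_t n) {
    return use_count.fetch_add(n, std::memory_order_relaxed);
  }
};

int main() {
  SlabCounter slab;
  uint32_t reserved_a = slab.reserve_chunks(32); // first group starts at 0
  uint32_t reserved_b = slab.reserve_chunks(32); // second group starts at 32
  std::printf("group A starts scanning at %u, group B at %u\n",
              (unsigned)reserved_a, (unsigned)reserved_b);
  return 0;
}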
@@ -580,12 +578,12 @@ void *allocate(uint64_t size) {
  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
-  Slab *slab = find_slab(chunk_size, uniform);
-  if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
+  uint32_t reserved = 0;
+  Slab *slab = find_slab(chunk_size, uniform, reserved);
+  if (!slab)
    return nullptr;

-  uint64_t lane_mask = gpu::get_lane_mask();
-  void *ptr = slab->allocate(lane_mask, uniform);
+  void *ptr = slab->allocate(uniform, reserved);
  return ptr;
}
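The top-level allocate still partitions the wavefront by chunk size with gpu::match_any before touching any slab, so only lanes that will share a slab get coalesced downstream. A CPU-side sketch of what that uniform mask looks like, using a made-up 8-lane request pattern; the nested loop approximates match_any's "mask of lanes holding the same value" semantics.

#include <cstdint>
#include <cstdio>

int main() {
  constexpr uint32_t LANES = 8;
  // Hypothetical per-lane chunk sizes after rounding up.
  uint32_t chunk_size[LANES] = {16, 16, 32, 16, 32, 64, 16, 64};

  // Every lane gets a bitmask of the lanes that requested the same size.
  uint64_t uniform[LANES] = {};
  for (uint32_t lane = 0; lane < LANES; ++lane)
    for (uint32_t other = 0; other < LANES; ++other)
      if (chunk_size[other] == chunk_size[lane])
        uniform[lane] |= uint64_t(1) << other;

  for (uint32_t lane = 0; lane < LANES; ++lane)
    std::printf("lane %u (size %u) shares a slab with mask 0x%llx\n",
                (unsigned)lane, (unsigned)chunk_size[lane],
                (unsigned long long)uniform[lane]);
  return 0;
}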