@@ -266,38 +266,31 @@ struct Slab {
 
   // Randomly walks the bitfield until it finds a free bit. Allocations attempt
   // to put lanes right next to each other for better caching and convergence.
-  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+  void *allocate(uint64_t uniform, uint32_t reserved) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
 
-    // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to each other.
-    void *result = nullptr;
-    uint32_t after = ~0u;
-    uint32_t old_index = 0;
-    for (uint64_t mask = lane_mask; mask;
-         mask = gpu::ballot(lane_mask, !result)) {
-      if (result)
-        continue;
-
-      // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(
-          mask, cpp::countr_zero(uniform & mask),
-          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
-                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+    // Try to find an empty bit in the bitfield to finish the allocation. We
+    // start at the number of allocations, as this is guaranteed to be available
+    // until the user starts freeing memory.
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t start = gpu::shuffle(
+        lane_mask, cpp::countr_zero(uniform & lane_mask), reserved);
+    for (;;) {
+      uint64_t lane_mask = gpu::get_lane_mask();
 
       // Each lane tries to claim one bit in a single contiguous mask.
-      uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
+      uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
       uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
 
       // Get the mask of bits destined for the same slot and coalesce it.
       uint32_t leader = impl::get_leader_id(
-          uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+          uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
           gpu::get_lane_id());
-      uint32_t length = cpp::popcount(uniform & mask) -
-                        impl::lane_count(uniform & mask, leader);
+      uint32_t length = cpp::popcount(uniform & lane_mask) -
+                        impl::lane_count(uniform & lane_mask, leader);
       uint32_t bitmask =
           static_cast<uint32_t>(
              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
@@ -307,18 +300,23 @@ struct Slab {
       if (gpu::get_lane_id() == leader)
         before = cpp::AtomicRef(get_bitfield()[slot])
                      .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
-      before = gpu::shuffle(mask, leader, before);
-      if (~before & (1 << bit))
-        result = ptr_from_index(index, chunk_size);
-      else
-        sleep_briefly();
+      before = gpu::shuffle(lane_mask, leader, before);
+      if (~before & (1 << bit)) {
+        cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+        return ptr_from_index(index, chunk_size);
+      }
 
-      after = before | bitmask;
-      old_index = index;
+      // If the previous operation found an empty bit we move there; otherwise
+      // we generate a new random index to start at.
+      uint32_t after = before | bitmask;
+      start = gpu::shuffle(
+          gpu::get_lane_mask(),
+          cpp::countr_zero(uniform & gpu::get_lane_mask()),
+          ~after ? __builtin_align_down(index, BITS_IN_WORD) +
+                       cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+      sleep_briefly();
     }
-
-    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-    return result;
   }
 
   // Deallocates memory by resetting its corresponding bit in the bitfield.
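
The loop above is easier to follow without the wavefront machinery. Below is a minimal single-threaded sketch of the same search strategy, assuming a plain `std::atomic<uint32_t>` array in place of the slab's bitfield; `claim_bit` and its parameters are hypothetical stand-ins, and the real code additionally coalesces adjacent lanes into a single `fetch_or` per word:

```cpp
#include <atomic>
#include <bit>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = 32;

// xorshift32 PRNG, mirroring the impl::xorshift32 used by the allocator.
static uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state;
}

// Claims one free bit and returns its index. 'reserved' is the count of prior
// allocations, so that index is known to be free until frees begin.
uint32_t claim_bit(std::atomic<uint32_t> *words, uint32_t usable_bits,
                   uint32_t reserved, uint32_t &state) {
  uint32_t start = reserved;
  for (;;) {
    uint32_t index = start % usable_bits;
    uint32_t slot = index / BITS_IN_WORD;
    uint32_t bit = index % BITS_IN_WORD;
    uint32_t before =
        words[slot].fetch_or(1u << bit, std::memory_order_relaxed);
    if (~before & (1u << bit))
      return index; // The bit was clear before our fetch_or: we own it.

    // Otherwise jump to a known-free bit in the same word, or, if the word
    // is now full, to the start of a random word.
    uint32_t after = before | (1u << bit);
    start = ~after ? slot * BITS_IN_WORD + std::countr_zero(~after)
                   : xorshift32(state) & ~(BITS_IN_WORD - 1);
  }
}
```

Starting at `reserved` makes the first probe collision-free as long as nothing has been freed, which is why the patch can drop the retry bookkeeping that the old loop carried across iterations.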
@@ -507,7 +505,8 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S
 
 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
+                       uint32_t &reserved) {
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
@@ -520,7 +519,6 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
     if (!offset ||
         slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
-      uint32_t reserved = 0;
 
       Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
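
The `reserved` count that `try_lock` produces is no longer discarded here; it is threaded out through the new reference parameter so `Slab::allocate` can use it as its start index. A minimal sketch of why an old use count works as a start hint, assuming a hypothetical `Slot` that holds only an atomic counter (the real `try_lock` also handles slab construction and locking):

```cpp
#include <atomic>
#include <cstdint>

struct Slot {
  std::atomic<uint32_t> use_count{0};

  // Returns the number of chunks reserved before this caller arrived; the
  // caller can start probing the bitfield at exactly that index, since every
  // earlier client was handed a strictly smaller count.
  bool try_reserve(uint32_t available, uint32_t &reserved) {
    uint32_t old = use_count.fetch_add(1, std::memory_order_relaxed);
    if (old >= available) { // Slab is full; undo and report failure.
      use_count.fetch_sub(1, std::memory_order_relaxed);
      return false;
    }
    reserved = old; // Guaranteed-free index until frees begin.
    return true;
  }
};
```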
@@ -580,12 +578,12 @@ void *allocate(uint64_t size) {
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
-  Slab *slab = find_slab(chunk_size, uniform);
-  if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
+  uint32_t reserved = 0;
+  Slab *slab = find_slab(chunk_size, uniform, reserved);
+  if (!slab)
     return nullptr;
 
-  uint64_t lane_mask = gpu::get_lane_mask();
-  void *ptr = slab->allocate(lane_mask, uniform);
+  void *ptr = slab->allocate(uniform, reserved);
   return ptr;
 }
 
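
For reference, the coalesced bitmask built in the first hunk (its substance unchanged by this patch) can be modeled in isolation. `coalesced_mask` is a hypothetical standalone name, and the shift by `bit` is presumably applied in the lines elided between the first two hunks:

```cpp
#include <algorithm>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = 32;

// Builds the mask that 'length' cooperating lanes, whose leader owns bit
// position 'bit' inside a 32-bit word, submit as a single fetch_or.
uint32_t coalesced_mask(uint32_t bit, uint32_t length) {
  // Compute (1 << min(length, 32)) - 1 in 64 bits so that length == 32 does
  // not overflow the shift, then move the run of ones to the leader's bit.
  uint64_t ones = (uint64_t(1) << std::min(length, BITS_IN_WORD)) - 1;
  return static_cast<uint32_t>(ones) << bit;
}

// Example: three lanes led by bit 4 claim bits 4, 5, and 6 together:
// coalesced_mask(4, 3) == 0b1110000.
```

Bits that would spill past the top of the word simply shift out; the lanes whose indices land in the next word elect their own leader via the `index % BITS_IN_WORD == 0` ballot and issue a separate `fetch_or`.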