@@ -32,6 +32,9 @@ constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 
+// A sentinel used to indicate an invalid but non-null pointer value.
+constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
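
For context (not part of the patch): hoisting `SENTINEL` to file scope lets the same constant mark an invalid reference count inside `GuardPtr` and, cast to a pointer, an out-of-memory result from `find_slab`. A minimal sketch of that cast-and-compare pattern, with made-up `oom`/`failed` helpers standing in for the real call sites:

```cpp
// Illustrative sketch only: a max-value sentinel doubling as a non-null
// "invalid pointer" marker. The oom()/failed() helpers are hypothetical.
#include <cstdint>
#include <limits>

constexpr uint64_t SENTINEL = std::numeric_limits<uint64_t>::max();

struct Slab {};

// Signal out-of-memory with a non-null but recognizably invalid pointer.
static Slab *oom() { return reinterpret_cast<Slab *>(SENTINEL); }

// Callers treat both null and the sentinel as allocation failure.
static bool failed(Slab *slab) {
  return !slab || slab == reinterpret_cast<Slab *>(SENTINEL);
}

int main() { return failed(oom()) ? 0 : 1; }
```
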
@@ -322,9 +325,6 @@ template <typename T> struct GuardPtr {
   cpp::Atomic<T *> ptr{nullptr};
   RefCounter ref{};
 
-  // A sentinel value used to claim the pointer slot.
-  static constexpr uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
-
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
   T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
@@ -370,14 +370,14 @@ template <typename T> struct GuardPtr {
     result = try_lock_impl(cpp::popcount(uniform), count,
                            cpp::forward<Args>(args)...);
     result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
+    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
 
     if (!result)
       return nullptr;
 
-    // Obtain the value of the reference counter for each lane given the
-    // aggregate value.
-    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count) -
-            cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+    if (count != cpp::numeric_limits<uint64_t>::max())
+      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+
     return result;
   }
 
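
For context on the hunk above (not part of the patch): the aggregate `count` produced by the leader lane is now broadcast with `gpu::shuffle`, and each lane derives its own reference number from it locally; the guard on the maximum value keeps a sentinel count from being perturbed by the per-lane adjustment. A host-side sketch of that arithmetic using standard C++20 in place of the `cpp::`/`gpu::` helpers; the mask values are made up, and `lane_count` is assumed to count the set lanes below the current one:

```cpp
// Illustrative, host-side only: derive each lane's reference count from the
// aggregate value broadcast by the leader lane, as in GuardPtr::try_lock.
#include <bit>
#include <cstdint>
#include <cstdio>

// Assumed behavior of impl::lane_count: number of set lanes below lane 'id'.
static uint32_t lane_count(uint64_t mask, uint32_t id) {
  return std::popcount(mask & ((uint64_t{1} << id) - 1));
}

int main() {
  uint64_t uniform = 0b10110; // Lanes 1, 2, and 4 target the same slot.
  uint64_t aggregate = 7;     // Broadcast count, already bumped by popcount.
  for (uint32_t id = 0; id < 8; ++id) {
    if (!(uniform >> id & 1))
      continue;
    // Same formula as the patch: lanes receive consecutive counts ending at
    // the aggregate, so every participating lane holds a unique reference.
    uint64_t count =
        aggregate - std::popcount(uniform) + lane_count(uniform, id) + 1;
    std::printf("lane %u -> count %llu\n", id, (unsigned long long)count);
  }
  return 0;
}
```
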
@@ -406,48 +406,60 @@ static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
 static Slab *find_slab(uint32_t chunk_size) {
   // We start at a hashed value to spread out different chunk sizes.
   uint32_t start = impl::hash(chunk_size);
-  for (uint32_t offset = 0; offset < ARRAY_SIZE;) {
-    uint32_t index = (offset + start) % ARRAY_SIZE;
-
-    // If this slot is too full we exit early.
-    if (slots[index].use_count() >= Slab::available_chunks(chunk_size)) {
-      offset++;
-      sleep_briefly();
-      continue;
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
+
+  Slab *result = nullptr;
+  for (uint64_t mask = lane_mask; mask;
+       mask = gpu::ballot(lane_mask, !result)) {
+    uint32_t index = cpp::numeric_limits<uint32_t>::max();
+    for (uint32_t offset = 0;
+         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
+         offset += cpp::popcount(uniform & lane_mask)) {
+      uint32_t candidate =
+          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
+      uint64_t available =
+          gpu::ballot(lane_mask, slots[candidate].use_count() <
+                                     Slab::available_chunks(chunk_size));
+      uint32_t new_index = gpu::shuffle(
+          lane_mask, cpp::countr_zero(available & uniform), candidate);
+
+      // Each uniform group will use the first empty slot they find.
+      if (index == cpp::numeric_limits<uint32_t>::max() &&
+          (available & uniform))
+        index = new_index;
+
+      if (offset >= ARRAY_SIZE)
+        result = reinterpret_cast<Slab *>(SENTINEL);
     }
 
-    uint64_t lane_mask = gpu::get_lane_mask();
-    uint64_t uniform = gpu::match_any(lane_mask, index);
-    uint64_t reserved = 0;
-    Slab *slab =
-        slots[index].try_lock(lane_mask, uniform, reserved, chunk_size, index);
-    gpu::sync_lane(lane_mask);
-
-    // We successfully obtained a slab with enough space for our allocation.
-    // This guarantees that a call to Slab::allocate will always succeed.
-    if (slab && reserved <= Slab::available_chunks(chunk_size) &&
-        slab->get_chunk_size() == chunk_size)
-      return slab;
-
-    // We encountered either a full slab or an slab with an incompatible chunk
-    // size. Move to the next slot.
-    if (slab && reserved > Slab::available_chunks(chunk_size) &&
-        slab->get_chunk_size() == chunk_size) {
-      slots[index].unlock(gpu::get_lane_mask(), gpu::get_lane_mask() & uniform);
-      offset++;
+    // Try to claim the lock for the slot we found.
+    if (!result) {
+      uint64_t reserved = 0;
+      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+                                         reserved, chunk_size, index);
+      uint64_t claimed = gpu::ballot(
+          lane_mask & mask, reserved <= Slab::available_chunks(chunk_size));
+
+      // If we find a slab with a matching chunk size then we store the result.
+      // Otherwise, we need to free the claimed lock and continue. In the case
+      // of out-of-memory we return a sentinel value.
+      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
+          slab->get_chunk_size() == chunk_size) {
+        result = slab;
+      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
+                          slab->get_chunk_size() != chunk_size)) {
+        // Shuffle the start so we don't get stuck behind another slab forever.
+        if (slab->get_chunk_size() != chunk_size)
+          start = impl::hash(start);
+        slots[index].unlock(lane_mask & mask & ~claimed,
+                            mask & ~claimed & uniform);
+      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
+        result = reinterpret_cast<Slab *>(SENTINEL);
+      }
     }
-
-    // Malloc returned a null pointer and we are out-of-memory.
-    if (!slab && reserved == cpp::numeric_limits<uint64_t>::max())
-      return nullptr;
-
-    // The slab is in the process of being initialized. Start at the beginning
-    // to prevent too many slab allocations from happening at once.
-    if (!slab && reserved == 0)
-      offset = 0;
-    sleep_briefly();
   }
-  return nullptr;
+  return result;
 }
 
 // Release the lock associated with a given slab.
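
For context on the rewritten `find_slab` above (not part of the patch): lanes that `match_any` to the same chunk size now probe consecutive candidate slots in parallel, and `ballot`/`shuffle` elect the first available one for the whole group; only a full sweep of `ARRAY_SIZE` slots without a hit makes the group report the sentinel. A rough host-side sketch of that probing pattern, with the wave intrinsics replaced by plain loops and the `slot_available` helper made up for illustration:

```cpp
// Illustrative, host-side only: one uniform group of lanes scanning the slot
// array cooperatively, taking the first available slot (the shuffle election).
#include <array>
#include <cstdint>
#include <cstdio>
#include <optional>

constexpr uint32_t ARRAY_SIZE = 8; // Power-of-two slot count, as in the file.

// Hypothetical stand-in for slots[i].use_count() < available_chunks(...).
static bool slot_available(const std::array<bool, ARRAY_SIZE> &full,
                           uint32_t index) {
  return !full[index];
}

int main() {
  // true == slot already full; slots 2 and 4 still have room.
  std::array<bool, ARRAY_SIZE> full = {true,  true, false, true,
                                       false, true, true,  true};
  uint32_t start = 1;      // Hashed starting index.
  uint32_t group_size = 4; // Lanes that matched on the same chunk size.

  std::optional<uint32_t> index;
  for (uint32_t offset = 0; !index && offset < ARRAY_SIZE;
       offset += group_size) {
    // Each "lane" probes its own candidate; the group then elects the
    // lowest-ranked lane whose candidate was available.
    for (uint32_t rank = 0; rank < group_size && !index; ++rank) {
      uint32_t candidate = (start + offset + rank) % ARRAY_SIZE;
      if (slot_available(full, candidate))
        index = candidate;
    }
  }

  if (index)
    std::printf("group claims slot %u\n", *index);
  else
    std::printf("out of memory: no slot available\n");
  return 0;
}
```

Compared with the old loop, where each lane walked the array independently one slot at a time and slept between probes, the group now advances by `popcount(uniform & lane_mask)` candidates per iteration, so the search cost is shared across the wavefront.
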
@@ -471,7 +483,7 @@ void *allocate(uint64_t size) {
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   Slab *slab = find_slab(chunk_size);
-  if (!slab)
+  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
     return nullptr;
 
   uint64_t lane_mask = gpu::get_lane_mask();