@@ -145,7 +145,7 @@ static inline constexpr bool is_pow2(uint64_t x) {
 // Where this chunk size should start looking in the global array. Small
 // allocations are much more likely than large ones, so we give them the most
 // space. We use a cubic easing function normalized on the possible chunks.
-static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
   constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
   uint64_t norm =
       (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
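
Note: for readers skimming the diff, here is a standalone sketch of the easing idea. The ARRAY_SIZE and maximum-chunk-id values below are illustrative assumptions, and the cubing step falls outside this hunk; the point is that the 16-bit fixed-point normalization above feeds a cube that biases small chunk sizes toward the front of the slot array, where they have the most slots ahead of them.

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t ARRAY_SIZE = 512;  // assumed slot-table size
    constexpr uint32_t MAX_CHUNK_ID = 26; // assumed id of the largest chunk

    // Map a chunk id to a starting slot with a cubic ease: small ids land
    // near the front of the array, large ids near the back.
    constexpr uint32_t start_index_sketch(uint32_t chunk_id) {
      // 16-bit fixed point; small ids normalize near 1.0 (i.e. 1 << 16).
      uint64_t norm = (1u << 16) - ((uint64_t)chunk_id << 16) / MAX_CHUNK_ID;
      // Cube in fixed point and rescale back to 16 bits.
      uint64_t eased = (norm * norm * norm) >> 32;
      return (uint32_t)((((1u << 16) - eased) * (ARRAY_SIZE - 1)) >> 16);
    }

    int main() {
      for (uint32_t id = 0; id <= MAX_CHUNK_ID; id += 5)
        std::printf("chunk id %2u -> start %u\n", id, start_index_sketch(id));
    }
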
@@ -270,10 +270,10 @@ struct Slab {
         continue;
 
       // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
-                                    ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
-                                                 cpp::countr_zero(~after)
-                                           : impl::xorshift32(state));
+      uint32_t start = gpu::shuffle(
+          mask, cpp::countr_zero(uniform & mask),
+          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
 
       uint32_t id = impl::lane_count(uniform & mask);
       uint32_t index = (start + id) % usable_bits(chunk_size);
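
Note: __builtin_align_down is a Clang alignment builtin; for a power-of-two alignment it reduces to clearing the low bits, the same operation the resumed-search branch spells as old_index & ~(BITS_IN_WORD - 1). A minimal model, assuming BITS_IN_WORD is 32:

    #include <cstdint>

    constexpr uint32_t BITS_IN_WORD = 32; // assumed width of one bitfield word

    // Equivalent of __builtin_align_down(x, BITS_IN_WORD) for a
    // power-of-two alignment: clear the bits below the word boundary.
    constexpr uint32_t align_down_sketch(uint32_t x) {
      return x & ~(BITS_IN_WORD - 1u);
    }

    static_assert(align_down_sketch(37) == 32, "rounds down, not to nearest");

Snapping the random fallback to a word boundary presumably keeps the lanes that fan out from `start` inside a single bitfield word, matching the behavior of the resume branch.
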
@@ -475,7 +475,7 @@ static GuardPtr slots[ARRAY_SIZE] = {};
 // Keep a cache of the last successful slot for each chunk size. Initialize it
 // to an even spread of the total size. Must be updated if the chunking scheme
 // changes.
-#define S(X) (impl::start_index(X))
+#define S(X) (impl::get_start_index(X))
 static cpp::Atomic<uint32_t> indices[] = {
     S(16),   S(32),   S(48),   S(64),   S(96),   S(112),  S(128),
     S(192),  S(224),  S(256),  S(384),  S(448),  S(512),  S(768),
@@ -487,18 +487,18 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S
 
 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
-  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
 
   for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
     uint32_t index =
         !offset ? start
-                : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
+                : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
 
-    if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+    if (!offset ||
+        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
       uint64_t reserved = 0;
 
@@ -521,13 +521,17 @@ static Slab *find_slab(uint32_t chunk_size) {
           slab->get_chunk_size() == chunk_size) {
         if (index != start)
           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+        uniform = uniform & gpu::get_lane_mask();
         return slab;
       } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                           slab->get_chunk_size() != chunk_size)) {
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
       } else if (!slab && reserved == SENTINEL) {
+        uniform = uniform & gpu::get_lane_mask();
         return nullptr;
+      } else {
+        sleep_briefly();
       }
     }
   }
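
Note: the new trailing else backs off when the slot is contended (another warp is still initializing or tearing down the slab) instead of spinning at full rate. sleep_briefly is libc's existing GPU pause helper; the upstream definition may differ, but a hedged sketch of a helper of this shape:

    // Hedged sketch of a brief GPU backoff; not the upstream definition.
    static inline void sleep_briefly() {
    #if defined(__NVPTX__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
      __asm__ volatile("nanosleep.u32 64;" ::: "memory"); // sm_70+ nanosleep
    #elif defined(__AMDGCN__)
      __builtin_amdgcn_s_sleep(2); // idle the wave for a few cycles
    #else
      __asm__ volatile("" ::: "memory"); // fallback: compiler barrier only
    #endif
    }
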
@@ -554,12 +558,12 @@ void *allocate(uint64_t size) {
 
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
-  Slab *slab = find_slab(chunk_size);
+  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+  Slab *slab = find_slab(chunk_size, uniform);
   if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
     return nullptr;
 
   uint64_t lane_mask = gpu::get_lane_mask();
-  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
   void *ptr = slab->allocate(lane_mask, uniform);
   return ptr;
 }
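
Note: the uniform mask is now computed from chunk_size once, up front, and find_slab re-intersects it with the live lane mask on each exit path; previously it was derived from the slab's global index after the lookup. A scalar model of the semantics assumed here for gpu::match_any (patterned after CUDA's __match_any_sync; illustrative only, the real helper is warp-wide):

    #include <cstdint>

    // Scalar model: the result has a bit set for every lane in 'mask' whose
    // value equals lane 'me''s value, so lanes requesting the same chunk
    // size share one mask and can cooperate on a single slab lookup.
    uint64_t match_any_model(uint64_t mask, const uint32_t values[64],
                             uint32_t me) {
      uint64_t result = 0;
      for (uint32_t lane = 0; lane < 64; ++lane)
        if (((mask >> lane) & 1) && values[lane] == values[me])
          result |= uint64_t(1) << lane;
      return result;
    }
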