@@ -145,7 +145,7 @@ static inline constexpr bool is_pow2(uint64_t x) {
 // Where this chunk size should start looking in the global array. Small
 // allocations are much more likely than large ones, so we give them the most
 // space. We use a cubic easing function normalized on the possible chunks.
-static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
   constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
   uint64_t norm =
       (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
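
The comment above refers to a cubic easing curve over the possible chunk ids: small chunk sizes should claim most of the global slot array. The hunk cuts off before the cubic step itself, so the following is only a self-contained sketch of the idea; NUM_CHUNK_IDS, ARRAY_SIZE, and the tail of the function are illustrative assumptions, not code from this patch.

#include <cstdint>
#include <cstdio>

// Illustrative constants; the real values live elsewhere in the allocator.
constexpr uint32_t NUM_CHUNK_IDS = 16; // assumed number of supported sizes
constexpr uint32_t ARRAY_SIZE = 1024;  // assumed size of the global slot table

// Sketch of the easing: norm runs from 1.0 (smallest chunk) down to 0.0
// (largest chunk) in 16.16 fixed point; cubing it makes the curve fall off
// quickly, so consecutive small chunk ids are spread far apart and the large
// ones are packed near the end of the table.
constexpr uint32_t start_index_sketch(uint32_t chunk_id) {
  constexpr uint32_t max_chunk = NUM_CHUNK_IDS - 1;
  uint64_t norm = (1 << 16) - (uint64_t(chunk_id) << 16) / max_chunk;
  uint64_t eased = (norm * norm * norm) >> 32; // back to 16.16 fixed point
  return uint32_t((((1 << 16) - eased) * (ARRAY_SIZE - 1)) >> 16);
}

int main() {
  for (uint32_t id = 0; id < NUM_CHUNK_IDS; ++id)
    printf("chunk id %2u -> start slot %u\n", id, start_index_sketch(id));
}

Printing the table shows chunk id 0 starting at slot 0 with large gaps between the first few ids, while the largest ids bunch up near the end of the table, which is the "small allocations get the most space" behaviour the comment describes.
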
@@ -270,10 +270,10 @@ struct Slab {
         continue;
 
       // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
-                                    ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
-                                                 cpp::countr_zero(~after)
-                                           : impl::xorshift32(state));
+      uint32_t start = gpu::shuffle(
+          mask, cpp::countr_zero(uniform & mask),
+          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
 
       uint32_t id = impl::lane_count(uniform & mask);
       uint32_t index = (start + id) % usable_bits(chunk_size);
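
One note on the new fallback branch: __builtin_align_down is a Clang builtin that rounds its first argument down to a multiple of its power-of-two second argument, so the randomly seeded start now lands on a word boundary exactly like the old_index & ~(BITS_IN_WORD - 1) path above it. A minimal host-side check of that equivalence, assuming BITS_IN_WORD is 32 and a Clang compiler:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Assumed value; in the allocator BITS_IN_WORD is the width of a bitfield word.
constexpr uint32_t BITS_IN_WORD = 32;

int main() {
  uint32_t random_start = 0x1234abcd; // stand-in for impl::xorshift32(state)
  // Rounds down to a multiple of BITS_IN_WORD, i.e. clears the low bits.
  uint32_t aligned = __builtin_align_down(random_start, BITS_IN_WORD);
  assert(aligned == (random_start & ~(BITS_IN_WORD - 1)));
  assert(aligned % BITS_IN_WORD == 0);
  printf("0x%x -> 0x%x\n", random_start, aligned); // 0x1234abcd -> 0x1234abc0
}
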
@@ -475,7 +475,7 @@ static GuardPtr slots[ARRAY_SIZE] = {};
 // Keep a cache of the last successful slot for each chunk size. Initialize it
 // to an even spread of the total size. Must be updated if the chunking scheme
 // changes.
-#define S(X) (impl::start_index(X))
+#define S(X) (impl::get_start_index(X))
 static cpp::Atomic<uint32_t> indices[] = {
     S(16),  S(32),  S(48),  S(64),  S(96),  S(112), S(128),
     S(192), S(224), S(256), S(384), S(448), S(512), S(768),
@@ -487,18 +487,18 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S
 
 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
-  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
 
   for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
     uint32_t index =
         !offset ? start
-                : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
+                : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
 
-    if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+    if (!offset ||
+        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
       uint64_t reserved = 0;
 
@@ -521,13 +521,17 @@ static Slab *find_slab(uint32_t chunk_size) {
           slab->get_chunk_size() == chunk_size) {
         if (index != start)
           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+        uniform = uniform & gpu::get_lane_mask();
         return slab;
       } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                           slab->get_chunk_size() != chunk_size)) {
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
       } else if (!slab && reserved == SENTINEL) {
+        uniform = uniform & gpu::get_lane_mask();
         return nullptr;
+      } else {
+        sleep_briefly();
       }
     }
   }
@@ -554,12 +558,12 @@ void *allocate(uint64_t size) {
 
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
-  Slab *slab = find_slab(chunk_size);
+  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+  Slab *slab = find_slab(chunk_size, uniform);
   if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
     return nullptr;
 
   uint64_t lane_mask = gpu::get_lane_mask();
-  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
   void *ptr = slab->allocate(lane_mask, uniform);
   return ptr;
 }
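
Taken together, the last two hunks build the uniform mask once in allocate(), while the lanes sharing a chunk size are still converged, and have each exit path of find_slab() narrow it to the lanes that are still active. Below is a host-side C++ model of just that mask arithmetic; match_any_model and the hard-coded lane masks are illustrative stand-ins, not the gpu:: API used in the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Host-side model: a "lane mask" is just a bitset of lane ids.
using LaneMask = uint64_t;

// Illustrative stand-in for gpu::match_any: from the point of view of `lane`,
// return the set of active lanes whose value equals its own.
static LaneMask match_any_model(LaneMask active,
                                const std::vector<uint32_t> &values,
                                uint32_t lane) {
  LaneMask out = 0;
  for (uint32_t other = 0; other < values.size(); ++other)
    if (((active >> other) & 1) && values[other] == values[lane])
      out |= LaneMask(1) << other;
  return out;
}

int main() {
  // Four lanes request chunk sizes; lanes 0, 1, and 3 want 32 bytes, lane 2
  // wants 64, and all four are active when allocate() starts.
  std::vector<uint32_t> chunk_sizes = {32, 32, 64, 32};
  LaneMask converged = 0b1111;
  LaneMask uniform = match_any_model(converged, chunk_sizes, /*lane=*/0); // 0b1011

  // Inside find_slab() the lanes diverge: suppose lane 3 took a different exit
  // path, so only lanes 0-2 are active when this lane is about to return.
  LaneMask active_now = 0b0111;

  // The patch re-intersects the group with the current lane mask before
  // returning, so Slab::allocate() only coordinates lanes that are really here.
  uniform &= active_now; // 0b0011
  printf("uniform mask handed to Slab::allocate: 0x%llx\n",
         (unsigned long long)uniform);
}

The final intersection is what keeps a lane that has already diverged out of find_slab() from appearing in the mask later handed to Slab::allocate().
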