@@ -36,6 +36,10 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
3636// Sentinel (all bits set) for an invalid but non-null pointer; find_slab stores it as its result on out-of-memory.
3737constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t >::max();
3838
39+ // The number of search passes find_slab makes from one starting offset before
40+ // skipping past it; each pass begins its scan at offset (nudge / MAX_TRIES).
41+ constexpr static uint32_t MAX_TRIES = 512 ;
42+
3943static_assert (!(ARRAY_SIZE & (ARRAY_SIZE - 1 )), " Must be a power of two" );
4044
4145namespace impl {
@@ -413,10 +417,11 @@ static Slab *find_slab(uint32_t chunk_size) {
413417 uint64_t uniform = gpu::match_any (lane_mask, chunk_size);
414418
415419 Slab *result = nullptr ;
420+ uint32_t nudge = 0 ;
416421 for (uint64_t mask = lane_mask; mask;
417- mask = gpu::ballot (lane_mask, !result)) {
422+ mask = gpu::ballot (lane_mask, !result), ++nudge ) {
418423 uint32_t index = cpp::numeric_limits<uint32_t >::max ();
419- for (uint32_t offset = 0 ;
424+ for (uint32_t offset = nudge / MAX_TRIES ;
420425 gpu::ballot (lane_mask, index == cpp::numeric_limits<uint32_t >::max ());
421426 offset += cpp::popcount (uniform & lane_mask)) {
422427 uint32_t candidate =
@@ -428,8 +433,9 @@ static Slab *find_slab(uint32_t chunk_size) {
428433 lane_mask, cpp::countr_zero (available & uniform), candidate);
429434
430435 // Each uniform group will use the first empty slot they find.
431- if (index == cpp::numeric_limits<uint32_t >::max () &&
432- (available & uniform))
436+ if (offset >= ARRAY_SIZE ||
437+ (index == cpp::numeric_limits<uint32_t >::max () &&
438+ (available & uniform)))
433439 index = new_index;
434440
435441 if (offset >= ARRAY_SIZE)
@@ -441,9 +447,6 @@ static Slab *find_slab(uint32_t chunk_size) {
441447 uint64_t reserved = 0 ;
442448 Slab *slab = slots[index].try_lock (lane_mask & mask, uniform & mask,
443449 reserved, chunk_size, index);
444- uint64_t claimed = gpu::ballot (
445- lane_mask & mask, reserved <= Slab::available_chunks (chunk_size));
446-
447450 // If we find a slab with a matching chunk size then we store the result.
448451 // Otherwise, we need to free the claimed lock and continue. In the case
449452 // of out-of-memory we return a sentinel value.
@@ -452,13 +455,14 @@ static Slab *find_slab(uint32_t chunk_size) {
452455 result = slab;
453456 } else if (slab && (reserved > Slab::available_chunks (chunk_size) ||
454457 slab->get_chunk_size () != chunk_size)) {
455- // Shuffle the start so we don't get stuck behind another slab forever.
456458 if (slab->get_chunk_size () != chunk_size)
457- start = impl::hash (start) ;
458- slots[index].unlock (lane_mask & mask & ~claimed ,
459- mask & ~claimed & uniform);
459+ start = index + 1 ;
460+ slots[index].unlock (gpu::get_lane_mask () ,
461+ gpu::get_lane_mask () & uniform);
460462 } else if (!slab && reserved == cpp::numeric_limits<uint64_t >::max ()) {
461463 result = reinterpret_cast <Slab *>(SENTINEL);
464+ } else {
465+ sleep_briefly ();
462466 }
463467 }
464468 }
0 commit comments