Commit 3ff2cfd

Improve find slab code for better register use

1 parent 0d5f5be commit 3ff2cfd
1 file changed: +58 -46 lines changed
libc/src/__support/GPU/allocator.cpp

Lines changed: 58 additions & 46 deletions
@@ -32,6 +32,9 @@ constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 
+// A sentinel used to indicate an invalid but non-null pointer value.
+constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
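
Together with the removal from GuardPtr in the next hunk, this hoists SENTINEL to file scope so both GuardPtr and the new find_slab can share one "invalid but non-null" marker: an all-ones bit pattern is never a real Slab address, so it can travel through pointer-typed returns while staying distinguishable from nullptr. A minimal sketch of the pattern, with an is_oom helper of my own invention (not part of the patch):

  #include <cstdint>
  #include <limits>

  struct Slab;

  // All bits set is never a valid allocation address, so it serves as an
  // "out of memory" marker that is still truthy when tested as a pointer.
  constexpr uint64_t SENTINEL = std::numeric_limits<uint64_t>::max();

  // Hypothetical helper: tells the OOM marker apart from a real slab.
  inline bool is_oom(Slab *s) {
    return s == reinterpret_cast<Slab *>(SENTINEL);
  }
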
@@ -322,9 +325,6 @@ template <typename T> struct GuardPtr {
   cpp::Atomic<T *> ptr{nullptr};
   RefCounter ref{};
 
-  // A sentinel value used to claim the pointer slot.
-  static constexpr uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
-
   // Should be called be a single lane for each different pointer.
   template <typename... Args>
   T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
@@ -370,14 +370,14 @@ template <typename T> struct GuardPtr {
       result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
     result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
+    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
 
     if (!result)
       return nullptr;
 
-    // Obtain the value of the reference counter for each lane given the
-    // aggregate value.
-    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count) -
-            cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+    if (count != cpp::numeric_limits<uint64_t>::max())
+      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+
     return result;
   }

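The broadcast is now explicit: the leader's aggregate count is shuffled to every lane, and each lane then rebases it locally as count - popcount(uniform) + lane_count(uniform) + 1, skipping the rebase when the broadcast value is the all-ones sentinel (the new guard). A host-side sketch of that arithmetic with made-up numbers (three lanes, aggregate 8); std::popcount stands in for the cpp:: and impl:: helpers:

  #include <bit>
  #include <cstdint>
  #include <cstdio>

  int main() {
    // Suppose lanes 1, 4, and 6 form one uniform group (they matched on
    // the same pointer slot) and the leader reserved an aggregate of 8.
    uint64_t uniform = 0b0101'0010; // bits 1, 4, 6
    uint64_t aggregate = 8;         // value broadcast by the shuffle

    for (uint32_t lane : {1u, 4u, 6u}) {
      // impl::lane_count equivalent: set bits of `uniform` below this lane.
      int below = std::popcount(uniform & ((uint64_t(1) << lane) - 1));
      uint64_t count = aggregate - std::popcount(uniform) + below + 1;
      std::printf("lane %u -> count %llu\n", lane, (unsigned long long)count);
    }
    // Prints 6, 7, 8: each lane receives a distinct reference count ending
    // at the leader's aggregate, which is exactly what the rebasing does.
    return 0;
  }
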
@@ -406,48 +406,60 @@ static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
 static Slab *find_slab(uint32_t chunk_size) {
   // We start at a hashed value to spread out different chunk sizes.
   uint32_t start = impl::hash(chunk_size);
-  for (uint32_t offset = 0; offset < ARRAY_SIZE;) {
-    uint32_t index = (offset + start) % ARRAY_SIZE;
-
-    // If this slot is too full we exit early.
-    if (slots[index].use_count() >= Slab::available_chunks(chunk_size)) {
-      offset++;
-      sleep_briefly();
-      continue;
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
+
+  Slab *result = nullptr;
+  for (uint64_t mask = lane_mask; mask;
+       mask = gpu::ballot(lane_mask, !result)) {
+    uint32_t index = cpp::numeric_limits<uint32_t>::max();
+    for (uint32_t offset = 0;
+         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
+         offset += cpp::popcount(uniform & lane_mask)) {
+      uint32_t candidate =
+          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
+      uint64_t available =
+          gpu::ballot(lane_mask, slots[candidate].use_count() <
+                                     Slab::available_chunks(chunk_size));
+      uint32_t new_index = gpu::shuffle(
+          lane_mask, cpp::countr_zero(available & uniform), candidate);
+
+      // Each uniform group will use the first empty slot they find.
+      if (index == cpp::numeric_limits<uint32_t>::max() &&
+          (available & uniform))
+        index = new_index;
+
+      if (offset >= ARRAY_SIZE)
+        result = reinterpret_cast<Slab *>(SENTINEL);
     }
 
-    uint64_t lane_mask = gpu::get_lane_mask();
-    uint64_t uniform = gpu::match_any(lane_mask, index);
-    uint64_t reserved = 0;
-    Slab *slab =
-        slots[index].try_lock(lane_mask, uniform, reserved, chunk_size, index);
-    gpu::sync_lane(lane_mask);
-
-    // We successfully obtained a slab with enough space for our allocation.
-    // This guarantees that a call to Slab::allocate will always succeed.
-    if (slab && reserved <= Slab::available_chunks(chunk_size) &&
-        slab->get_chunk_size() == chunk_size)
-      return slab;
-
-    // We encountered either a full slab or an slab with an incompatible chunk
-    // size. Move to the next slot.
-    if (slab && reserved > Slab::available_chunks(chunk_size) &&
-        slab->get_chunk_size() == chunk_size) {
-      slots[index].unlock(gpu::get_lane_mask(), gpu::get_lane_mask() & uniform);
-      offset++;
+    // Try to claim a slot for the found slot.
+    if (!result) {
+      uint64_t reserved = 0;
+      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+                                         reserved, chunk_size, index);
+      uint64_t claimed = gpu::ballot(
+          lane_mask & mask, reserved <= Slab::available_chunks(chunk_size));
+
+      // If we find a slab with a matching chunk size then we store the result.
+      // Otherwise, we need to free the claimed lock and continue. In the case
+      // of out-of-memory we return a sentinel value.
+      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
+          slab->get_chunk_size() == chunk_size) {
+        result = slab;
+      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
+                          slab->get_chunk_size() != chunk_size)) {
+        // Shuffle the start so we don't get stuck behind another slab forever.
+        if (slab->get_chunk_size() != chunk_size)
+          start = impl::hash(start);
+        slots[index].unlock(lane_mask & mask & ~claimed,
+                            mask & ~claimed & uniform);
+      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
+        result = reinterpret_cast<Slab *>(SENTINEL);
+      }
     }
-
-    // Malloc returned a null pointer and we are out-of-memory.
-    if (!slab && reserved == cpp::numeric_limits<uint64_t>::max())
-      return nullptr;
-
-    // The slab is in the process of being initialized. Start at the beginning
-    // to prevent too many slab allocations from happening at once.
-    if (!slab && reserved == 0)
-      offset = 0;
-    sleep_briefly();
   }
-  return nullptr;
+  return result;
 }
 
 // Release the lock associated with a given slab.
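
The rewrite turns the slot scan into a wave-cooperative search: lanes that matched on the same chunk_size (via gpu::match_any) each probe a different candidate slot, gpu::ballot collects which probes found room, and gpu::shuffle broadcasts the first hit so the whole group agrees on one index and advances popcount(uniform) slots per pass instead of one. Below is a single-threaded sketch of that search pattern under assumptions of my own (a 4-lane group, a toy use_count table, capacity 2 per slot); the real code additionally keeps per-group masks so different chunk sizes scan independently:

  #include <bit>
  #include <cstdint>
  #include <cstdio>

  constexpr uint32_t ARRAY_SIZE = 8;
  constexpr uint32_t WAVE = 4; // lanes cooperating on one chunk size

  // Hypothetical occupancy per slot; a slot is "available" while below 2.
  uint32_t use_count[ARRAY_SIZE] = {2, 2, 2, 1, 2, 0, 2, 2};

  int main() {
    uint32_t start = 0;
    // Each pass, the four lanes probe four consecutive slots at once, so
    // the group advances WAVE slots per iteration instead of one.
    for (uint32_t offset = 0; offset < ARRAY_SIZE; offset += WAVE) {
      uint64_t available = 0; // "ballot": one bit per lane whose probe passed
      for (uint32_t lane = 0; lane < WAVE; ++lane) {
        uint32_t candidate = (start + offset + lane) % ARRAY_SIZE;
        if (use_count[candidate] < 2)
          available |= uint64_t(1) << lane;
      }
      if (available) {
        // "shuffle": every lane adopts the first successful probe's slot.
        uint32_t winner = std::countr_zero(available);
        std::printf("group picks slot %u\n",
                    (start + offset + winner) % ARRAY_SIZE);
        return 0;
      }
    }
    std::printf("all slots full: report the OOM sentinel\n");
    return 0;
  }
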
@@ -471,7 +483,7 @@ void *allocate(uint64_t size) {
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   Slab *slab = find_slab(chunk_size);
-  if (!slab)
+  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
     return nullptr;
 
   uint64_t lane_mask = gpu::get_lane_mask();
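
One consequence of the convergent outer loop is that an out-of-memory lane cannot simply produce nullptr: the exit ballot, gpu::ballot(lane_mask, !result), keeps every lane spinning until all results are non-null, which is why find_slab stores the SENTINEL and allocate translates it back to nullptr here. A scalar sketch of that exit condition (two simulated lanes; my own framing, not the patch's code):

  #include <cstdint>
  #include <cstdio>

  int main() {
    constexpr uint64_t SENTINEL = ~uint64_t(0);
    uint64_t result[2] = {0, 0}; // two "lanes"; 0 plays the role of nullptr
    uint64_t mask = 0b11;        // ballot of lanes still searching
    while (mask) {
      if (mask & 0b01)
        result[0] = 0x1000;   // lane 0 finds a slab (made-up address)
      if (mask & 0b10)
        result[1] = SENTINEL; // lane 1 is out of memory: non-null marker
      // gpu::ballot(lane_mask, !result) equivalent: keep unfinished lanes.
      mask = (result[0] ? 0 : 0b01) | (result[1] ? 0 : 0b10);
    }
    // allocate() then maps the sentinel back to an ordinary nullptr failure.
    std::printf("lane 1: %s\n", result[1] == SENTINEL ? "OOM" : "got a slab");
    return 0;
  }
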
