Commit b232277

[libc] Reduce reference counter to a 32-bit integer (#150961)
Summary: This reference counter tracks how many threads are using a given slab. It is currently a 64-bit integer; this patch reduces it to a 32-bit integer. The benefit is that we save a few registers, since we no longer need two of them for these operations. This increases the risk of overflow, but the largest value we accept for a single slab is ~131,000, which is a long way off the maximum of four billion or so. We can still oversubscribe the reference count by having threads attempt to claim the lock and then try to free it, but I assert that it is exceedingly unlikely that we will ever have over four billion GPU threads stalled in the same place. A later optimization could split the reference counter and pointers into a struct of arrays, which would save 128 KiB of static memory (we currently use 512 KiB for the slab array).
1 parent 701de35 commit b232277
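
As context for the overflow argument above: the top two bits of the new 32-bit counter are reserved as the INVALID and HELPED flags (see the diff below), so the usable counting range is roughly 2^30, still several orders of magnitude above the ~131,000 references a single slab can legitimately accumulate. The following is a minimal illustrative sketch of that layout, using std::atomic in place of LLVM libc's cpp::Atomic; it is not the upstream implementation.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Illustrative sketch only: mirrors the flag layout the patch gives the
// 32-bit reference counter. The top two bits are reserved, leaving ~2^30
// of counting range above the ~131,000 chunks a single slab can hand out.
struct RefCounterSketch {
  // Indicates that the object is in its deallocation phase and thus invalid.
  static constexpr uint32_t INVALID = uint32_t(1) << 31;
  // Set when a read preempts an unlock call so ownership can be handed off.
  static constexpr uint32_t HELPED = uint32_t(1) << 30;

  // Try to take n references; fails once the slab has entered teardown.
  bool acquire(uint32_t n, uint32_t &count) {
    count = counter.fetch_add(n, std::memory_order_relaxed) + n;
    return (count & INVALID) == 0;
  }

  std::atomic<uint32_t> counter{0};
};

int main() {
  RefCounterSketch ref;
  uint32_t count = 0;
  std::printf("acquired: %d, count: %u\n", ref.acquire(32, count), count);
}

The real RefCounter pairs this acquire with a release path that swaps INVALID in once the count drops back to zero, as the compare_exchange in the diff below shows.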

File tree: 1 file changed (+27, -23 lines)


libc/src/__support/GPU/allocator.cpp

Lines changed: 27 additions & 23 deletions
@@ -39,9 +39,6 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;

-// A sentinel used to indicate an invalid but non-null pointer value.
-constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
-
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");

 namespace impl {
@@ -163,6 +160,11 @@ static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
   return BITS_IN_DWORD - cpp::countl_zero(ballot & ~mask) - 1;
 }

+// We use a sentinel value to indicate a failed or in-progress allocation.
+template <typename T> bool is_sentinel(const T &x) {
+  return x == cpp::numeric_limits<T>::max();
+}
+
 } // namespace impl

 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -343,20 +345,20 @@ struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
-    static constexpr uint64_t INVALID = uint64_t(1) << 63;
+    static constexpr uint32_t INVALID = uint32_t(1) << 31;

     // If a read preempts an unlock call we indicate this so the following
     // unlock call can swap out the helped bit and maintain exclusive ownership.
-    static constexpr uint64_t HELPED = uint64_t(1) << 62;
+    static constexpr uint32_t HELPED = uint32_t(1) << 30;

     // Resets the reference counter, cannot be reset to zero safely.
-    void reset(uint32_t n, uint64_t &count) {
+    void reset(uint32_t n, uint32_t &count) {
       counter.store(n, cpp::MemoryOrder::RELAXED);
       count = n;
     }

     // Acquire a slot in the reference counter if it is not invalid.
-    bool acquire(uint32_t n, uint64_t &count) {
+    bool acquire(uint32_t n, uint32_t &count) {
       count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
       return (count & INVALID) == 0;
     }
@@ -369,7 +371,7 @@ struct GuardPtr {
       // another thread resurrected the counter and we quit, or a parallel read
      // helped us invalidating it. For the latter, claim that flag and return.
       if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
-        uint64_t expected = 0;
+        uint32_t expected = 0;
         if (counter.compare_exchange_strong(expected, INVALID,
                                             cpp::MemoryOrder::RELAXED,
                                             cpp::MemoryOrder::RELAXED))
@@ -392,28 +394,29 @@ struct GuardPtr {
       return (val & INVALID) ? 0 : val;
     }

-    cpp::Atomic<uint64_t> counter{0};
+    cpp::Atomic<uint32_t> counter{0};
   };

-  cpp::Atomic<Slab *> ptr{nullptr};
-  RefCounter ref{};
+  cpp::Atomic<Slab *> ptr;
+  RefCounter ref;

   // Should be called by a single lane for each different pointer.
   template <typename... Args>
-  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+  Slab *try_lock_impl(uint32_t n, uint32_t &count, Args &&...args) {
     Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
         ptr.compare_exchange_strong(
-            expected, reinterpret_cast<Slab *>(SENTINEL),
+            expected,
+            reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
             cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
-      count = cpp::numeric_limits<uint64_t>::max();
+      count = cpp::numeric_limits<uint32_t>::max();
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
       return new (raw) Slab(cpp::forward<Args>(args)...);
     }

-    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
+    if (!expected || impl::is_sentinel(reinterpret_cast<uintptr_t>(expected)))
       return nullptr;

     if (!ref.acquire(n, count))
@@ -425,7 +428,7 @@ struct GuardPtr {

   // Finalize the associated memory and signal that it is ready to use by
   // resetting the counter.
-  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+  void finalize(Slab *mem, uint32_t n, uint32_t &count) {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     ptr.store(mem, cpp::MemoryOrder::RELAXED);
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
@@ -438,7 +441,7 @@ struct GuardPtr {
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint32_t &count,
                  Args &&...args) {
     count = 0;
     Slab *result = nullptr;
@@ -453,13 +456,13 @@ struct GuardPtr {

     // We defer storing the newly allocated slab until now so that we can use
     // multiple lanes to initialize it and release it for use.
-    if (count == cpp::numeric_limits<uint64_t>::max()) {
+    if (impl::is_sentinel(count)) {
       result->initialize(uniform);
       if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
         finalize(result, cpp::popcount(uniform), count);
     }

-    if (count != cpp::numeric_limits<uint64_t>::max())
+    if (!impl::is_sentinel(count))
       count = count - cpp::popcount(uniform) +
               impl::lane_count(uniform, gpu::get_lane_id()) + 1;

@@ -515,14 +518,15 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
     if (!offset ||
         slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
-      uint64_t reserved = 0;
+      uint32_t reserved = 0;

       Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);

       // If there is a slab allocation in progress we retry a few times.
       for (uint32_t retries = 0;
-           retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+           !slab && !impl::is_sentinel(reserved) && retries < MAX_TRIES;
+           retries++) {
         uint64_t lane_mask = gpu::get_lane_mask();
         slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
                                      chunk_size, index);
@@ -542,7 +546,7 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
                  slab->get_chunk_size() != chunk_size)) {
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
-      } else if (!slab && reserved == SENTINEL) {
+      } else if (!slab && impl::is_sentinel(reserved)) {
         uniform = uniform & gpu::get_lane_mask();
         return nullptr;
       } else {
@@ -575,7 +579,7 @@ void *allocate(uint64_t size) {
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
   Slab *slab = find_slab(chunk_size, uniform);
-  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
+  if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
     return nullptr;

   uint64_t lane_mask = gpu::get_lane_mask();
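
To make the sentinel change above concrete: the dedicated SENTINEL constant is gone, and both the in-flight slab pointer and the reservation count are now detected through the templated impl::is_sentinel helper, which treats an all-ones value of any unsigned type as "failed or in progress". Below is a small self-contained sketch of the same pattern using the standard library in place of libc's cpp:: wrappers; the names are illustrative only.

#include <cstdint>
#include <cstdio>
#include <limits>

// Same shape as impl::is_sentinel from the patch: an all-ones value marks a
// failed or in-progress allocation for whichever unsigned type is inspected.
template <typename T> bool is_sentinel(const T &x) {
  return x == std::numeric_limits<T>::max();
}

int main() {
  // A 32-bit reservation count uses UINT32_MAX as its sentinel...
  uint32_t reserved = std::numeric_limits<uint32_t>::max();
  // ...while a claimed-but-unpublished pointer slot is all-ones, checked
  // after casting the pointer back to an integer.
  void *claimed =
      reinterpret_cast<void *>(std::numeric_limits<uintptr_t>::max());

  std::printf("count sentinel: %d\n", is_sentinel(reserved));
  std::printf("pointer sentinel: %d\n",
              is_sentinel(reinterpret_cast<uintptr_t>(claimed)));
}

This is why the pointer checks in try_lock_impl and allocate cast to uintptr_t before the comparison: the sentinel pointer is simply the all-ones address rather than a named constant.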
