
Commit df1dd80

jhuber6 and shiltian authored
[libc] Cache the most recently used slot for a chunk size (#149751)
Summary: This patch changes the `find_slab` logic to simply cache the most recently successful slot for each chunk size. The happy fast path is now a single atomic load on this index. I removed the SIMT shuffling logic that did slab lookups wave-parallel. Here I am considering the actual traversal to be comparatively unlikely, so it's not overly bad that it takes longer. Ideally, one thread finds a slot and shares it with the rest so we only pay that cost once.

Co-authored-by: Shilei Tian <[email protected]>
1 parent 6a98171 commit df1dd80
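
The caching scheme in miniature, as a host-side C++ sketch. The table size, chunk count, and helper names here are hypothetical stand-ins; the real allocator also coordinates SIMT lanes with `match_any`/`ballot` and claims slots through `GuardPtr::try_lock`:

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t ARRAY_SIZE = 8;      // hypothetical, power of two
    constexpr uint32_t CHUNKS_PER_SLOT = 4; // hypothetical capacity per slot

    std::atomic<uint32_t> use_count[ARRAY_SIZE]; // zero-initialized
    std::atomic<uint32_t> cached_index{0};       // last successful slot

    // Returns the claimed slot index, or UINT32_MAX when the table is full.
    uint32_t find_slot() {
      // Fast path: one relaxed load of the last successful index.
      uint32_t start = cached_index.load(std::memory_order_relaxed);
      for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
        uint32_t index = !offset ? start : (start + offset) % ARRAY_SIZE;
        uint32_t count = use_count[index].load(std::memory_order_relaxed);
        // Try to reserve one chunk in this slot; retry the CAS on contention.
        while (count < CHUNKS_PER_SLOT) {
          if (use_count[index].compare_exchange_weak(
                  count, count + 1, std::memory_order_relaxed)) {
            if (index != start) // publish the new hot slot for later callers
              cached_index.store(index, std::memory_order_relaxed);
            return index;
          }
        }
      }
      return UINT32_MAX;
    }

    int main() {
      for (int i = 0; i < 10; ++i)
        std::printf("claimed slot %u\n", find_slot());
    }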

File tree: 1 file changed (+63, -59)

libc/src/__support/GPU/allocator.cpp

Lines changed: 63 additions & 59 deletions
@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
+// The number of times to attempt claiming an in-progress slab allocation.
+constexpr static uint32_t MAX_TRIES = 128;
+
 // A sentinel used to indicate an invalid but non-null pointer value.
 constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
 
-// The number of times we will try starting on a single index before skipping
-// past it.
-constexpr static uint32_t MAX_TRIES = 512;
-
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
   return state * 0x9e3779bb;
 }
 
-// Final stage of murmurhash used to get a unique index for the global array
-static inline uint32_t hash(uint32_t x) {
-  x ^= x >> 16;
-  x *= 0x85ebca6b;
-  x ^= x >> 13;
-  x *= 0xc2b2ae35;
-  x ^= x >> 16;
-  return x;
-}
-
 // Rounds the input value to the closest permitted chunk size. Here we accept
 // the sum of the closest three powers of two. For a 2MiB slab size this is 48
 // different chunk sizes. This gives us average internal fragmentation of 87.5%.
-static inline uint32_t get_chunk_size(uint32_t x) {
+static inline constexpr uint32_t get_chunk_size(uint32_t x) {
   uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
   uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
 
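The middle of `get_chunk_size()` is elided by the diff, so for reference here is a self-contained reconstruction of the rounding scheme the comment describes: permitted sizes are n * 2^k for n in {4, 6, 7, 8}, rounded up to MIN_SIZE alignment. This is an illustrative sketch, not the patch itself, using `<bit>` in place of the `cpp::` helpers, with a few spot checks:

    #include <bit>
    #include <cstdint>

    constexpr uint32_t MIN_SIZE = 16;
    constexpr uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
    constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

    // Round up to the nearest 4*2^k, 6*2^k, 7*2^k, or 8*2^k, then to
    // MIN_SIZE alignment.
    constexpr uint32_t get_chunk_size(uint32_t x) {
      uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
      uint32_t pow2 = BITS_IN_WORD - std::countl_zero(y - 1);
      uint32_t s0 = 0b0100 << (pow2 - 3);
      uint32_t s1 = 0b0110 << (pow2 - 3);
      uint32_t s2 = 0b0111 << (pow2 - 3);
      if (s0 > y)
        return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
      if (s1 > y)
        return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
      if (s2 > y)
        return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
      return ((0b1000 << (pow2 - 3)) + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
    }

    static_assert(get_chunk_size(1) == 16);    // clamped to MIN_SIZE
    static_assert(get_chunk_size(100) == 112); // 7 * 16
    static_assert(get_chunk_size(113) == 128); // 8 * 16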
@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
   return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
 }
 
+// Converts a chunk size into an index suitable for a statically sized array.
+static inline constexpr uint32_t get_chunk_id(uint32_t x) {
+  if (x <= MIN_SIZE)
+    return 0;
+  uint32_t y = x >> 4;
+  if (x < MIN_SIZE << 2)
+    return cpp::popcount(y);
+  return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
+}
+
 // Rounds to the nearest power of two.
 template <uint32_t N, typename T>
 static inline constexpr T round_up(const T x) {
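
A quick sanity check of the new `get_chunk_id()`: it maps each permitted chunk size to a dense 0-based ID, inverting the rounding above. A standalone copy for illustration, using `<bit>` in place of the `cpp::` helpers:

    #include <bit>
    #include <cstdint>

    constexpr uint32_t MIN_SIZE = 16;
    constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

    constexpr uint32_t get_chunk_id(uint32_t x) {
      if (x <= MIN_SIZE)
        return 0;
      uint32_t y = x >> 4;
      if (x < MIN_SIZE << 2)
        return std::popcount(y);
      return std::popcount(y) + 3 * (BITS_IN_WORD - std::countl_zero(y)) - 7;
    }

    // Chunk sizes n * 2^k for n in {4, 6, 7, 8} get consecutive IDs.
    static_assert(get_chunk_id(16) == 0);
    static_assert(get_chunk_id(32) == 1);
    static_assert(get_chunk_id(48) == 2);
    static_assert(get_chunk_id(64) == 3);
    static_assert(get_chunk_id(96) == 4);
    static_assert(get_chunk_id(112) == 5);
    static_assert(get_chunk_id(128) == 6);
    static_assert(get_chunk_id(1048576) == 45); // SLAB_SIZE / 2 for a 2MiB slab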
@@ -143,6 +142,12 @@ static inline constexpr bool is_pow2(uint64_t x) {
   return x && (x & (x - 1)) == 0;
 }
 
+// Where this chunk size should start looking in the global array.
+static inline constexpr uint32_t start_index(uint32_t chunk_index) {
+  return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
+         impl::get_chunk_id(SLAB_SIZE / 2);
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
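
With a 2MiB slab, `get_chunk_id(SLAB_SIZE / 2)` is 45, so `start_index()` spreads the chunk IDs proportionally across the slot array. A sketch reusing `get_chunk_id()` from the block above; the `ARRAY_SIZE` value here is hypothetical, chosen only for illustration:

    #include <cstdint>

    constexpr uint32_t ARRAY_SIZE = 8; // hypothetical table size
    constexpr uint32_t SLAB_SIZE = 2 * 1024 * 1024;

    constexpr uint32_t start_index(uint32_t chunk_size) {
      return (ARRAY_SIZE * get_chunk_id(chunk_size)) /
             get_chunk_id(SLAB_SIZE / 2);
    }

    // IDs 0..45 land on proportionally spaced starting slots.
    static_assert(start_index(16) == 0);    // id 0  -> 8 * 0  / 45 = 0
    static_assert(start_index(128) == 1);   // id 6  -> 8 * 6  / 45 = 1
    static_assert(start_index(2048) == 3);  // id 18 -> 8 * 18 / 45 = 3
    static_assert(start_index(65536) == 5); // id 33 -> 8 * 33 / 45 = 5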
@@ -451,66 +456,65 @@ struct GuardPtr {
 // The global array used to search for a valid slab to allocate from.
 static GuardPtr slots[ARRAY_SIZE] = {};
 
+// Keep a cache of the last successful slot for each chunk size. Initialize it
+// to an even spread of the total size. Must be updated if the chunking scheme
+// changes.
+#define S(X) (impl::start_index(X))
+static cpp::Atomic<uint32_t> indices[] = {
+    S(16),     S(32),     S(48),     S(64),     S(96),     S(112),    S(128),
+    S(192),    S(224),    S(256),    S(384),    S(448),    S(512),    S(768),
+    S(896),    S(1024),   S(1536),   S(1792),   S(2048),   S(3072),   S(3584),
+    S(4096),   S(6144),   S(7168),   S(8192),   S(12288),  S(14336),  S(16384),
+    S(24576),  S(28672),  S(32768),  S(49152),  S(57344),  S(65536),  S(98304),
+    S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
+    S(524288), S(786432), S(917504), S(1048576)};
+#undef S
+
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {
-  // We start at a hashed value to spread out different chunk sizes.
-  uint32_t start = impl::hash(chunk_size);
-  uint64_t lane_mask = gpu::get_lane_mask();
-  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
-
-  Slab *result = nullptr;
-  uint32_t nudge = 0;
-  for (uint64_t mask = lane_mask; mask;
-       mask = gpu::ballot(lane_mask, !result), ++nudge) {
-    uint32_t index = cpp::numeric_limits<uint32_t>::max();
-    for (uint32_t offset = nudge / MAX_TRIES;
-         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
-         offset += cpp::popcount(uniform & lane_mask)) {
-      uint32_t candidate =
-          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
-      uint64_t available =
-          gpu::ballot(lane_mask, slots[candidate].use_count() <
-                                     Slab::available_chunks(chunk_size));
-      uint32_t new_index = gpu::shuffle(
-          lane_mask, cpp::countr_zero(available & uniform), candidate);
-
-      // Each uniform group will use the first empty slot they find.
-      if ((index == cpp::numeric_limits<uint32_t>::max() &&
-           (available & uniform)))
-        index = new_index;
-
-      // Guaruntees that this loop will eventuall exit if there is no space.
-      if (offset >= ARRAY_SIZE) {
-        result = reinterpret_cast<Slab *>(SENTINEL);
-        index = 0;
-      }
-    }
+  // We start at the index of the last successful allocation for this kind.
+  uint32_t chunk_id = impl::get_chunk_id(chunk_size);
+  uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
+  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+
+  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+    uint32_t index =
+        !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
 
-    // Try to claim a slot for the found slot.
-    if (!result) {
+    if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+      uint64_t lane_mask = gpu::get_lane_mask();
       uint64_t reserved = 0;
-      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
+
+      // If there is a slab allocation in progress we retry a few times.
+      for (uint32_t retries = 0;
+           retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+        uint64_t lane_mask = gpu::get_lane_mask();
+        slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+                                     chunk_size, index);
+        sleep_briefly();
+      }
+
       // If we find a slab with a matching chunk size then we store the result.
       // Otherwise, we need to free the claimed lock and continue. In the case
-      // of out-of-memory we return a sentinel value.
+      // of out-of-memory we receive a sentinel value and return a failure.
       if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
-        result = slab;
+        if (index != start)
+          indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+        return slab;
      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
-        if (slab->get_chunk_size() != chunk_size)
-          start = index + 1;
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
-      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
-        result = reinterpret_cast<Slab *>(SENTINEL);
-      } else {
-        sleep_briefly();
+      } else if (!slab && reserved == SENTINEL) {
+        return nullptr;
      }
    }
  }
-  return result;
+  return nullptr;
 }
 
 // Release the lock associated with a given slab.
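
A note on the ordering choice: both the load and the store of `indices[]` use `cpp::MemoryOrder::RELAXED` because the cached index is only a hint. Whatever value a thread reads, the slot it names is still validated through `use_count()` and `try_lock()` before use, so a stale hint costs extra probing, never correctness. In isolation the pattern looks like this sketch (names are illustrative, not from the patch):

    #include <atomic>
    #include <cstdint>

    // A relaxed "hint" cell in the spirit of indices[]: readers treat it as a
    // starting point only, so no acquire/release pairing is needed.
    std::atomic<uint32_t> hot_slot{0};

    uint32_t read_hint() {
      // Fast path: one relaxed load; the caller still scans and validates.
      return hot_slot.load(std::memory_order_relaxed);
    }

    void publish_hint(uint32_t index, uint32_t start) {
      // Mirrors the patch: only write back when the search moved, keeping
      // the common case to a single load with no stores.
      if (index != start)
        hot_slot.store(index, std::memory_order_relaxed);
    }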
