
Commit ddfb1b9

jhuber6 authored and krishna2803 committed
[libc] Start slab search at number of allocated bits
Summary: This patch changes the slab search to start at the number of allocated bits. Previously we searched from a random position; starting at the allocation count gives very good performance when doing nothing but allocating, which is a common configuration. This will degrade performance when malloc and free are mixed close together, as the search is more likely to fail once the counter starts decreasing.
1 parent 355901f commit ddfb1b9
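
To illustrate the idea in the summary: as long as nothing has been freed, the first `reserved` bits of a slab's bitfield are all set, so a scan that begins at `reserved` lands on a free bit on the first probe. The following is a minimal, single-threaded C++ sketch of that search order, not code from the patch; `find_free_bit` and its std::vector<bool> bitfield are hypothetical stand-ins for the slab's real bitfield and wave-level logic.

#include <cstdint>
#include <optional>
#include <vector>

// Hypothetical single-threaded sketch of the search-start change: while nothing
// has been freed, the first `reserved` bits of the bitfield are all set, so
// scanning from `reserved` finds a free bit immediately instead of relying on a
// random starting point.
std::optional<uint32_t> find_free_bit(const std::vector<bool> &bitfield,
                                      uint32_t reserved) {
  const uint32_t size = static_cast<uint32_t>(bitfield.size());
  for (uint32_t probe = 0; probe < size; ++probe) {
    uint32_t index = (reserved + probe) % size; // wraps once frees create gaps
    if (!bitfield[index])
      return index;
  }
  return std::nullopt; // every chunk in the slab is taken
}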

File tree

1 file changed: +34, -36 lines


libc/src/__support/GPU/allocator.cpp

Lines changed: 34 additions & 36 deletions
@@ -266,38 +266,31 @@ struct Slab {
 
   // Randomly walks the bitfield until it finds a free bit. Allocations attempt
   // to put lanes right next to each other for better caching and convergence.
-  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+  void *allocate(uint64_t uniform, uint32_t reserved) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
 
-    // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to each other.
-    void *result = nullptr;
-    uint32_t after = ~0u;
-    uint32_t old_index = 0;
-    for (uint64_t mask = lane_mask; mask;
-         mask = gpu::ballot(lane_mask, !result)) {
-      if (result)
-        continue;
-
-      // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(
-          mask, cpp::countr_zero(uniform & mask),
-          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
-                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+    // Try to find the empty bit in the bitfield to finish the allocation. We
+    // start at the number of allocations as this is guaranteed to be available
+    // until the user starts freeing memory.
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t start = gpu::shuffle(
+        lane_mask, cpp::countr_zero(uniform & lane_mask), reserved);
+    for (;;) {
+      uint64_t lane_mask = gpu::get_lane_mask();
 
       // Each lane tries to claim one bit in a single contiguous mask.
-      uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
+      uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
       uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
 
       // Get the mask of bits destined for the same slot and coalesce it.
       uint32_t leader = impl::get_leader_id(
-          uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+          uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
          gpu::get_lane_id());
-      uint32_t length = cpp::popcount(uniform & mask) -
-                        impl::lane_count(uniform & mask, leader);
+      uint32_t length = cpp::popcount(uniform & lane_mask) -
+                        impl::lane_count(uniform & lane_mask, leader);
       uint32_t bitmask =
           static_cast<uint32_t>(
              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
@@ -307,18 +300,23 @@ struct Slab {
       if (gpu::get_lane_id() == leader)
         before = cpp::AtomicRef(get_bitfield()[slot])
                      .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
-      before = gpu::shuffle(mask, leader, before);
-      if (~before & (1 << bit))
-        result = ptr_from_index(index, chunk_size);
-      else
-        sleep_briefly();
+      before = gpu::shuffle(lane_mask, leader, before);
+      if (~before & (1 << bit)) {
+        cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+        return ptr_from_index(index, chunk_size);
+      }
 
-      after = before | bitmask;
-      old_index = index;
+      // If the previous operation found an empty bit we move there, otherwise
+      // we generate new random index to start at.
+      uint32_t after = before | bitmask;
+      start = gpu::shuffle(
+          gpu::get_lane_mask(),
+          cpp::countr_zero(uniform & gpu::get_lane_mask()),
+          ~after ? __builtin_align_down(index, BITS_IN_WORD) +
+                       cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+      sleep_briefly();
     }
-
-    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-    return result;
   }
 
   // Deallocates memory by resetting its corresponding bit in the bitfield.
@@ -507,7 +505,8 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S
 
 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
+                       uint32_t &reserved) {
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
@@ -520,7 +519,6 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
     if (!offset ||
        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
      uint64_t lane_mask = gpu::get_lane_mask();
-      uint32_t reserved = 0;
 
      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                         reserved, chunk_size, index);
@@ -580,12 +578,12 @@ void *allocate(uint64_t size) {
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
-  Slab *slab = find_slab(chunk_size, uniform);
-  if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
+  uint32_t reserved = 0;
+  Slab *slab = find_slab(chunk_size, uniform, reserved);
+  if (!slab)
     return nullptr;
 
-  uint64_t lane_mask = gpu::get_lane_mask();
-  void *ptr = slab->allocate(lane_mask, uniform);
+  void *ptr = slab->allocate(uniform, reserved);
   return ptr;
 }
 
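For reference on the retry path added to Slab::allocate above: when the fetch_or fails to claim the requested bit, the new code derives the next start position from the bitfield word it just observed, falling back to a random word-aligned position only when that word is full. Below is a simplified host-side C++ sketch of that computation, not the GPU code itself; `next_start` and `rand_word` are hypothetical stand-ins (rand_word for impl::xorshift32(state), std::countr_zero for cpp::countr_zero, and manual rounding for __builtin_align_down).

#include <bit>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = 32;

// Hypothetical host-side sketch of the retry step: `after` is the 32-bit word's
// occupancy including the bits this attempt tried to claim. If the word still
// has a clear bit, resume at that bit; otherwise restart at a random
// word-aligned position.
uint32_t next_start(uint32_t after, uint32_t index, uint32_t rand_word) {
  if (~after != 0u)
    return (index / BITS_IN_WORD) * BITS_IN_WORD +
           static_cast<uint32_t>(std::countr_zero(static_cast<uint32_t>(~after)));
  return (rand_word / BITS_IN_WORD) * BITS_IN_WORD;
}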