
Commit 9975dfd

[libc] Small performance improvements to GPU allocator
Summary: This slightly increases performance in a few places. First, we optimistically assume that the cached slab has ample space, which lets us avoid the atomic load on the highly contended counter in the case where it is likely to succeed. Second, we no longer call `match_any` twice, as we can calculate the uniform mask for the slabs at the moment we return them. Third, we always choose a random index on a 32-bit boundary. This means that in the fast case we fulfill the allocation with a single `fetch_or`, and in the slow case we quickly move to the next free bit. This nets around a 7.75% improvement for the fast-path case.
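To illustrate the third point, here is a minimal, self-contained sketch of the word-aligned bitmap idea; it is not the allocator's actual code, and the helper name claim_bits and the bitfield parameter are hypothetical:

#include <atomic>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = 32;

// Hypothetical helper: try to claim `count` chunk bits in the 32-bit word at
// `word_index`. Because the start is word-aligned, one fetch_or both publishes
// the claim and returns the previous contents, so the caller learns which bits
// it now owns and where the remaining free bits are without further atomics.
uint32_t claim_bits(std::atomic<uint32_t> *bitfield, uint32_t word_index,
                    uint32_t count) {
  uint32_t wanted = count >= BITS_IN_WORD ? ~0u : (1u << count) - 1u;
  uint32_t before = bitfield[word_index].fetch_or(wanted);
  return wanted & ~before; // bits that were free and are now ours
}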
1 parent 166493d commit 9975dfd

File tree

2 files changed: +41 −12 lines

libc/src/__support/GPU/allocator.cpp

Lines changed: 16 additions & 12 deletions
@@ -145,7 +145,7 @@ static inline constexpr bool is_pow2(uint64_t x) {
 // Where this chunk size should start looking in the global array. Small
 // allocations are much more likely than large ones, so we give them the most
 // space. We use a cubic easing function normalized on the possible chunks.
-static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
   constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
   uint64_t norm =
       (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
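The comment above describes the cubic easing that get_start_index applies; the hunk only shows the first few lines of the function. A standalone sketch of such a mapping, with hypothetical ARRAY_SIZE and MAX_CHUNK_ID constants and not the exact formula in allocator.cpp, could be:

#include <cstdint>

constexpr uint32_t ARRAY_SIZE = 96;   // hypothetical slot-table size
constexpr uint32_t MAX_CHUNK_ID = 16; // hypothetical largest chunk id

// Cubic ease in 16.16 fixed point: chunk id 0 (the smallest, most common
// allocation size) starts at slot 0 and the curve rises steeply there, so
// small ids are spread far apart and get most of the table, while the largest
// ids bunch up near the end of the array.
constexpr uint32_t eased_start(uint32_t chunk_id) {
  uint64_t norm =
      (1 << 16) - ((static_cast<uint64_t>(chunk_id) << 16) / MAX_CHUNK_ID);
  uint64_t eased = (norm * norm * norm) >> 32; // still in [0, 1 << 16]
  return static_cast<uint32_t>(ARRAY_SIZE - 1 -
                               (((ARRAY_SIZE - 1) * eased) >> 16));
}

static_assert(eased_start(0) == 0, "smallest chunks start at the front");
static_assert(eased_start(MAX_CHUNK_ID) == ARRAY_SIZE - 1, "largest at the back");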
@@ -270,10 +270,10 @@ struct Slab {
        continue;

      // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
-                                    ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
-                                                 cpp::countr_zero(~after)
-                                           : impl::xorshift32(state));
+      uint32_t start = gpu::shuffle(
+          mask, cpp::countr_zero(uniform & mask),
+          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));

      uint32_t id = impl::lane_count(uniform & mask);
      uint32_t index = (start + id) % usable_bits(chunk_size);
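__builtin_align_down is a Clang builtin that rounds its first argument down to a multiple of the (power-of-two) second argument, so the randomly chosen start bit now always falls on a 32-bit word boundary:

#include <cstdint>

// With BITS_IN_WORD == 32, equivalent to value & ~(BITS_IN_WORD - 1):
uint32_t a = __builtin_align_down(37u, 32); // 32
uint32_t b = __builtin_align_down(64u, 32); // 64
uint32_t c = __builtin_align_down(95u, 32); // 64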
@@ -475,7 +475,7 @@ static GuardPtr slots[ARRAY_SIZE] = {};
 // Keep a cache of the last successful slot for each chunk size. Initialize it
 // to an even spread of the total size. Must be updated if the chunking scheme
 // changes.
-#define S(X) (impl::start_index(X))
+#define S(X) (impl::get_start_index(X))
 static cpp::Atomic<uint32_t> indices[] = {
     S(16),  S(32),  S(48),  S(64),  S(96),  S(112), S(128),
     S(192), S(224), S(256), S(384), S(448), S(512), S(768),
@@ -487,18 +487,18 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S

 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
-  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);

   for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
     uint32_t index =
         !offset ? start
-                : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
+                : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;

-    if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+    if (!offset ||
+        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
       uint64_t reserved = 0;

@@ -521,13 +521,17 @@ static Slab *find_slab(uint32_t chunk_size) {
           slab->get_chunk_size() == chunk_size) {
         if (index != start)
           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+        uniform = uniform & gpu::get_lane_mask();
         return slab;
       } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                           slab->get_chunk_size() != chunk_size)) {
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
       } else if (!slab && reserved == SENTINEL) {
+        uniform = uniform & gpu::get_lane_mask();
         return nullptr;
+      } else {
+        sleep_briefly();
       }
     }
   }
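The `!offset ||` short-circuit added to find_slab above implements the first optimization from the summary: on the initial attempt at the cached slot, the contended use_count() load is skipped entirely and the lock is tried optimistically; the relaxed atomic load is only paid on later probes. A simplified model of that pattern, with hypothetical names rather than the allocator's types:

#include <atomic>
#include <cstdint>

// Hypothetical model: probe slots, but trust the cached slot (attempt 0) and
// avoid reading its heavily contended counter unless we have to move on.
bool should_try_slot(std::atomic<uint32_t> &use_count, uint32_t capacity,
                     uint32_t attempt) {
  return attempt == 0 ||
         use_count.load(std::memory_order_relaxed) < capacity;
}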
@@ -554,12 +558,12 @@ void *allocate(uint64_t size) {

   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
-  Slab *slab = find_slab(chunk_size);
+  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+  Slab *slab = find_slab(chunk_size, uniform);
   if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
     return nullptr;

   uint64_t lane_mask = gpu::get_lane_mask();
-  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
   void *ptr = slab->allocate(lane_mask, uniform);
   return ptr;
 }
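With `uniform` now computed once in allocate and passed into find_slab by reference, the second match_any call disappears; each return path only intersects the original mask with the current lane mask so that lanes which diverged along the way are dropped. A rough scalar model of match_any, illustrative only since the real gpu::match_any is a single cross-lane intrinsic per wavefront:

#include <cstdint>

// Scalar model: for lane `self`, return the subset of `lane_mask` whose value
// equals values[self].
uint64_t match_any_model(uint64_t lane_mask, const uint32_t *values,
                         uint32_t self, uint32_t num_lanes) {
  uint64_t match = 0;
  for (uint32_t lane = 0; lane < num_lanes; ++lane)
    if (((lane_mask >> lane) & 1) && values[lane] == values[self])
      match |= 1ull << lane;
  return match;
}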

libc/test/integration/src/stdlib/gpu/malloc_stress.cpp

Lines changed: 25 additions & 0 deletions
@@ -14,6 +14,20 @@

 using namespace LIBC_NAMESPACE;

+static inline uint32_t entropy() {
+  return (static_cast<uint32_t>(gpu::processor_clock()) ^
+          (gpu::get_thread_id_x() * 0x632be59b) ^
+          (gpu::get_block_id_x() * 0x85157af5)) *
+         0x9e3779bb;
+}
+
+static inline uint32_t xorshift32(uint32_t &state) {
+  state ^= state << 13;
+  state ^= state >> 17;
+  state ^= state << 5;
+  return state * 0x9e3779bb;
+}
+
 static inline void use(uint8_t *ptr, uint32_t size) {
   EXPECT_NE(ptr, nullptr);
   for (int i = 0; i < size; ++i)
@@ -34,5 +48,16 @@ TEST_MAIN(int, char **, char **) {

   for (int i = 0; i < 256; ++i)
     free(ptrs[i]);
+
+  uint32_t state = entropy();
+  for (int i = 0; i < 1024; ++i) {
+    if (xorshift32(state) % 2) {
+      uint64_t size = xorshift32(state) % 256 + 1;
+      void *ptr = malloc(size);
+      ASSERT_TRUE(ptr);
+      ASSERT_TRUE(__builtin_is_aligned(ptr, 16));
+      free(ptr);
+    }
+  }
   return 0;
 }
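The new loop gives every thread its own xorshift32 stream, seeded from the processor clock and the thread/block ids, and then performs roughly 512 random malloc/free pairs of 1 to 256 bytes, asserting 16-byte alignment each time. The 0x9e3779bb multiplier is the 32-bit golden-ratio constant used to scramble the raw xorshift state. A small host-side harness (hypothetical, not part of the commit) to eyeball the size distribution the test exercises:

#include <cstdint>
#include <cstdio>

static uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bb;
}

int main() {
  uint32_t state = 0x1234abcd; // any nonzero seed
  uint32_t buckets[8] = {};
  for (int i = 0; i < 1 << 20; ++i)
    ++buckets[xorshift32(state) % 256 / 32]; // same distribution as the test
  for (int i = 0; i < 8; ++i)
    printf("sizes %3d-%3d: %u\n", i * 32 + 1, i * 32 + 32, buckets[i]);
  return 0;
}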
