Commit a764900

[libc] Rework match any use in hot allocate bitfield loop
Summary: We previously used `match_any` as a shortcut to figure out which threads were destined for which slots. This lowers to a for-loop which, even if it often executes only once, still causes some slowdown, especially when divergent. Instead we use a single ballot call and derive the grouping from it. The ballot tells us which lanes are the first in a block, either the lane holding the starting index or a lane that begins a new 32-bit word. We then use some bit magic to find, for each lane ID, its closest leader. For the length we simply use the leader's count of the remaining bits to be written. This removes the `match_any` and the shuffle that broadcast the bitmask, which improves the minimum number of cycles this takes by about 5%.
1 parent 9975dfd commit a764900
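The leader search described in the summary amounts to finding, for each lane, the highest set ballot bit at or below its own lane ID. Below is a minimal host-side sketch of that bit trick using C++20 <bit>; the names here are illustrative stand-ins, and only `impl::get_leader_id` in the diff further down is the real helper.

// Host-side sketch of the leader lookup: given a ballot of "leader" lanes,
// find the closest leader at or below lane `id`. This mirrors the idea behind
// impl::get_leader_id but uses std::countl_zero so it can run on a CPU; it is
// an illustration, not the libc implementation.
#include <bit>
#include <cstdint>
#include <cstdio>

static uint32_t closest_leader(uint64_t ballot, uint32_t id) {
  // Clear every ballot bit strictly above `id`; guard the shift for id == 63.
  uint64_t above = id < 63 ? ~0ull << (id + 1) : 0;
  // The highest remaining set bit is the leader for this lane.
  return 63 - std::countl_zero(ballot & ~above);
}

int main() {
  // Example ballot: lanes 0, 9, and 40 start a new group or a new 32-bit word.
  uint64_t ballot = (1ull << 0) | (1ull << 9) | (1ull << 40);
  for (uint32_t id : {0u, 5u, 9u, 31u, 40u, 63u})
    std::printf("lane %2u -> leader %2u\n", id, closest_leader(ballot, id));
  return 0;
}

With this ballot, lanes 0–8 resolve to leader 0, lanes 9–39 to leader 9, and lanes 40–63 to leader 40, which is exactly the grouping the ballot encodes.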

2 files changed: +32 -14 lines

libc/src/__support/GPU/allocator.cpp

Lines changed: 27 additions & 12 deletions
@@ -16,6 +16,7 @@
 
 #include "allocator.h"
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
@@ -31,6 +32,7 @@ constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
 constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
 constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+constexpr static uint32_t BITS_IN_DWORD = sizeof(uint64_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
@@ -70,8 +72,8 @@ static void rpc_free(void *ptr) {
 
 // Convert a potentially disjoint bitmask into an increasing integer per-lane
 // for use with indexing between gpu lanes.
-static inline uint32_t lane_count(uint64_t lane_mask) {
-  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
+static inline uint32_t lane_count(uint64_t lane_mask, uint32_t id) {
+  return cpp::popcount(lane_mask & ((uint64_t(1) << id) - 1));
 }
 
 // Obtain an initial value to seed a random number generator. We use the rounded
@@ -133,7 +135,8 @@ static inline constexpr T round_up(const T x) {
 void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
   uint64_t mask = gpu::get_lane_mask();
   uint32_t workers = cpp::popcount(uniform);
-  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+  for (uint32_t i = impl::lane_count(mask & uniform, gpu::get_lane_id()); i < n;
+       i += workers)
     s[i] = c;
 }
 
@@ -154,6 +157,12 @@ static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
   return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
 }
 
+// Returns the id of the lane below this one that acts as its leader.
+static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
+  uint64_t mask = id < BITS_IN_DWORD ? ~0ull << (id + 1) : 0;
+  return BITS_IN_DWORD - cpp::countl_zero(ballot & ~mask) - 1;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -275,23 +284,28 @@ struct Slab {
           ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
                  : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
 
-      uint32_t id = impl::lane_count(uniform & mask);
+      // Each lane tries to claim one bit in a single contiguous mask.
+      uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
       uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
 
       // Get the mask of bits destined for the same slot and coalesce it.
-      uint64_t match = uniform & gpu::match_any(mask, slot);
-      uint32_t length = cpp::popcount(match);
-      uint32_t bitmask = gpu::shuffle(
-          mask, cpp::countr_zero(match),
-          static_cast<uint32_t>((uint64_t(1) << length) - 1) << bit);
+      uint32_t leader = impl::get_leader_id(
+          uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+          gpu::get_lane_id());
+      uint32_t length = cpp::popcount(uniform & mask) -
+                        impl::lane_count(uniform & mask, leader);
+      uint32_t bitmask =
+          static_cast<uint32_t>(
+              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
+          << bit;
 
       uint32_t before = 0;
-      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+      if (gpu::get_lane_id() == leader)
         before = cpp::AtomicRef(get_bitfield()[slot])
                      .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
-      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+      before = gpu::shuffle(mask, leader, before);
       if (~before & (1 << bit))
         result = ptr_from_index(index, chunk_size);
       else
@@ -446,7 +460,8 @@ struct GuardPtr {
     }
 
     if (count != cpp::numeric_limits<uint64_t>::max())
-      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+      count = count - cpp::popcount(uniform) +
+              impl::lane_count(uniform, gpu::get_lane_id()) + 1;
 
     return result;
   }
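
To see how the leader, length, and bitmask interact, here is a host-side walk-through of the new claiming scheme under simplifying assumptions: a full 64-lane group, no wrap-around of `index`, and standard C++20 in place of the `gpu::`/`cpp::` utilities. The constants and variable names are local to this sketch; only the shapes of the computations follow the diff above.

// Sketch only: mimics the per-lane leader/length/bitmask computation from
// Slab::allocate so it can be compiled and inspected on a CPU.
#include <bit>
#include <cstdint>
#include <cstdio>

constexpr uint32_t BITS_IN_WORD = 32;

int main() {
  // All 64 lanes participate and the group leader picked bit 27 as the start.
  uint64_t participants = ~0ull; // stands in for `uniform & mask`
  uint32_t start = 27;

  // Ballot of "leaders": the first participant plus every lane whose index
  // lands on a fresh 32-bit word of the bitfield.
  uint64_t ballot = 0;
  for (uint32_t lane = 0; lane < 64; ++lane) {
    uint32_t id = std::popcount(participants & ((uint64_t(1) << lane) - 1));
    if (id == 0 || (start + id) % BITS_IN_WORD == 0)
      ballot |= uint64_t(1) << lane;
  }

  for (uint32_t lane = 0; lane < 64; ++lane) {
    uint32_t id = std::popcount(participants & ((uint64_t(1) << lane) - 1));
    uint32_t index = start + id;
    uint32_t bit = index % BITS_IN_WORD;

    // Closest leader at or below this lane, then the count of participants
    // from that leader to the end of the group.
    uint64_t above = lane < 63 ? ~0ull << (lane + 1) : 0;
    uint32_t leader = 63 - std::countl_zero(ballot & ~above);
    uint32_t length =
        std::popcount(participants) -
        std::popcount(participants & ((uint64_t(1) << leader) - 1));

    // Only the leader would issue the fetch_or; bits past the current word
    // fall off the top of the 32-bit mask, so each word is claimed once.
    uint32_t bitmask =
        static_cast<uint32_t>(
            (uint64_t(1) << (length < BITS_IN_WORD ? length : BITS_IN_WORD)) -
            1)
        << bit;
    if (lane == leader)
      std::printf("leader %2u claims word %u with mask 0x%08x\n", lane,
                  index / BITS_IN_WORD, bitmask);
  }
  return 0;
}

With these inputs the three leaders (lanes 0, 5, and 37) claim bits 27–31 of word 0, all of word 1, and bits 0–26 of word 2 respectively, one atomic per word, which matches the coverage the old `match_any` path produced.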

libc/test/integration/src/stdlib/gpu/malloc_stress.cpp

Lines changed: 5 additions & 2 deletions
@@ -52,8 +52,11 @@ TEST_MAIN(int, char **, char **) {
   uint32_t state = entropy();
   for (int i = 0; i < 1024; ++i) {
     if (xorshift32(state) % 2) {
-      uint64_t size = xorshift32(state) % 256 + 1;
-      void *ptr = malloc(size);
+      uint64_t size = xorshift32(state) % 256 + 16;
+      uint64_t *ptr = reinterpret_cast<uint64_t *>(malloc(size));
+      *ptr = gpu::get_thread_id();
+
+      EXPECT_EQ(*ptr, gpu::get_thread_id());
       ASSERT_TRUE(ptr);
       ASSERT_TRUE(__builtin_is_aligned(ptr, 16));
       free(ptr);
