
Commit b685f72

Cleanup and comments
1 parent 22f677c commit b685f72


1 file changed: +26 -24 lines changed

libc/src/__support/GPU/allocator.cpp

Lines changed: 26 additions & 24 deletions
@@ -31,6 +31,7 @@ constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
 constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
+constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
 // A sentinel used to indicate an invalid but non-null pointer value.
 constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
@@ -63,8 +64,8 @@ static void rpc_free(void *ptr) {
   port.close();
 }
 
-// Convert a potentially disjoint bitmask into an increasing integer for use
-// with indexing between gpu lanes.
+// Convert a potentially disjoint bitmask into an increasing integer per-lane
+// for use with indexing between gpu lanes.
 static inline uint32_t lane_count(uint64_t lane_mask) {
   return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
 }
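For illustration, a host-side sketch of what lane_count computes. The explicit lane_id parameter and __builtin_popcountll stand in for gpu::get_lane_id() and cpp::popcount, assumptions made only so the snippet runs on a CPU:

    #include <cstdint>
    #include <cstdio>

    // Count how many active lanes sit below `lane_id` in `lane_mask`, giving
    // each active lane a dense, increasing index even when the mask has gaps.
    static uint32_t lane_count(uint64_t lane_mask, uint32_t lane_id) {
      return __builtin_popcountll(lane_mask & ((uint64_t(1) << lane_id) - 1));
    }

    int main() {
      uint64_t mask = 0b101100; // lanes 2, 3, and 5 are active
      std::printf("%u %u %u\n", lane_count(mask, 2), lane_count(mask, 3),
                  lane_count(mask, 5)); // prints "0 1 2"
    }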
@@ -78,7 +79,7 @@ static inline uint32_t entropy() {
          0x9e3779bb;
 }
 
-// Generate a random number and update the state using the xorshift*32 PRNG.
+// Generate a random number and update the state using the xorshift32* PRNG.
 static inline uint32_t xorshift32(uint32_t &state) {
   state ^= state << 13;
   state ^= state >> 17;
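For reference, a self-contained sketch of a xorshift32-style generator with the multiplicative "star" output step the corrected comment refers to. The third shift is the classic textbook value and the multiplier is an assumed constant; this hunk does not show the constants allocator.cpp actually uses:

    #include <cstdint>

    // xorshift32 state update followed by a multiplicative output scramble
    // (the "star" step). Shift amounts and multiplier are assumed values.
    uint32_t xorshift32_star(uint32_t &state) {
      state ^= state << 13;
      state ^= state >> 17;
      state ^= state << 5;
      return state * 0x9e3779bbu;
    }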
@@ -109,12 +110,12 @@ static inline uint32_t get_chunk_size(uint32_t x) {
   uint32_t s3 = 0b1000 << (pow2 - 3);
 
   if (s0 > y)
-    return (s0 + 15) & ~15;
+    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
   if (s1 > y)
-    return (s1 + 15) & ~15;
+    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
   if (s2 > y)
-    return (s2 + 15) & ~15;
-  return (s3 + 15) & ~15;
+    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
 }
 
 // Rounds to the nearest power of two.
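Replacing the literal 15 with MIN_ALIGNMENT (MIN_SIZE - 1) keeps the same round-up-to-16 behavior. A minimal sketch of the bit trick, using the MIN_SIZE = 16 definition added in the first hunk:

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t MIN_SIZE = 16;
    constexpr uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

    // Rounds `x` up to the next multiple of MIN_SIZE (a power of two).
    constexpr uint32_t round_to_min_size(uint32_t x) {
      return (x + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
    }

    int main() {
      std::printf("%u %u %u\n", round_to_min_size(1), round_to_min_size(16),
                  round_to_min_size(17)); // prints "16 16 32"
    }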
@@ -126,7 +127,7 @@ static inline constexpr T round_up(const T x) {
 
 } // namespace impl
 
-/// A slab allocator used to hand out indentically sized slabs of memory.
+/// A slab allocator used to hand out identically sized slabs of memory.
 /// Allocation is done through random walks of a bitfield until a free bit is
 /// encountered. This reduces contention and is highly parallel on a GPU.
 ///
@@ -158,39 +159,39 @@ struct Slab {
     __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
   }
 
-  // Get the number of chunks that can theoretically fit inside this array.
-  static uint32_t num_chunks(uint32_t chunk_size) {
+  // Get the number of chunks that can theoretically fit inside this slab.
+  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
     return SLAB_SIZE / chunk_size;
   }
 
   // Get the number of bytes needed to contain the bitfield bits.
-  static uint32_t bitfield_bytes(uint32_t chunk_size) {
+  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
     return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
   }
 
   // The actual amount of memory available excluding the bitfield and metadata.
-  static uint32_t available_bytes(uint32_t chunk_size) {
+  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
     return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
   }
 
   // The number of chunks that can be stored in this slab.
-  static uint32_t available_chunks(uint32_t chunk_size) {
+  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
     return available_bytes(chunk_size) / chunk_size;
   }
 
   // The length in bits of the bitfield.
-  static uint32_t usable_bits(uint32_t chunk_size) {
+  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
     return available_bytes(chunk_size) / chunk_size;
   }
 
   // Get the location in the memory where we will store the chunk size.
   uint32_t get_chunk_size() const {
-    return *reinterpret_cast<const uint32_t *>(memory);
+    return reinterpret_cast<const Header *>(memory)->chunk_size;
   }
 
   // Get the location in the memory where we will store the global index.
   uint32_t get_global_index() const {
-    return *reinterpret_cast<const uint32_t *>(memory + sizeof(uint32_t));
+    return reinterpret_cast<const Header *>(memory)->global_index;
   }
 
   // Get a pointer to where the bitfield is located in the memory.
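The newly constexpr helpers all derive from the same slab layout: a Header, then the bitfield, then the chunk storage. The sketch below reproduces that arithmetic on the host; SLAB_SIZE is assumed to be the 2MiB implied by SLAB_ALIGNMENT and the Header is a hypothetical two-field stand-in, so the printed numbers are illustrative only:

    #include <cstdint>
    #include <cstdio>

    // Assumed values for illustration; the real constants live in allocator.cpp.
    constexpr uint64_t SLAB_SIZE = 2 * 1024 * 1024;
    constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

    // Hypothetical stand-in for the slab Header; the real struct holds at
    // least the chunk_size and global_index fields read in the hunk above.
    struct Header {
      uint32_t chunk_size;
      uint32_t global_index;
    };

    constexpr uint32_t num_chunks(uint32_t chunk_size) {
      return SLAB_SIZE / chunk_size;
    }
    constexpr uint32_t bitfield_bytes(uint32_t chunk_size) {
      return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
    }
    constexpr uint32_t available_bytes(uint32_t chunk_size) {
      return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
    }
    constexpr uint32_t available_chunks(uint32_t chunk_size) {
      return available_bytes(chunk_size) / chunk_size;
    }

    int main() {
      // For 16-byte chunks: 131072 raw chunks, a 32 KiB bitfield, and slightly
      // fewer usable chunks once the bitfield and header are subtracted.
      std::printf("chunks=%u bitfield=%u usable=%u\n", num_chunks(16),
                  bitfield_bytes(16), available_chunks(16));
    }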
@@ -200,8 +201,8 @@ struct Slab {
 
   // Get a pointer to where the actual memory to be allocated lives.
   uint8_t *get_memory(uint32_t chunk_size) {
-    return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
-           sizeof(Header);
+    return reinterpret_cast<uint8_t *>(get_bitfield()) +
+           bitfield_bytes(chunk_size);
   }
 
   // Get a pointer to the actual memory given an index into the bitfield.
@@ -221,11 +222,12 @@ struct Slab {
   void *allocate(uint64_t lane_mask, uint64_t uniform) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
-    void *result = nullptr;
+
     // The uniform mask represents which lanes contain a uniform target pointer.
     // We attempt to place these next to each other.
     // TODO: We should coalesce these bits and use the result of `fetch_or` to
     //       search for free bits in parallel.
+    void *result = nullptr;
     for (uint64_t mask = lane_mask; mask;
          mask = gpu::ballot(lane_mask, !result)) {
       uint32_t id = impl::lane_count(uniform & mask);
@@ -235,7 +237,7 @@ struct Slab {
 
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
-      if (mask & (uint64_t(1) << gpu::get_lane_id())) {
+      if (!result) {
         uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
         if (~before & (1 << bit))
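For intuition, a single-threaded CPU stand-in for the claim step guarded by the new `if (!result)`: fetch_or sets the candidate bit, and the returned previous word tells the caller whether this thread actually flipped it. std::atomic replaces cpp::AtomicRef here purely so the snippet is self-contained:

    #include <atomic>
    #include <cstdint>

    // Try to claim bit `index` in a bitfield of 32-bit words. Returns true
    // only if this call flipped the bit from 0 to 1, i.e. we own the chunk.
    bool claim_chunk(std::atomic<uint32_t> *bitfield, uint32_t index) {
      uint32_t slot = index / 32;
      uint32_t bit = index % 32;
      uint32_t before =
          bitfield[slot].fetch_or(1u << bit, std::memory_order_relaxed);
      return (~before & (1u << bit)) != 0;
    }

    int main() {
      std::atomic<uint32_t> bits[4] = {0, 0, 0, 0};
      return claim_chunk(bits, 37) ? 0 : 1; // first claim of bit 37 succeeds
    }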
@@ -274,7 +276,7 @@ template <typename T> struct GuardPtr {
   static constexpr uint64_t INVALID = uint64_t(1) << 63;
 
   // If a read preempts an unlock call we indicate this so the following
-  // unlock call can swap out the helped bit and maintain exlusive ownership.
+  // unlock call can swap out the helped bit and maintain exclusive ownership.
   static constexpr uint64_t HELPED = uint64_t(1) << 62;
 
   // Resets the reference counter, cannot be reset to zero safely.
@@ -293,8 +295,8 @@ template <typename T> struct GuardPtr {
   // called following a valid acquire call.
   bool release(uint32_t n) {
     // If this thread caused the counter to reach zero we try to invalidate it
-    // and obtain exclusive rights to descontruct it. If the CAS failed either
-    // another thread resurrced the counter and we quit, or a parallel read
+    // and obtain exclusive rights to deconstruct it. If the CAS failed either
+    // another thread resurrected the counter and we quit, or a parallel read
     // helped us invalidating it. For the latter, claim that flag and return.
     if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
       uint64_t expected = 0;
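A hypothetical, simplified sketch of the release path this comment describes, assuming a 64-bit counter whose top bits are the INVALID and HELPED flags defined earlier; the flag-claiming detail is an inference from the comment, not the verbatim GuardPtr logic:

    #include <atomic>
    #include <cstdint>

    constexpr uint64_t INVALID = uint64_t(1) << 63;
    constexpr uint64_t HELPED = uint64_t(1) << 62;

    // Drop `n` references. Returns true if the caller now holds exclusive
    // rights to tear the guarded object down. (Illustrative sketch only.)
    bool release(std::atomic<uint64_t> &counter, uint32_t n) {
      if (counter.fetch_sub(n, std::memory_order_relaxed) == n) {
        // We brought the count to zero; try to mark it invalid so nobody
        // can resurrect it.
        uint64_t expected = 0;
        if (counter.compare_exchange_strong(expected, INVALID,
                                            std::memory_order_relaxed))
          return true;
        // A parallel read already helped by flagging it; claim that flag.
        if (expected & HELPED) {
          counter.fetch_and(~HELPED, std::memory_order_relaxed);
          return true;
        }
      }
      return false;
    }

    int main() {
      std::atomic<uint64_t> counter(1);
      return release(counter, 1) ? 0 : 1; // sole owner gets destruction rights
    }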
@@ -497,7 +499,7 @@ void deallocate(void *ptr) {
   if (!ptr)
     return;
 
-  // All non-slab allocations will be alinged on a 2MiB boundary.
+  // All non-slab allocations will be aligned on a 2MiB boundary.
   if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
     return impl::rpc_free(ptr);
 
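The check works because slab-backed pointers always point past the slab's header and bitfield, so they never land exactly on a slab boundary, while large allocations from the RPC path are boundary-aligned. A minimal sketch, assuming the 2MiB slab size implied by SLAB_ALIGNMENT:

    #include <cstdint>

    constexpr uintptr_t SLAB_SIZE = 2 * 1024 * 1024; // assumed 2MiB slabs
    constexpr uintptr_t SLAB_ALIGNMENT = SLAB_SIZE - 1;

    // True if `ptr` lies exactly on a slab boundary, i.e. it came from the
    // non-slab (RPC) allocation path rather than from inside a slab.
    bool is_non_slab_allocation(const void *ptr) {
      return (reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0;
    }

    int main() {
      return is_non_slab_allocation(reinterpret_cast<void *>(SLAB_SIZE)) ? 0 : 1;
    }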

0 commit comments
