Commit 0d5f5be (1 parent: 8122d60)
1 file changed: libc/src/__support/GPU/allocator.cpp (+22 additions, -18 deletions)

@@ -137,11 +137,18 @@ static inline constexpr T round_up(const T x) {
 /// alignment and to indicate that if the pointer is not aligned by 2MiB it
 /// belongs to a slab rather than the global allocator.
 struct Slab {
+  // Header metadata for the slab, aligned to the minimum alignment.
+  struct alignas(MIN_SIZE) Header {
+    uint32_t chunk_size;
+    uint32_t global_index;
+  };
+
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
-    *reinterpret_cast<uint32_t *>(&memory[0]) = chunk_size;
-    *reinterpret_cast<uint32_t *>(&memory[sizeof(uint32_t)]) = global_index;
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
 
     // This memset is expensive and likely not necessary for the current 'kfd'
     // driver. Until zeroed pages are exposed by the API we must be careful.
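
The new Header simply gives names to the two words that were previously written through raw casts at offsets 0 and sizeof(uint32_t). A small standalone sketch of the same idea, using an assumed MIN_SIZE of 16 (the real constant is defined elsewhere in allocator.cpp and is not shown in this diff):

#include <cstdint>

// Assumed value for illustration; the real MIN_SIZE lives elsewhere in
// allocator.cpp and is not part of this diff.
constexpr uint32_t MIN_SIZE = 16;

// Mirrors the new Slab::Header: two 32-bit fields, with alignas(MIN_SIZE)
// padding the struct out to the slab's minimum alignment so whatever follows
// the header in the slab stays aligned.
struct alignas(MIN_SIZE) Header {
  uint32_t chunk_size;   // offset 0, matching the old &memory[0] store
  uint32_t global_index; // offset 4, matching the old &memory[sizeof(uint32_t)] store
};

static_assert(sizeof(Header) == MIN_SIZE,
              "padding rounds the header up to the alignment");
static_assert(alignof(Header) == MIN_SIZE,
              "header carries the slab's minimum alignment");

As long as MIN_SIZE is at least 8 (so both fields fit), sizeof(Header) comes out equal to MIN_SIZE, which is presumably what lets the offset computations below write sizeof(Header) where they previously wrote MIN_SIZE.
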
@@ -155,13 +162,12 @@ struct Slab {
 
   // Get the number of bytes needed to contain the bitfield bits.
   static uint32_t bitfield_bytes(uint32_t chunk_size) {
-    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) *
-           sizeof(uint32_t);
+    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
   }
 
   // The actual amount of memory available excluding the bitfield and metadata.
   static uint32_t available_bytes(uint32_t chunk_size) {
-    return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - MIN_SIZE;
+    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
   }
 
   // The number of chunks that can be stored in this slab.
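
The new `* 8` reads oddly next to the old `* sizeof(uint32_t)`, but taken together with available_bytes() it is the same arithmetic: the old code doubled bitfield_bytes() inside available_bytes(), the new code folds that factor of two into bitfield_bytes() itself (8 bytes per 32-bit word instead of 4). A hedged check with made-up constants (SLAB_SIZE, BITS_IN_WORD, MIN_SIZE, and the num_chunks() stand-in are illustrative, not taken from the file):

#include <cstdint>

// Illustrative constants and a stand-in num_chunks(); the real definitions
// live elsewhere in allocator.cpp and are not part of this diff.
constexpr uint32_t SLAB_SIZE = 2u * 1024 * 1024; // assumed 2 MiB slab
constexpr uint32_t BITS_IN_WORD = 32;
constexpr uint32_t MIN_SIZE = 16;     // assumed minimum alignment
constexpr uint32_t HEADER_BYTES = 16; // assumed sizeof(Slab::Header)

constexpr uint32_t num_chunks(uint32_t chunk_size) {
  return SLAB_SIZE / chunk_size; // stand-in; the identity checked below only
                                 // needs both versions to share one definition
}

// Old shape: 4 bytes per 32-bit bitfield word, doubled inside available_bytes.
constexpr uint32_t old_available_bytes(uint32_t chunk_size) {
  uint32_t words = (num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD;
  return SLAB_SIZE - 2 * (words * sizeof(uint32_t)) - MIN_SIZE;
}

// New shape: the doubling is folded into bitfield_bytes (8 bytes per word),
// and the metadata term is spelled sizeof(Header).
constexpr uint32_t new_available_bytes(uint32_t chunk_size) {
  uint32_t words = (num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD;
  return SLAB_SIZE - words * 8 - HEADER_BYTES;
}

// With sizeof(Header) == MIN_SIZE, the two agree, e.g. for 16-byte chunks.
static_assert(old_available_bytes(16) == new_available_bytes(16),
              "the two formulations of available_bytes agree");
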
@@ -171,7 +177,7 @@ struct Slab {
 
   // The length in bits of the bitfield.
   static uint32_t usable_bits(uint32_t chunk_size) {
-    return ((available_bytes(chunk_size) + chunk_size - 1) / chunk_size);
+    return available_bytes(chunk_size) / chunk_size;
   }
 
   // Get the location in the memory where we will store the chunk size.
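
Switching usable_bits() from a round-up to a plain division means a trailing partial chunk is no longer counted: only chunks that fit entirely inside available_bytes() get a bit in the bitfield. With made-up numbers:

// Made-up numbers, only to show the effect of the rounding change: with 1000
// available bytes and 48-byte chunks, rounding up advertises a 21st chunk
// that cannot fully fit, while rounding down stops at 20 whole chunks.
constexpr unsigned available = 1000, chunk = 48;
static_assert((available + chunk - 1) / chunk == 21, "old: counts a partial chunk");
static_assert(available / chunk == 20, "new: whole chunks only");
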
@@ -186,13 +192,13 @@ struct Slab {
 
   // Get a pointer to where the bitfield is located in the memory.
   uint32_t *get_bitfield() {
-    return reinterpret_cast<uint32_t *>(memory + MIN_SIZE);
+    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
   }
 
   // Get a pointer to where the actual memory to be allocated lives.
   uint8_t *get_memory(uint32_t chunk_size) {
     return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
-           MIN_SIZE;
+           sizeof(Header);
   }
 
   // Get a pointer to the actual memory given an index into the bitfield.
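
Read together, get_bitfield() and get_memory() pin down the slab layout: the header sits at offset 0, the bitfield immediately after it, and the chunk storage after the bitfield. A toy sketch of the same offset arithmetic in plain C++ (the buffer size and helper names are made up):

#include <cstdint>

// Assumed stand-ins; the real MIN_SIZE, Header, and slab size are defined in
// allocator.cpp and only partly visible in this diff.
constexpr uint32_t MIN_SIZE = 16;
struct alignas(MIN_SIZE) Header {
  uint32_t chunk_size;
  uint32_t global_index;
};

// A slab's backing memory, as implied by the accessors above:
//   [ Header ][ bitfield words ][ chunk storage ... ]
alignas(MIN_SIZE) uint8_t memory[4096]; // toy slab, much smaller than a real one

uint32_t *toy_get_bitfield() {
  return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
}

uint8_t *toy_get_memory(uint32_t bitfield_bytes) {
  return memory + sizeof(Header) + bitfield_bytes;
}
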
@@ -207,15 +213,14 @@ struct Slab {
            chunk_size;
   }
 
-  // Randomly walks the bitfield until it finds a free bit in the bitfield.
-  // Allocations attempt to put lanes right next to eachother for better
-  // caching and convergence.
+  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
+  // to put lanes right next to each other for better caching and convergence.
   void *allocate(uint64_t lane_mask, uint64_t uniform) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
     void *result = nullptr;
     // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to eachother in the bitfield.
+    // We attempt to place these next to each other.
     // TODO: We should coalesce these bits and use the result of `fetch_or` to
     // search for free bits in parallel.
     for (uint64_t mask = ~0ull; mask; mask = gpu::ballot(lane_mask, !result)) {
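
The loop above claims a chunk by atomically setting its bit and then looking at the word's previous value to see whether this lane actually won it. Setting aside the GPU-specific pieces (lane masks, gpu::ballot, impl::entropy), the claim pattern can be sketched with std::atomic; the linear scan and the names here are illustrative, not the allocator's randomized walk:

#include <atomic>
#include <cstddef>
#include <cstdint>

// A toy bitfield illustrating the fetch_or claim pattern used in
// Slab::allocate(): set the bit, then inspect the *previous* value to learn
// whether this thread actually won the chunk.
constexpr size_t NUM_WORDS = 4;
std::atomic<uint32_t> bitfield[NUM_WORDS]; // zero-initialized: all chunks free

// Returns the claimed bit index, or -1 if every chunk is taken. A simple
// linear scan stands in for the randomized walk in the real code.
int claim_free_bit() {
  for (size_t slot = 0; slot < NUM_WORDS; ++slot) {
    for (uint32_t bit = 0; bit < 32; ++bit) {
      uint32_t before =
          bitfield[slot].fetch_or(1u << bit, std::memory_order_relaxed);
      if (~before & (1u << bit)) // bit was clear before: this thread owns it
        return static_cast<int>(slot * 32 + bit);
    }
  }
  return -1;
}
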
@@ -229,9 +234,8 @@ struct Slab {
       if (mask & (uint64_t(1) << gpu::get_lane_id())) {
         uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-        if (~before & (1 << bit)) {
+        if (~before & (1 << bit))
           result = ptr_from_index(index, chunk_size);
-        }
       }
     }
 
@@ -319,14 +323,14 @@ template <typename T> struct GuardPtr {
   RefCounter ref{};
 
   // A sentinel value used to claim the pointer slot.
-  static constexpr uint64_t sentinel = cpp::numeric_limits<uint64_t>::max();
+  static constexpr uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
 
   // Should be called be a single lane for each different pointer.
   template <typename... Args>
   T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
     T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(sentinel),
+        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
                                     cpp::MemoryOrder::RELAXED,
                                     cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
@@ -343,7 +347,7 @@ template <typename T> struct GuardPtr {
       return mem;
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(sentinel))
+    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
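
Both GuardPtr hunks are part of the same rename (sentinel -> SENTINEL), but the surrounding protocol is worth spelling out: one lane claims the empty slot by CAS-ing nullptr to the all-ones SENTINEL, constructs the object, and publishes the real pointer, while anyone who observes nullptr under contention or SENTINEL backs off. A stripped-down sketch with std::atomic (refcounting and the RPC-backed construction are omitted; names are not the real code's):

#include <atomic>
#include <cstdint>

// Toy version of the slot-claiming protocol in GuardPtr: the all-ones
// SENTINEL marks "someone is constructing this object right now".
template <typename T> struct ToyGuardPtr {
  static constexpr uint64_t SENTINEL = ~0ull;
  std::atomic<T *> ptr{nullptr};

  // Returns the object if this thread created it or it already exists,
  // nullptr if another thread is mid-construction. T is assumed
  // default-constructible for this sketch.
  T *try_lock() {
    T *expected = ptr.load(std::memory_order_relaxed);
    if (!expected &&
        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
                                    std::memory_order_relaxed)) {
      // We won the race: construct, then publish the real pointer.
      T *mem = new T();
      ptr.store(mem, std::memory_order_release);
      return mem;
    }
    // Slot is contended or still being constructed: back off.
    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
      return nullptr;
    return expected; // already published
  }
};
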
@@ -460,7 +464,7 @@ void *allocate(uint64_t size) {
   if (!size)
     return nullptr;
 
-  // Allocations larger than a single slab go directly to memory.
+  // Allocations requiring a full slab or more go directly to memory.
   if (size >= SLAB_SIZE / 2)
     return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
 
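
The reworded comment now matches the guard it sits on: any request of at least half a slab skips the slab allocator entirely and is rounded up to whole slabs before going to the RPC path. A tiny illustration with an assumed 2 MiB SLAB_SIZE and one common way of writing the round-up (whether impl::round_up matches this formula exactly is not shown in the diff):

#include <cstdint>

// Assumed slab size; the real SLAB_SIZE is defined earlier in allocator.cpp.
constexpr uint64_t SLAB_SIZE = 2ull * 1024 * 1024;

// One common way to round x up to the next multiple of N; used here only to
// illustrate the sizes involved, not as the file's actual implementation.
template <uint64_t N> constexpr uint64_t round_up(uint64_t x) {
  return ((x + N - 1) / N) * N;
}

// A 3 MiB request is >= SLAB_SIZE / 2, so it bypasses the slabs and is padded
// out to two full slabs (4 MiB) before being handed to impl::rpc_allocate.
static_assert(round_up<SLAB_SIZE>(3ull * 1024 * 1024) == 2 * SLAB_SIZE,
              "3 MiB rounds up to two slabs");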