@@ -137,11 +137,18 @@ static inline constexpr T round_up(const T x) {
 /// alignment and to indicate that if the pointer is not aligned by 2MiB it
 /// belongs to a slab rather than the global allocator.
 struct Slab {
+  // Header metadata for the slab, aligned to the minimum alignment.
+  struct alignas(MIN_SIZE) Header {
+    uint32_t chunk_size;
+    uint32_t global_index;
+  };
+
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
-    *reinterpret_cast<uint32_t *>(&memory[0]) = chunk_size;
-    *reinterpret_cast<uint32_t *>(&memory[sizeof(uint32_t)]) = global_index;
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
 
     // This memset is expensive and likely not necessary for the current 'kfd'
     // driver. Until zeroed pages are exposed by the API we must be careful.
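The new Header struct gives the slab's two metadata words named, aligned fields instead of hand-computed byte offsets into `memory`. A minimal stand-alone sketch of the same pattern; MIN_SIZE, the buffer size, and the values written here are placeholders rather than the allocator's real constants:

```cpp
#include <cstdint>
#include <cstdio>

// Placeholder for the allocator's minimum alignment.
constexpr unsigned MIN_SIZE = 16;

struct alignas(MIN_SIZE) Header {
  uint32_t chunk_size;
  uint32_t global_index;
};

int main() {
  // Stand-in for the slab's backing memory; only the header portion is shown.
  alignas(MIN_SIZE) uint8_t memory[64] = {};

  // Write the metadata through named fields rather than raw uint32_t offsets.
  Header *header = reinterpret_cast<Header *>(memory);
  header->chunk_size = 32;
  header->global_index = 7;

  std::printf("chunk_size=%u global_index=%u\n", header->chunk_size,
              header->global_index);
}
```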
@@ -155,13 +162,12 @@ struct Slab {
 
   // Get the number of bytes needed to contain the bitfield bits.
   static uint32_t bitfield_bytes(uint32_t chunk_size) {
-    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) *
-           sizeof(uint32_t);
+    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
   }
 
   // The actual amount of memory available excluding the bitfield and metadata.
   static uint32_t available_bytes(uint32_t chunk_size) {
-    return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - MIN_SIZE;
+    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
   }
 
   // The number of chunks that can be stored in this slab.
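To see what the reworked accounting does, here is a self-contained sketch of the arithmetic. SLAB_SIZE, BITS_IN_WORD, num_chunks, and the header size are not shown in this hunk, so the definitions below are assumptions chosen only to make the numbers concrete:

```cpp
#include <cstdint>
#include <cstdio>

// Assumed values for illustration; the real SLAB_SIZE, BITS_IN_WORD,
// num_chunks, and Header live elsewhere in allocator.cpp.
constexpr uint32_t SLAB_SIZE = 2 * 1024 * 1024;
constexpr uint32_t BITS_IN_WORD = 32;
constexpr uint32_t HEADER_BYTES = 16; // stand-in for sizeof(Slab::Header)

constexpr uint32_t num_chunks(uint32_t chunk_size) {
  return SLAB_SIZE / chunk_size; // assumed definition
}

// Mirrors the patched formulas above.
constexpr uint32_t bitfield_bytes(uint32_t chunk_size) {
  return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
}

constexpr uint32_t available_bytes(uint32_t chunk_size) {
  return SLAB_SIZE - bitfield_bytes(chunk_size) - HEADER_BYTES;
}

int main() {
  const uint32_t sizes[] = {16, 128, 4096};
  for (uint32_t chunk_size : sizes)
    std::printf("chunk=%u bitfield=%uB available=%uB\n", chunk_size,
                bitfield_bytes(chunk_size), available_bytes(chunk_size));
}
```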
@@ -171,7 +177,7 @@ struct Slab {
 
   // The length in bits of the bitfield.
   static uint32_t usable_bits(uint32_t chunk_size) {
-    return ((available_bytes(chunk_size) + chunk_size - 1) / chunk_size);
+    return available_bytes(chunk_size) / chunk_size;
   }
 
   // Get the location in the memory where we will store the chunk size.
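usable_bits switches from rounding up to rounding down: available_bytes already excludes the bitfield and header, so only chunks that fit entirely should be counted. A tiny illustration with made-up numbers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Made-up figures: 1000 usable bytes, 48-byte chunks.
  uint32_t available = 1000, chunk = 48;
  uint32_t ceil_count = (available + chunk - 1) / chunk; // old rounding: 21
  uint32_t floor_count = available / chunk;              // new rounding: 20
  // The 21st chunk would overrun the usable region; rounding down avoids that.
  std::printf("ceil=%u floor=%u\n", ceil_count, floor_count);
}
```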
@@ -186,13 +192,13 @@ struct Slab {
 
   // Get a pointer to where the bitfield is located in the memory.
   uint32_t *get_bitfield() {
-    return reinterpret_cast<uint32_t *>(memory + MIN_SIZE);
+    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
   }
 
   // Get a pointer to where the actual memory to be allocated lives.
   uint8_t *get_memory(uint32_t chunk_size) {
     return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
-           MIN_SIZE;
+           sizeof(Header);
   }
 
   // Get a pointer to the actual memory given an index into the bitfield.
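Both pointer helpers now derive their offsets from sizeof(Header) rather than the looser MIN_SIZE constant, which pins down the slab layout as header, then bitfield, then chunk storage. A toy sketch of that layout; the struct name, buffer size, and bitfield size below are placeholders:

```cpp
#include <cstdint>
#include <cstdio>

// Placeholder header mirroring the one added in this patch.
struct alignas(16) Header {
  uint32_t chunk_size;
  uint32_t global_index;
};

// Toy slab laid out as [ Header | bitfield | chunk storage ]; sizes are
// placeholders, not the real 2 MiB slab.
struct ToySlab {
  alignas(16) uint8_t memory[4096];

  uint32_t *get_bitfield() {
    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
  }
  uint8_t *get_memory(uint32_t bitfield_bytes) {
    return memory + sizeof(Header) + bitfield_bytes;
  }
};

int main() {
  ToySlab slab{};
  std::printf("bitfield offset: %td, chunk offset: %td\n",
              reinterpret_cast<uint8_t *>(slab.get_bitfield()) - slab.memory,
              slab.get_memory(128) - slab.memory);
}
```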
@@ -207,15 +213,14 @@ struct Slab {
            chunk_size;
   }
 
-  // Randomly walks the bitfield until it finds a free bit in the bitfield.
-  // Allocations attempt to put lanes right next to eachother for better
-  // caching and convergence.
+  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
+  // to put lanes right next to each other for better caching and convergence.
   void *allocate(uint64_t lane_mask, uint64_t uniform) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
     void *result = nullptr;
     // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to eachother in the bitfield.
+    // We attempt to place these next to each other.
     // TODO: We should coalesce these bits and use the result of `fetch_or` to
     // search for free bits in parallel.
     for (uint64_t mask = ~0ull; mask; mask = gpu::ballot(lane_mask, !result)) {
@@ -229,9 +234,8 @@ struct Slab {
       if (mask & (uint64_t(1) << gpu::get_lane_id())) {
         uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-        if (~before & (1 << bit)) {
+        if (~before & (1 << bit))
           result = ptr_from_index(index, chunk_size);
-        }
       }
     }
 
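The dropped braces are cosmetic; the claim protocol itself is the interesting part: a lane owns a chunk only if its fetch_or is the call that flipped the bit from 0 to 1. A single-threaded host-side analogue using std::atomic (the real code uses cpp::AtomicRef plus wave-wide ballots):

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

int main() {
  std::atomic<uint32_t> word{0};
  uint32_t bit = 5;

  // First claim: the bit was previously clear, so this caller wins the chunk.
  uint32_t before = word.fetch_or(1u << bit, std::memory_order_relaxed);
  bool won = ~before & (1u << bit);
  std::printf("first claim won: %d\n", won); // 1

  // Second claim on the same bit: fetch_or reports the bit already set.
  before = word.fetch_or(1u << bit, std::memory_order_relaxed);
  won = ~before & (1u << bit);
  std::printf("second claim won: %d\n", won); // 0
}
```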
@@ -319,14 +323,14 @@ template <typename T> struct GuardPtr {
   RefCounter ref{};
 
   // A sentinel value used to claim the pointer slot.
-  static constexpr uint64_t sentinel = cpp::numeric_limits<uint64_t>::max();
+  static constexpr uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
 
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
   T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
     T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(sentinel),
+        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
                                     cpp::MemoryOrder::RELAXED,
                                     cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
@@ -343,7 +347,7 @@ template <typename T> struct GuardPtr {
       return mem;
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(sentinel))
+    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
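The rename to SENTINEL is stylistic, but the surrounding idiom is worth spelling out: a slot is claimed by CAS-ing it from null to an impossible sentinel address while the real object is constructed, and readers that observe either state back off. A stripped-down, hypothetical host-side version of that idiom (not the library's GuardPtr, which also layers reference counting and RPC-backed construction on top):

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Hypothetical single-slot cache used only for this sketch.
static constexpr uint64_t SENTINEL = ~0ull;
std::atomic<int *> slot{nullptr};

int *try_lock() {
  int *expected = slot.load(std::memory_order_relaxed);
  if (!expected &&
      slot.compare_exchange_strong(expected, reinterpret_cast<int *>(SENTINEL),
                                   std::memory_order_relaxed,
                                   std::memory_order_relaxed)) {
    // We won the race: construct the object, then publish it over the sentinel.
    int *obj = new int(42);
    slot.store(obj, std::memory_order_release);
    return obj;
  }
  if (!expected || expected == reinterpret_cast<int *>(SENTINEL))
    return nullptr; // empty or still under construction; caller retries
  return expected;  // already published
}

int main() {
  int *p = nullptr;
  while (!(p = try_lock()))
    ;
  std::printf("got %d\n", *p);
  delete slot.exchange(nullptr);
}
```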
@@ -460,7 +464,7 @@ void *allocate(uint64_t size) {
   if (!size)
     return nullptr;
 
-  // Allocations larger than a single slab go directly to memory.
+  // Allocations requiring a full slab or more go directly to memory.
   if (size >= SLAB_SIZE / 2)
     return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
 
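The reworded comment matches the check: once a request is at least half a slab, it cannot share a slab with the bitfield and header, so it is rounded up to a slab multiple and sent straight to the RPC path. A small sketch of that routing decision; SLAB_SIZE is assumed to be the 2 MiB slab and round_up is written out as a generic power-of-two round-up, since impl::round_up itself is not shown in this hunk:

```cpp
#include <cstdint>
#include <cstdio>

constexpr uint64_t SLAB_SIZE = 2 * 1024 * 1024; // assumed 2 MiB slab

// Round x up to the next multiple of SLAB_SIZE (a power of two).
constexpr uint64_t round_up(uint64_t x) {
  return (x + SLAB_SIZE - 1) & ~(SLAB_SIZE - 1);
}

int main() {
  const uint64_t sizes[] = {64, SLAB_SIZE / 2, 3 * SLAB_SIZE / 2};
  for (uint64_t size : sizes) {
    if (size >= SLAB_SIZE / 2)
      std::printf("%llu bytes -> direct allocation of %llu bytes\n",
                  (unsigned long long)size,
                  (unsigned long long)round_up(size));
    else
      std::printf("%llu bytes -> slab allocator\n", (unsigned long long)size);
  }
}
```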