@@ -31,6 +31,7 @@ constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
 constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
+constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
 // A sentinel used to indicate an invalid but non-null pointer value.
 constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
@@ -63,8 +64,8 @@ static void rpc_free(void *ptr) {
   port.close();
 }
 
-// Convert a potentially disjoint bitmask into an increasing integer for use
-// with indexing between gpu lanes.
+// Convert a potentially disjoint bitmask into an increasing integer per-lane
+// for use with indexing between gpu lanes.
 static inline uint32_t lane_count(uint64_t lane_mask) {
   return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
 }
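The lane_count helper maps a potentially sparse lane mask to dense indices by counting only the set bits strictly below the calling lane's position. A small host-side sketch of the same computation (the 64-bit mask and the explicit lane ids are illustrative, not part of the allocator):

#include <bit>
#include <cstdint>
#include <cstdio>

// Count the set bits of `lane_mask` strictly below position `lane_id`,
// mirroring impl::lane_count with the lane id passed in explicitly.
static uint32_t lane_count(uint64_t lane_mask, uint32_t lane_id) {
  return std::popcount(lane_mask & ((uint64_t(1) << lane_id) - 1));
}

int main() {
  // Lanes 1, 4, and 9 are active; they receive the dense indices 0, 1, and 2.
  uint64_t lane_mask = (1ull << 1) | (1ull << 4) | (1ull << 9);
  for (uint32_t id : {1u, 4u, 9u})
    std::printf("lane %u -> index %u\n", id, lane_count(lane_mask, id));
}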
@@ -78,7 +79,7 @@ static inline uint32_t entropy() {
          0x9e3779bb;
 }
 
-// Generate a random number and update the state using the xorshift*32 PRNG.
+// Generate a random number and update the state using the xorshift32* PRNG.
 static inline uint32_t xorshift32(uint32_t &state) {
   state ^= state << 13;
   state ^= state >> 17;
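An xorshift32* generator finishes Marsaglia's three xorshifts with a multiplicative scramble of the output. A minimal sketch follows; the 13/17/5 shifts match the lines above, while the odd multiplier is only a stand-in since the constant used by the allocator is outside this hunk:

#include <cstdint>

// Sketch of one xorshift32* step: permute the state with three xorshifts,
// then multiply the returned value by an odd constant (placeholder value).
static inline uint32_t xorshift32_star(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bbu;
}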
@@ -109,12 +110,12 @@ static inline uint32_t get_chunk_size(uint32_t x) {
   uint32_t s3 = 0b1000 << (pow2 - 3);
 
   if (s0 > y)
-    return (s0 + 15) & ~15;
+    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
   if (s1 > y)
-    return (s1 + 15) & ~15;
+    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
   if (s2 > y)
-    return (s2 + 15) & ~15;
-  return (s3 + 15) & ~15;
+    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
 }
 
 // Rounds to the nearest power of two.
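MIN_ALIGNMENT is simply the 0xF mask used to round a candidate chunk size up to the next multiple of MIN_SIZE, replacing the magic 15s above. A tiny self-contained check of the mask trick:

#include <cstdint>

constexpr uint32_t MIN_SIZE = 16;
constexpr uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// Round `s` up to the next multiple of MIN_SIZE with the same mask expression
// used in get_chunk_size.
constexpr uint32_t round_to_min(uint32_t s) {
  return (s + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}

static_assert(round_to_min(40) == 48, "40 rounds up to the next 16-byte step");
static_assert(round_to_min(64) == 64, "multiples of 16 are unchanged");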
@@ -126,7 +127,7 @@ static inline constexpr T round_up(const T x) {
 
 } // namespace impl
 
-/// A slab allocator used to hand out indentically sized slabs of memory.
+/// A slab allocator used to hand out identically sized slabs of memory.
 /// Allocation is done through random walks of a bitfield until a free bit is
 /// encountered. This reduces contention and is highly parallel on a GPU.
 ///
@@ -158,39 +159,39 @@ struct Slab {
     __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
   }
 
-  // Get the number of chunks that can theoretically fit inside this array.
-  static uint32_t num_chunks(uint32_t chunk_size) {
+  // Get the number of chunks that can theoretically fit inside this slab.
+  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
     return SLAB_SIZE / chunk_size;
   }
 
   // Get the number of bytes needed to contain the bitfield bits.
-  static uint32_t bitfield_bytes(uint32_t chunk_size) {
+  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
     return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
   }
 
   // The actual amount of memory available excluding the bitfield and metadata.
-  static uint32_t available_bytes(uint32_t chunk_size) {
+  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
     return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
   }
 
   // The number of chunks that can be stored in this slab.
-  static uint32_t available_chunks(uint32_t chunk_size) {
+  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
     return available_bytes(chunk_size) / chunk_size;
   }
 
   // The length in bits of the bitfield.
-  static uint32_t usable_bits(uint32_t chunk_size) {
+  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
     return available_bytes(chunk_size) / chunk_size;
   }
 
   // Get the location in the memory where we will store the chunk size.
   uint32_t get_chunk_size() const {
-    return *reinterpret_cast<const uint32_t *>(memory);
+    return reinterpret_cast<const Header *>(memory)->chunk_size;
   }
 
   // Get the location in the memory where we will store the global index.
   uint32_t get_global_index() const {
-    return *reinterpret_cast<const uint32_t *>(memory + sizeof(uint32_t));
+    return reinterpret_cast<const Header *>(memory)->global_index;
   }
 
   // Get a pointer to where the bitfield is located in the memory.
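Making these helpers constexpr lets the slab layout be evaluated at compile time. As a sanity check, the standalone sketch below plugs a 2 MiB slab (the size implied by the 2 MiB alignment test in deallocate, but assumed here) and 16-byte chunks into the same formulas; the Header is likewise assumed to hold only the chunk size and global index read by the accessors above:

#include <cstdint>

constexpr uint64_t SLAB_SIZE = 2 * 1024 * 1024; // assumed 2 MiB slabs
constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

// Assumed header layout holding the two fields the accessors read.
struct Header {
  uint32_t chunk_size;
  uint32_t global_index;
};

constexpr uint32_t num_chunks(uint32_t chunk_size) {
  return SLAB_SIZE / chunk_size;
}
constexpr uint32_t bitfield_bytes(uint32_t chunk_size) {
  return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
}
constexpr uint32_t available_bytes(uint32_t chunk_size) {
  return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
}
constexpr uint32_t available_chunks(uint32_t chunk_size) {
  return available_bytes(chunk_size) / chunk_size;
}

// 16-byte chunks: 131072 theoretical chunks, a 32768-byte bitfield, and
// 129023 chunks actually available once the bitfield and header are removed.
static_assert(num_chunks(16) == 131072, "");
static_assert(bitfield_bytes(16) == 32768, "");
static_assert(available_chunks(16) == 129023, "");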
@@ -200,8 +201,8 @@ struct Slab {
 
   // Get a pointer to where the actual memory to be allocated lives.
   uint8_t *get_memory(uint32_t chunk_size) {
-    return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
-           sizeof(Header);
+    return reinterpret_cast<uint8_t *>(get_bitfield()) +
+           bitfield_bytes(chunk_size);
   }
 
   // Get a pointer to the actual memory given an index into the bitfield.
@@ -221,11 +222,12 @@ struct Slab {
   void *allocate(uint64_t lane_mask, uint64_t uniform) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
-    void *result = nullptr;
+
     // The uniform mask represents which lanes contain a uniform target pointer.
     // We attempt to place these next to each other.
     // TODO: We should coalesce these bits and use the result of `fetch_or` to
     // search for free bits in parallel.
+    void *result = nullptr;
     for (uint64_t mask = lane_mask; mask;
          mask = gpu::ballot(lane_mask, !result)) {
       uint32_t id = impl::lane_count(uniform & mask);
@@ -235,7 +237,7 @@ struct Slab {
 
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
-      if (mask & (uint64_t(1) << gpu::get_lane_id())) {
+      if (!result) {
         uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
         if (~before & (1 << bit))
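With the guard changed to `!result`, a lane keeps retrying until it actually claims a bit; the claim itself is the fetch_or plus the check that the bit was clear beforehand. A host-side sketch of that step, using std::atomic in place of the internal cpp::AtomicRef and leaving out the lane/ballot machinery:

#include <atomic>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

// Try to claim chunk `index` in the slab's bitfield. Returns true only when
// this caller flipped the bit, i.e. the chunk was free before the atomic OR.
bool claim_chunk(std::atomic<uint32_t> *bitfield, uint32_t index) {
  uint32_t slot = index / BITS_IN_WORD;
  uint32_t bit = index % BITS_IN_WORD;
  uint32_t before =
      bitfield[slot].fetch_or(1u << bit, std::memory_order_relaxed);
  return (~before & (1u << bit)) != 0;
}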
@@ -274,7 +276,7 @@ template <typename T> struct GuardPtr {
   static constexpr uint64_t INVALID = uint64_t(1) << 63;
 
   // If a read preempts an unlock call we indicate this so the following
-  // unlock call can swap out the helped bit and maintain exlusive ownership.
+  // unlock call can swap out the helped bit and maintain exclusive ownership.
   static constexpr uint64_t HELPED = uint64_t(1) << 62;
 
   // Resets the reference counter, cannot be reset to zero safely.
@@ -293,8 +295,8 @@ template <typename T> struct GuardPtr {
   // called following a valid acquire call.
   bool release(uint32_t n) {
     // If this thread caused the counter to reach zero we try to invalidate it
-    // and obtain exclusive rights to descontruct it. If the CAS failed either
-    // another thread resurrced the counter and we quit, or a parallel read
+    // and obtain exclusive rights to deconstruct it. If the CAS failed either
+    // another thread resurrected the counter and we quit, or a parallel read
     // helped us invalidating it. For the latter, claim that flag and return.
     if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
       uint64_t expected = 0;
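The comment describes the shutdown race: the last release tries to CAS the counter from zero to INVALID, and a reader that slipped in meanwhile may have performed the invalidation on our behalf via the HELPED bit. The sketch below shows only the straightforward path, with std::atomic standing in for the internal atomic wrapper; the HELPED hand-off is deliberately omitted, so treat it as an outline rather than the actual GuardPtr logic:

#include <atomic>
#include <cstdint>

constexpr uint64_t INVALID = uint64_t(1) << 63;

// Drop `n` references. If that brought the count to zero, attempt to mark the
// counter INVALID so later readers cannot resurrect the object. Returns true
// when the caller gained exclusive rights to tear the object down.
bool release(std::atomic<uint64_t> &counter, uint32_t n) {
  if (counter.fetch_sub(n, std::memory_order_relaxed) == n) {
    uint64_t expected = 0;
    return counter.compare_exchange_strong(expected, INVALID,
                                           std::memory_order_relaxed);
  }
  return false;
}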
@@ -497,7 +499,7 @@ void deallocate(void *ptr) {
   if (!ptr)
     return;
 
-  // All non-slab allocations will be alinged on a 2MiB boundary.
+  // All non-slab allocations will be aligned on a 2MiB boundary.
   if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
     return impl::rpc_free(ptr);
 
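The early-out works because slab chunks always sit past the slab's header and bitfield, so only allocations handed straight back by the RPC server can land exactly on a 2 MiB boundary. A small sketch of the same dispatch test, with the 2 MiB slab size assumed rather than taken from the file:

#include <cstdint>

constexpr uint64_t SLAB_SIZE = 2 * 1024 * 1024; // assumed 2 MiB slabs
constexpr uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;

// Mirrors the check in deallocate: pointers on an exact slab boundary were not
// carved out of a slab and must be returned through the RPC path instead.
bool is_rpc_allocation(const void *ptr) {
  return (reinterpret_cast<std::uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0;
}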