@@ -39,9 +39,6 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;
 
-// A sentinel used to indicate an invalid but non-null pointer value.
-constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
-
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -163,6 +160,11 @@ static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
   return BITS_IN_DWORD - cpp::countl_zero(ballot & ~mask) - 1;
 }
 
+// We use a sentinel value to indicate a failed or in-progress allocation.
+template <typename T> bool is_sentinel(const T &x) {
+  return x == cpp::numeric_limits<T>::max();
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
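For readers outside the libc tree, here is a minimal standalone sketch of the new helper, using std::numeric_limits in place of the internal cpp:: wrappers (an assumption for illustration only): the all-ones value of an unsigned type doubles as the "failed or in-progress" marker.

#include <cstdint>
#include <limits>

// Standalone equivalent of the idea behind impl::is_sentinel: the all-ones
// value of an unsigned type is reserved as the "failed or in-progress" marker.
template <typename T> bool is_sentinel(const T &x) {
  return x == std::numeric_limits<T>::max();
}

int main() {
  std::uint32_t count = std::numeric_limits<std::uint32_t>::max(); // sentinel count
  std::uintptr_t ptr_bits = ~std::uintptr_t(0);                    // sentinel pointer bits
  return is_sentinel(count) && is_sentinel(ptr_bits) ? 0 : 1;
}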
@@ -343,20 +345,20 @@ struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
-    static constexpr uint64_t INVALID = uint64_t(1) << 63;
+    static constexpr uint32_t INVALID = uint32_t(1) << 31;
 
     // If a read preempts an unlock call we indicate this so the following
     // unlock call can swap out the helped bit and maintain exclusive ownership.
-    static constexpr uint64_t HELPED = uint64_t(1) << 62;
+    static constexpr uint32_t HELPED = uint32_t(1) << 30;
 
     // Resets the reference counter, cannot be reset to zero safely.
-    void reset(uint32_t n, uint64_t &count) {
+    void reset(uint32_t n, uint32_t &count) {
       counter.store(n, cpp::MemoryOrder::RELAXED);
       count = n;
     }
 
     // Acquire a slot in the reference counter if it is not invalid.
-    bool acquire(uint32_t n, uint64_t &count) {
+    bool acquire(uint32_t n, uint32_t &count) {
       count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
       return (count & INVALID) == 0;
     }
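A rough, hypothetical illustration of why the flag values track the counter width: with a 32-bit counter, INVALID and HELPED occupy bits 31 and 30 instead of 63 and 62, leaving the low 30 bits for the reference count itself. COUNT_MASK below is illustrative and not part of the patch.

#include <cstdint>
#include <cstdio>

// Sketch: with a 32-bit counter the two flag bits move from 63/62 to 31/30,
// leaving the low 30 bits for the reference count. COUNT_MASK is illustrative.
static constexpr std::uint32_t INVALID = std::uint32_t(1) << 31;
static constexpr std::uint32_t HELPED = std::uint32_t(1) << 30;
static constexpr std::uint32_t COUNT_MASK = HELPED - 1; // low 30 bits

int main() {
  std::uint32_t counter = (std::uint32_t(5) & COUNT_MASK) | HELPED;
  std::printf("count=%u invalid=%d helped=%d\n",
              static_cast<unsigned>(counter & COUNT_MASK),
              (counter & INVALID) != 0, (counter & HELPED) != 0);
}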
@@ -369,7 +371,7 @@ struct GuardPtr {
       // another thread resurrected the counter and we quit, or a parallel read
       // helped us invalidating it. For the latter, claim that flag and return.
       if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
-        uint64_t expected = 0;
+        uint32_t expected = 0;
         if (counter.compare_exchange_strong(expected, INVALID,
                                             cpp::MemoryOrder::RELAXED,
                                             cpp::MemoryOrder::RELAXED))
@@ -392,28 +394,29 @@ struct GuardPtr {
       return (val & INVALID) ? 0 : val;
     }
 
-    cpp::Atomic<uint64_t> counter{0};
+    cpp::Atomic<uint32_t> counter{0};
   };
 
-  cpp::Atomic<Slab *> ptr{nullptr};
-  RefCounter ref{};
+  cpp::Atomic<Slab *> ptr;
+  RefCounter ref;
 
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
-  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+  Slab *try_lock_impl(uint32_t n, uint32_t &count, Args &&...args) {
     Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
         ptr.compare_exchange_strong(
-            expected, reinterpret_cast<Slab *>(SENTINEL),
+            expected,
+            reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
             cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
-      count = cpp::numeric_limits<uint64_t>::max();
+      count = cpp::numeric_limits<uint32_t>::max();
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
       return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
-    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
+    if (!expected || impl::is_sentinel(reinterpret_cast<uintptr_t>(expected)))
       return nullptr;
 
     if (!ref.acquire(n, count))
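The shape of try_lock_impl — claim the slot by CASing the pointer from null to an all-ones sentinel, allocate, then publish the real pointer — can be sketched against std::atomic. The names, the malloc stand-in, and the simplified error path below are assumptions for illustration, not the allocator's API.

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <limits>

struct Slab; // opaque in this sketch

// Sketch of the claim-allocate-publish pattern: one thread wins the CAS from
// nullptr to the sentinel and performs the allocation; others back off while
// the sentinel is visible.
Slab *try_claim(std::atomic<Slab *> &slot) {
  Slab *sentinel =
      reinterpret_cast<Slab *>(std::numeric_limits<std::uintptr_t>::max());
  Slab *expected = slot.load(std::memory_order_relaxed);
  if (!expected && slot.compare_exchange_strong(expected, sentinel,
                                                std::memory_order_relaxed)) {
    // We own the in-progress slot; allocate and publish the real pointer.
    Slab *fresh = static_cast<Slab *>(std::malloc(4096));
    if (!fresh)
      return nullptr; // the real code must also clear the sentinel on failure
    slot.store(fresh, std::memory_order_release);
    return fresh;
  }
  // Null or sentinel means the allocation failed or is still in progress.
  if (!expected || expected == sentinel)
    return nullptr;
  return expected;
}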
@@ -425,7 +428,7 @@ struct GuardPtr {
 
   // Finalize the associated memory and signal that it is ready to use by
   // resetting the counter.
-  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+  void finalize(Slab *mem, uint32_t n, uint32_t &count) {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     ptr.store(mem, cpp::MemoryOrder::RELAXED);
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
@@ -438,7 +441,7 @@ struct GuardPtr {
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint32_t &count,
                  Args &&...args) {
     count = 0;
     Slab *result = nullptr;
@@ -453,13 +456,13 @@ struct GuardPtr {
 
     // We defer storing the newly allocated slab until now so that we can use
     // multiple lanes to initialize it and release it for use.
-    if (count == cpp::numeric_limits<uint64_t>::max()) {
+    if (impl::is_sentinel(count)) {
       result->initialize(uniform);
       if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
         finalize(result, cpp::popcount(uniform), count);
     }
 
-    if (count != cpp::numeric_limits<uint64_t>::max())
+    if (!impl::is_sentinel(count))
       count = count - cpp::popcount(uniform) +
               impl::lane_count(uniform, gpu::get_lane_id()) + 1;
 
@@ -515,14 +518,15 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
     if (!offset ||
         slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
-      uint64_t reserved = 0;
+      uint32_t reserved = 0;
 
       Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
 
       // If there is a slab allocation in progress we retry a few times.
       for (uint32_t retries = 0;
-           retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+           !slab && !impl::is_sentinel(reserved) && retries < MAX_TRIES;
+           retries++) {
         uint64_t lane_mask = gpu::get_lane_mask();
         slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
                                      chunk_size, index);
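The reordered loop condition reads as "keep retrying while there is no slab, the reservation is not the sentinel, and tries remain." A self-contained sketch of that bounded-retry shape, where try_lock_once and the fake slab are hypothetical stand-ins rather than the allocator's symbols:

#include <cstdint>
#include <limits>

constexpr std::uint32_t MAX_TRIES = 1024;

template <typename T> bool is_sentinel(const T &x) {
  return x == std::numeric_limits<T>::max();
}

// Hypothetical stand-in for one locking attempt: fails a couple of times to
// mimic an in-progress allocation, then hands back a fake slab pointer.
static void *try_lock_once(std::uint32_t &reserved) {
  static int attempts = 0;
  static int fake_slab = 0;
  reserved = 0;
  return ++attempts < 3 ? nullptr : static_cast<void *>(&fake_slab);
}

// Bounded retry: stop as soon as we have a slab, the reservation reports a
// permanent failure (the sentinel), or the retry budget runs out.
void *acquire_with_retries() {
  std::uint32_t reserved = 0;
  void *slab = try_lock_once(reserved);
  for (std::uint32_t retries = 0;
       !slab && !is_sentinel(reserved) && retries < MAX_TRIES; retries++)
    slab = try_lock_once(reserved);
  return slab;
}

int main() { return acquire_with_retries() ? 0 : 1; }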
@@ -542,7 +546,7 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
                    slab->get_chunk_size() != chunk_size)) {
       slots[index].unlock(gpu::get_lane_mask(),
                           gpu::get_lane_mask() & uniform);
-    } else if (!slab && reserved == SENTINEL) {
+    } else if (!slab && impl::is_sentinel(reserved)) {
       uniform = uniform & gpu::get_lane_mask();
       return nullptr;
     } else {
@@ -575,7 +579,7 @@ void *allocate(uint64_t size) {
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
   Slab *slab = find_slab(chunk_size, uniform);
-  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
+  if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
     return nullptr;
 
   uint64_t lane_mask = gpu::get_lane_mask();