@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

+ // The number of times to attempt claiming an in-progress slab allocation.
+ constexpr static uint32_t MAX_TRIES = 128;
+
// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

- // The number of times we will try starting on a single index before skipping
- // past it.
- constexpr static uint32_t MAX_TRIES = 512;
-
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");

namespace impl {
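As a side note on the constants above: because MIN_SIZE is a power of two, MIN_ALIGNMENT = MIN_SIZE - 1 serves as a mask, and the `(x + MIN_ALIGNMENT) & ~MIN_ALIGNMENT` expression used by get_chunk_size further down rounds any size up to the next multiple of MIN_SIZE. The following standalone sketch (not part of the patch; the helper name align_up is invented here) checks that behavior.

// Standalone sketch, not part of allocator.cpp.
#include <cstdint>

constexpr uint32_t MIN_SIZE = 16;
constexpr uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// Hypothetical helper mirroring the `(x + MIN_ALIGNMENT) & ~MIN_ALIGNMENT`
// expression get_chunk_size uses to round results up to a 16-byte multiple.
constexpr uint32_t align_up(uint32_t x) {
  return (x + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}

static_assert(align_up(1) == 16, "rounds small sizes up to MIN_SIZE");
static_assert(align_up(16) == 16, "multiples of MIN_SIZE are unchanged");
static_assert(align_up(17) == 32, "otherwise rounds up to the next multiple");

int main() { return 0; }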
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
  return state * 0x9e3779bb;
}

- // Final stage of murmurhash used to get a unique index for the global array
- static inline uint32_t hash(uint32_t x) {
-   x ^= x >> 16;
-   x *= 0x85ebca6b;
-   x ^= x >> 13;
-   x *= 0xc2b2ae35;
-   x ^= x >> 16;
-   return x;
- }
-
// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes. This gives us average internal fragmentation of 87.5%.
- static inline uint32_t get_chunk_size(uint32_t x) {
+ static inline constexpr uint32_t get_chunk_size(uint32_t x) {
  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}

+ // Converts a chunk size into an index suitable for a statically sized array.
+ static inline constexpr uint32_t get_chunk_id(uint32_t x) {
+   if (x <= MIN_SIZE)
+     return 0;
+   uint32_t y = x >> 4;
+   if (x < MIN_SIZE << 2)
+     return cpp::popcount(y);
+   return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
+ }
+
// Rounds to the nearest power of two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
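To illustrate what the new get_chunk_id provides: the permitted chunk sizes (16, 32, 48, 64, 96, 112, 128, ...) map onto a dense sequence of array indices, which is what makes the per-size cache added later in this patch possible. The standalone sketch below copies the function, with std::popcount and std::countl_zero standing in for the cpp:: wrappers used in libc.

// Standalone sketch, not part of allocator.cpp.
#include <bit>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr uint32_t MIN_SIZE = 16;

// Same logic as the get_chunk_id added above, with the standard <bit>
// functions substituted for cpp::popcount and cpp::countl_zero.
constexpr uint32_t get_chunk_id(uint32_t x) {
  if (x <= MIN_SIZE)
    return 0;
  uint32_t y = x >> 4;
  if (x < MIN_SIZE << 2)
    return std::popcount(y);
  return std::popcount(y) + 3 * (BITS_IN_WORD - std::countl_zero(y)) - 7;
}

// The permitted chunk sizes land on consecutive ids...
static_assert(get_chunk_id(16) == 0);
static_assert(get_chunk_id(32) == 1);
static_assert(get_chunk_id(48) == 2);
static_assert(get_chunk_id(64) == 3);
static_assert(get_chunk_id(96) == 4);
static_assert(get_chunk_id(112) == 5);
static_assert(get_chunk_id(128) == 6);
// ...and half of a 2 MiB slab, the largest chunk, lands on id 45, matching
// the 46-entry `indices` cache introduced later in this patch.
static_assert(get_chunk_id((2 * 1024 * 1024) / 2) == 45);

int main() { return 0; }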
@@ -143,6 +142,12 @@ static inline constexpr bool is_pow2(uint64_t x) {
  return x && (x & (x - 1)) == 0;
}

+ // Where this chunk size should start looking in the global array.
+ static inline constexpr uint32_t start_index(uint32_t chunk_index) {
+   return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
+          impl::get_chunk_id(SLAB_SIZE / 2);
+ }
+
} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
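To see what start_index buys, the standalone sketch below assumes an ARRAY_SIZE of 1024 and the 2 MiB SLAB_SIZE mentioned in the comments (the real values are defined elsewhere in allocator.cpp). It shows the chunk ids being spread proportionally across the slot table, so small and large chunk sizes begin their searches in different regions instead of colliding on one neighborhood.

// Standalone sketch with assumed constants, not part of allocator.cpp.
#include <bit>
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr uint32_t MIN_SIZE = 16;
constexpr uint32_t SLAB_SIZE = 2 * 1024 * 1024; // 2 MiB, per the comments.
constexpr uint32_t ARRAY_SIZE = 1024;           // Assumed for illustration.

// get_chunk_id copied from the previous sketch.
constexpr uint32_t get_chunk_id(uint32_t x) {
  if (x <= MIN_SIZE)
    return 0;
  uint32_t y = x >> 4;
  if (x < MIN_SIZE << 2)
    return std::popcount(y);
  return std::popcount(y) + 3 * (BITS_IN_WORD - std::countl_zero(y)) - 7;
}

// Mirrors the start_index added above.
constexpr uint32_t start_index(uint32_t chunk_index) {
  return (ARRAY_SIZE * get_chunk_id(chunk_index)) /
         get_chunk_id(SLAB_SIZE / 2);
}

static_assert(start_index(16) == 0);      // Smallest chunks start at slot 0.
static_assert(start_index(128) == 136);   // id 6 of 45: ~13% into the table.
static_assert(start_index(65536) == 750); // id 33 of 45: ~73% into the table.

int main() { return 0; }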
@@ -451,66 +456,65 @@ struct GuardPtr {
// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

+ // Keep a cache of the last successful slot for each chunk size. Initialize it
+ // to an even spread of the total size. Must be updated if the chunking scheme
+ // changes.
+ #define S(X) (impl::start_index(X))
+ static cpp::Atomic<uint32_t> indices[] = {
+     S(16),     S(32),     S(48),     S(64),     S(96),     S(112),    S(128),
+     S(192),    S(224),    S(256),    S(384),    S(448),    S(512),    S(768),
+     S(896),    S(1024),   S(1536),   S(1792),   S(2048),   S(3072),   S(3584),
+     S(4096),   S(6144),   S(7168),   S(8192),   S(12288),  S(14336),  S(16384),
+     S(24576),  S(28672),  S(32768),  S(49152),  S(57344),  S(65536),  S(98304),
+     S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
+     S(524288), S(786432), S(917504), S(1048576)};
+ #undef S
+
// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
-   // We start at a hashed value to spread out different chunk sizes.
-   uint32_t start = impl::hash(chunk_size);
-   uint64_t lane_mask = gpu::get_lane_mask();
-   uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
-
-   Slab *result = nullptr;
-   uint32_t nudge = 0;
-   for (uint64_t mask = lane_mask; mask;
-        mask = gpu::ballot(lane_mask, !result), ++nudge) {
-     uint32_t index = cpp::numeric_limits<uint32_t>::max();
-     for (uint32_t offset = nudge / MAX_TRIES;
-          gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
-          offset += cpp::popcount(uniform & lane_mask)) {
-       uint32_t candidate =
-           (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
-       uint64_t available =
-           gpu::ballot(lane_mask, slots[candidate].use_count() <
-                                      Slab::available_chunks(chunk_size));
-       uint32_t new_index = gpu::shuffle(
-           lane_mask, cpp::countr_zero(available & uniform), candidate);
-
-       // Each uniform group will use the first empty slot they find.
-       if ((index == cpp::numeric_limits<uint32_t>::max() &&
-            (available & uniform)))
-         index = new_index;
-
-       // Guaruntees that this loop will eventuall exit if there is no space.
-       if (offset >= ARRAY_SIZE) {
-         result = reinterpret_cast<Slab *>(SENTINEL);
-         index = 0;
-       }
-     }
+   // We start at the index of the last successful allocation for this kind.
+   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
+   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
+   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+
+   for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+     uint32_t index =
+         !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;

-     // Try to claim a slot for the found slot.
-     if (!result) {
+     if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+       uint64_t lane_mask = gpu::get_lane_mask();
      uint64_t reserved = 0;
-       Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+       Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
+
+       // If there is a slab allocation in progress we retry a few times.
+       for (uint32_t retries = 0;
+            retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+         uint64_t lane_mask = gpu::get_lane_mask();
+         slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+                                      chunk_size, index);
+         sleep_briefly();
+       }
+
      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
-       // of out-of-memory we return a sentinel value.
+       // of out-of-memory we receive a sentinel value and return a failure.
      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
-         result = slab;
+         if (index != start)
+           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+         return slab;
      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
-         if (slab->get_chunk_size() != chunk_size)
-           start = index + 1;
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
-       } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
-         result = reinterpret_cast<Slab *>(SENTINEL);
-       } else {
-         sleep_briefly();
+       } else if (!slab && reserved == SENTINEL) {
+         return nullptr;
      }
    }
  }
-   return result;
+   return nullptr;
}

// Release the lock associated with a given slab.
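Putting the pieces together, the rewritten find_slab replaces the hashed probe sequence with: (1) start at the slot cached in `indices` for this chunk size, (2) otherwise walk forward from start_index, (3) retry a slot whose slab is still being constructed at most MAX_TRIES times, and (4) publish the winning slot back to the cache. The sketch below is a single-threaded approximation of that control flow only: the SIMT coordination (gpu::match_any, gpu::ballot, GuardPtr::try_lock), the use_count check, and the mismatched-slab unlock path are replaced by hypothetical stand-ins, and the constants are assumed values.

// Standalone, single-threaded sketch of the control flow only; not libc code.
#include <atomic>
#include <cstdint>
#include <cstdio>

constexpr uint32_t ARRAY_SIZE = 1024; // Assumed; defined elsewhere in libc.
constexpr uint32_t MAX_TRIES = 128;
constexpr uint64_t SENTINEL = ~0ull;

struct Slab {}; // Placeholder for the real slab type.
static Slab dummy_slab;

// One cached slot per chunk id (46 ids with the chunking scheme above).
static std::atomic<uint32_t> indices[46];

// Hypothetical stand-ins: the real code uses impl::get_chunk_id,
// impl::start_index, slots[index].use_count(), and GuardPtr::try_lock with
// SIMT lane masks. try_claim() reports out-of-memory via reserved == SENTINEL
// and returns nullptr while another thread is still building the slab.
static uint32_t get_chunk_id(uint32_t) { return 0; }
static uint32_t start_index(uint32_t) { return 0; }
static bool has_space(uint32_t, uint32_t) { return true; }
static Slab *try_claim(uint32_t, uint32_t, uint64_t &reserved) {
  reserved = 1;
  return &dummy_slab;
}
static void sleep_briefly() {}

static Slab *find_slab_sketch(uint32_t chunk_size) {
  uint32_t chunk_id = get_chunk_id(chunk_size);
  // 1. Start at the slot that satisfied the last allocation of this size.
  uint32_t start = indices[chunk_id].load(std::memory_order_relaxed);

  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
    // 2. After the cached slot, walk forward from this size's home position.
    uint32_t index =
        !offset ? start : (start_index(chunk_size) + offset) % ARRAY_SIZE;
    if (!has_space(index, chunk_size))
      continue;

    uint64_t reserved = 0;
    Slab *slab = try_claim(index, chunk_size, reserved);
    // 3. Bounded retry while another thread is still constructing the slab.
    for (uint32_t retries = 0;
         retries < MAX_TRIES && !slab && reserved != SENTINEL; ++retries) {
      slab = try_claim(index, chunk_size, reserved);
      sleep_briefly();
    }

    if (slab) {
      // 4. Publish the winning slot so the next request starts here.
      if (index != start)
        indices[chunk_id].store(index, std::memory_order_relaxed);
      return slab;
    }
    if (reserved == SENTINEL) // Out of memory.
      return nullptr;
  }
  return nullptr;
}

int main() {
  std::printf("claimed a slab: %s\n", find_slab_sketch(32) ? "yes" : "no");
}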