@@ -229,24 +229,34 @@ struct Slab {
229229
230230 // The uniform mask represents which lanes contain a uniform target pointer.
231231 // We attempt to place these next to each other.
232- // TODO: We should coalesce these bits and use the result of `fetch_or` to
233- // search for free bits in parallel.
234232 void *result = nullptr ;
235233 for (uint64_t mask = lane_mask; mask;
236234 mask = gpu::ballot (lane_mask, !result)) {
237- uint32_t id = impl::lane_count (uniform & mask);
238- uint32_t index =
239- ( gpu::broadcast_value (lane_mask, impl::xorshift32 (state)) + id) %
240- usable_bits (chunk_size );
235+ if (result)
236+ continue ;
237+
238+ uint32_t start = gpu::broadcast_value (lane_mask, impl::xorshift32 (state) );
241239
240+ uint32_t id = impl::lane_count (uniform & mask);
241+ uint32_t index = (start + id) % usable_bits (chunk_size);
242242 uint32_t slot = index / BITS_IN_WORD;
243243 uint32_t bit = index % BITS_IN_WORD;
244- if (!result) {
245- uint32_t before = cpp::AtomicRef (get_bitfield ()[slot])
246- .fetch_or (1u << bit, cpp::MemoryOrder::RELAXED);
247- if (~before & (1 << bit))
248- result = ptr_from_index (index, chunk_size);
249- }
244+
245+ // Get the mask of bits destined for the same slot and coalesce it.
246+ uint64_t match = uniform & gpu::match_any (mask, slot);
247+ uint32_t length = cpp::popcount (match);
248+ uint32_t bitmask = static_cast <uint32_t >((uint64_t (1 ) << length) - 1 )
249+ << bit;
250+
251+ uint32_t before = 0 ;
252+ if (gpu::get_lane_id () == static_cast <uint32_t >(cpp::countr_zero (match)))
253+ before = cpp::AtomicRef (get_bitfield ()[slot])
254+ .fetch_or (bitmask, cpp::MemoryOrder::RELAXED);
255+ before = gpu::shuffle (mask, cpp::countr_zero (match), before);
256+ if (~before & (1 << bit))
257+ result = ptr_from_index (index, chunk_size);
258+ else
259+ sleep_briefly ();
250260 }
251261
252262 cpp::atomic_thread_fence (cpp::MemoryOrder::ACQUIRE);
0 commit comments