@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
   return (x + N) & ~(N - 1);
 }
 
+// Perform a lane-parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t workers = cpp::popcount(uniform);
+  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+    s[i] = c;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -163,13 +171,9 @@ struct Slab {
   // must be called before the bitfield can be accessed safely, memory is not
   // guaranteed to be zero initialized in the current implementation.
   void initialize(uint64_t uniform) {
-    uint64_t mask = gpu::get_lane_mask();
-    uint32_t *bitfield = get_bitfield();
-    uint32_t workers = cpp::popcount(uniform);
-    uint32_t words = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
-                     sizeof(uint32_t);
-    for (uint32_t i = impl::lane_count(mask & uniform); i < words; i += workers)
-      bitfield[i] = 0;
+    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+                    sizeof(uint32_t);
+    impl::uniform_memset(get_bitfield(), 0, size, uniform);
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
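The rounding in initialize() is the usual ceiling division that converts the bitfield's byte count into whole uint32_t words, so a partially used trailing word is still cleared. A standalone check of the arithmetic, with a hypothetical helper name:

#include <cstdint>

// Ceiling division from bytes to 32-bit words, mirroring the expression in
// initialize(): a partial trailing word rounds up to a full word.
static constexpr uint32_t words_for_bytes(uint32_t bytes) {
  return (bytes + sizeof(uint32_t) - 1) / sizeof(uint32_t);
}
static_assert(words_for_bytes(13) == 4, "13 bytes need 4 words");
static_assert(words_for_bytes(16) == 4, "exact multiple stays unchanged");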