@@ -113,12 +113,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
113113// linear search
114114#pragma unroll
115115 for (cuda::std::uint32_t i = 0 ; i < storage_ref_.metadata ().num_containers ; i++) {
116+ cuda::std::byte const * key_ptr =
117+ storage_ref_.key_cards () + (i * 2 ) * sizeof (cuda::std::uint16_t );
116118 if constexpr (Aligned) {
117- key = aligned_load<cuda::std::uint16_t >(storage_ref_.key_cards () +
118- (i * 2 ) * sizeof (cuda::std::uint16_t ));
119+ key = aligned_load<cuda::std::uint16_t >(key_ptr);
119120 } else {
120- key = misaligned_load<cuda::std::uint16_t >(storage_ref_.key_cards () +
121- (i * 2 ) * sizeof (cuda::std::uint16_t ));
121+ key = misaligned_load<cuda::std::uint16_t >(key_ptr);
122122 }
123123 if (key == upper) { return this ->contains_container <Aligned>(lower, i); }
124124 if (key > upper) { return false ; }
@@ -129,12 +129,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
129129 cuda::std::uint32_t right = storage_ref_.metadata ().num_containers ;
130130 while (left < right) {
131131 cuda::std::uint32_t mid = left + (right - left) / 2 ;
132+ cuda::std::byte const * key_ptr =
133+ storage_ref_.key_cards () + (mid * 2 ) * sizeof (cuda::std::uint16_t );
132134 if constexpr (Aligned) {
133- key = aligned_load<cuda::std::uint16_t >(storage_ref_.key_cards () +
134- (mid * 2 ) * sizeof (cuda::std::uint16_t ));
135+ key = aligned_load<cuda::std::uint16_t >(key_ptr);
135136 } else {
136- key = misaligned_load<cuda::std::uint16_t >(storage_ref_.key_cards () +
137- (mid * 2 ) * sizeof (cuda::std::uint16_t ));
137+ key = misaligned_load<cuda::std::uint16_t >(key_ptr);
138138 }
139139
140140 if (key == upper) {
@@ -170,29 +170,29 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
170170 __device__ bool contains_container (cuda::std::uint16_t lower, cuda::std::uint32_t index) const
171171 {
172172 cuda::std::uint32_t offset;
173+ cuda::std::byte const * offset_ptr =
174+ storage_ref_.container_offsets () + index * sizeof (cuda::std::uint32_t );
173175 if (offsets_aligned_) {
174- offset = aligned_load<cuda::std::uint32_t >(storage_ref_.container_offsets () +
175- index * sizeof (cuda::std::uint32_t ));
176+ offset = aligned_load<cuda::std::uint32_t >(offset_ptr);
176177 } else {
177- offset = misaligned_load<cuda::std::uint32_t >(storage_ref_.container_offsets () +
178- index * sizeof (cuda::std::uint32_t ));
178+ offset = misaligned_load<cuda::std::uint32_t >(offset_ptr);
179179 }
180180 cuda::std::byte const * container = storage_ref_.data () + offset;
181181 if (storage_ref_.metadata ().has_run and check_bit (storage_ref_.run_container_bitmap (), index)) {
182182 return this ->contains_run_container <Aligned>(container, lower);
183183 } else {
184184 cuda::std::uint32_t card;
185+ cuda::std::byte const * card_ptr =
186+ storage_ref_.key_cards () + (index * 2 + 1 ) * sizeof (cuda::std::uint16_t );
185187 if constexpr (Aligned) {
186- card = 1u + aligned_load<cuda::std::uint16_t >(
187- storage_ref_.key_cards () + (index * 2 + 1 ) * sizeof (cuda::std::uint16_t ));
188+ card = 1u + aligned_load<cuda::std::uint16_t >(card_ptr);
188189 } else {
189- card = 1u + misaligned_load<cuda::std::uint16_t >(
190- storage_ref_.key_cards () + (index * 2 + 1 ) * sizeof (cuda::std::uint16_t ));
190+ card = 1u + misaligned_load<cuda::std::uint16_t >(card_ptr);
191191 }
192192 if (card <= storage_ref_type::metadata_type::max_array_container_card) {
193193 return this ->contains_array_container <Aligned>(container, lower, card);
194194 } else {
195- return this ->contains_bitset_container (container, lower, card );
195+ return this ->contains_bitset_container (container, lower);
196196 }
197197 }
198198 }
@@ -206,10 +206,11 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
206206 // Use linear search for small arrays, binary search for larger ones
207207 if (card < binary_search_threshold) {
208208 for (cuda::std::uint32_t i = 0 ; i < card; i++) {
209+ cuda::std::byte const * elem_ptr = container + i * sizeof (cuda::std::uint16_t );
209210 if constexpr (Aligned) {
210- elem = aligned_load<cuda::std::uint16_t >(container + i * sizeof (cuda::std:: uint16_t ) );
211+ elem = aligned_load<cuda::std::uint16_t >(elem_ptr );
211212 } else {
212- elem = misaligned_load<cuda::std::uint16_t >(container + i * sizeof (cuda::std:: uint16_t ) );
213+ elem = misaligned_load<cuda::std::uint16_t >(elem_ptr );
213214 }
214215 if (elem == lower) { return true ; }
215216 }
@@ -219,12 +220,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
219220 cuda::std::uint32_t right = card;
220221
221222 while (left < right) {
222- cuda::std::uint32_t mid = left + (right - left) / 2 ;
223+ cuda::std::uint32_t mid = left + (right - left) / 2 ;
224+ cuda::std::byte const * elem_ptr = container + mid * sizeof (cuda::std::uint16_t );
223225 if constexpr (Aligned) {
224- elem = aligned_load<cuda::std::uint16_t >(container + mid * sizeof (cuda::std:: uint16_t ) );
226+ elem = aligned_load<cuda::std::uint16_t >(elem_ptr );
225227 } else {
226- elem =
227- misaligned_load<cuda::std::uint16_t >(container + mid * sizeof (cuda::std::uint16_t ));
228+ elem = misaligned_load<cuda::std::uint16_t >(elem_ptr);
228229 }
229230 if (elem == lower) {
230231 return true ;
@@ -239,11 +240,9 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
239240 }
240241
241242 __device__ bool contains_bitset_container (cuda::std::byte const * container,
242- cuda::std::uint16_t lower,
243- cuda::std::uint32_t card) const
243+ cuda::std::uint16_t lower) const
244244 {
245- return static_cast <cuda::std::uint8_t >(container[lower / 8 ]) &
246- (cuda::std::uint8_t (1 ) << (lower % 8 ));
245+ return check_bit (container, lower);
247246 }
248247
249248 template <bool Aligned>
@@ -262,19 +261,18 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
262261 cuda::std::uint32_t end;
263262
264263 for (cuda::std::uint32_t i = 0 ; i < num_runs; i++) {
264+ // the first 16 bits of the run container denotes the number of runs
265+ // followed by the sequence of runs as (start, end) U16 pairs
266+ cuda::std::byte const * start_ptr = container + (i * 2 + 1 ) * sizeof (cuda::std::uint16_t );
265267 // TODO load start+end in one instruction
266268 if constexpr (Aligned) {
267- start =
268- aligned_load<cuda::std::uint16_t >(container + (i * 2 + 1 ) * sizeof (cuda::std::uint16_t ));
269- end =
270- static_cast <cuda::std::uint32_t >(start) +
271- aligned_load<cuda::std::uint16_t >(container + (i * 2 + 2 ) * sizeof (cuda::std::uint16_t ));
269+ start = aligned_load<cuda::std::uint16_t >(start_ptr);
270+ end = static_cast <cuda::std::uint32_t >(start) +
271+ aligned_load<cuda::std::uint16_t >(start_ptr + sizeof (cuda::std::uint16_t ));
272272 } else {
273- start = misaligned_load<cuda::std::uint16_t >(container +
274- (i * 2 + 1 ) * sizeof (cuda::std::uint16_t ));
273+ start = misaligned_load<cuda::std::uint16_t >(start_ptr);
275274 end = static_cast <cuda::std::uint32_t >(start) +
276- misaligned_load<cuda::std::uint16_t >(container +
277- (i * 2 + 2 ) * sizeof (cuda::std::uint16_t ));
275+ misaligned_load<cuda::std::uint16_t >(start_ptr + sizeof (cuda::std::uint16_t ));
278276 }
279277 if (start <= lower && end >= lower) { return true ; }
280278 if (start > lower) { break ; }
0 commit comments