@@ -1220,6 +1220,7 @@ class open_addressing_ref_impl {
12201220 }
12211221
12221222 // Fill the buffer if any matching keys are found
1223+ auto const lane_id = probing_tile.thread_rank ();
12231224 if (thrust::any_of (thrust::seq, exists, exists + bucket_size, thrust::identity{})) {
12241225 if constexpr (IsOuter) { found_match = true ; }
12251226
@@ -1233,7 +1234,7 @@ class open_addressing_ref_impl {
12331234 thrust::reduce (thrust::seq, num_matches, num_matches + bucket_size);
12341235
12351236 int32_t output_idx;
1236- if (probing_tile. thread_rank () == 0 ) {
1237+ if (lane_id == 0 ) {
12371238 auto ref =
12381239 cuda::atomic_ref<int32_t , cuda::thread_scope_block>{counters[flushing_tile_id]};
12391240 output_idx = ref.fetch_add (total_matches, cuda::memory_order_relaxed);
@@ -1244,8 +1245,7 @@ class open_addressing_ref_impl {
12441245#pragma unroll buffer_size
12451246 for (int32_t i = 0 ; i < bucket_size; ++i) {
12461247 if (equals[i]) {
1247- auto const lane_offset =
1248- detail::count_least_significant_bits (exists[i], probing_tile.thread_rank ());
1248+ auto const lane_offset = detail::count_least_significant_bits (exists[i], lane_id);
12491249 buffers[flushing_tile_id][output_idx + matche_offset + lane_offset] = {
12501250 probe_key, bucket_slots[i]};
12511251 }
@@ -1255,7 +1255,7 @@ class open_addressing_ref_impl {
12551255 // Special handling for outer cases where no match is found
12561256 if constexpr (IsOuter) {
12571257 if (!running) {
1258- if (!found_match and probing_tile. thread_rank () == 0 ) {
1258+ if (!found_match and lane_id == 0 ) {
12591259 auto ref =
12601260 cuda::atomic_ref<int32_t , cuda::thread_scope_block>{counters[flushing_tile_id]};
12611261 auto const output_idx = ref.fetch_add (1 , cuda::memory_order_relaxed);
0 commit comments