Use cooperative-groups for warp-parallel kernels in strings functions (rapidsai#18959)

davidwendt · web-flow · commit 6bc515d82195 · 2025-06-03T17:10:16.000Z
Replaces hand-written warp-parallel logic (manual warp/lane index arithmetic, `__syncwarp()`, and per-string atomics) in internal strings kernels with cooperative-groups equivalents (`cg::tiled_partition`, `warp.sync()`, `cg::reduce`). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: rapidsai#18959
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
@@ -35,7 +35,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <cub/cub.cuh>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 #include <cuda/atomic>
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -285,17 +286,20 @@ CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
                                     column_device_view d_strings,
                                     size_type* d_sizes)
 {
-  auto idx = cudf::detail::grid_1d::global_thread_id();
-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
+  namespace cg        = cooperative_groups;
+  auto const warp     = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+  auto const lane_idx = warp.thread_rank();
 
-  auto const str_idx  = idx / cudf::detail::warp_size;
-  auto const lane_idx = idx % cudf::detail::warp_size;
+  // one warp per string: the row index must come from the grid-wide thread id;
+  // warp.meta_group_rank() only numbers the warps inside a single thread block
+  auto const str_idx = cudf::detail::grid_1d::global_thread_id() / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
 
-  // initialize the output for the atomicAdd
-  if (lane_idx == 0) { d_sizes[str_idx] = 0; }
-  __syncwarp();
-
-  if (d_strings.is_null(str_idx)) { return; }
+  if (d_strings.is_null(str_idx)) {
+    // null rows still report a size of 0, as the removed initialization did
+    if (lane_idx == 0) { d_sizes[str_idx] = 0; }
+    return;
+  }
   auto const d_str   = d_strings.element<string_view>(str_idx);
   auto const str_ptr = d_str.data();
 
@@ -311,11 +315,9 @@ CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
       size += converter.process_character(u8);
     }
   }
-  // this is slightly faster than using the cub::warp_reduce
-  if (size > 0) {
-    cuda::atomic_ref<size_type, cuda::thread_scope_block> ref{*(d_sizes + str_idx)};
-    ref.fetch_add(size, cuda::std::memory_order_relaxed);
-  }
+
+  auto out_size = cg::reduce(warp, size, cg::plus<size_type>());
+  if (lane_idx == 0) { d_sizes[str_idx] = out_size; }
 }
 
 /**
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,9 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <cooperative_groups/scan.h>
 #include <cub/cub.cuh>
 
 namespace cudf {
@@ -202,11 +205,14 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size];
   __shared__ typename cub::WarpReduce<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
 
-  auto const global_thread_id =
-    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
-  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  namespace cg     = cooperative_groups;
+  auto const block = cg::this_thread_block();
+  auto const warp  = cg::tiled_partition<cudf::detail::warp_size>(block);
+
+  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
+  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id    = static_cast<size_type>(warp.meta_group_rank());
+  auto const warp_lane        = static_cast<size_type>(warp.thread_rank());
   auto const nwarps =
     static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
@@ -241,7 +247,7 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
         in_chars_shared[char_idx] = in_idx < string_length ? in_chars[in_idx] : 0;
       }
 
-      __syncwarp();
+      warp.sync();
 
       // `char_idx_start` represents the start character index of the current warp.
       for (size_type char_idx_start = 0; char_idx_start < string_length_block;
@@ -258,7 +264,7 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
 
         if (warp_lane == 0) { escape_char_count += total_escape_char; }
 
-        __syncwarp();
+        warp.sync();
       }
     }
     // URL decoding replaces 3 bytes with 1 for each escape character.
@@ -289,11 +295,14 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   __shared__ typename cub::WarpScan<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
   __shared__ size_type out_idx[num_warps_per_threadblock];
 
-  auto const global_thread_id =
-    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
-  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  namespace cg     = cooperative_groups;
+  auto const block = cg::this_thread_block();
+  auto const warp  = cg::tiled_partition<cudf::detail::warp_size>(block);
+
+  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
+  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id    = static_cast<size_type>(warp.meta_group_rank());
+  auto const warp_lane        = static_cast<size_type>(warp.thread_rank());
   auto const nwarps =
     static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
@@ -326,7 +335,7 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
         in_chars_shared[char_idx] = in_idx >= 0 && in_idx < string_length ? in_chars[in_idx] : 0;
       }
 
-      __syncwarp();
+      warp.sync();
 
       // `char_idx_start` represents the start character index of the current warp.
       for (size_type char_idx_start = 0; char_idx_start < string_length_block;
@@ -364,7 +373,7 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
           out_idx[local_warp_id] += (out_offset + out_size);
         }
 
-        __syncwarp();
+        warp.sync();
       }
     }
   }
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 #include <cuda/atomic>
 #include <cuda/std/utility>
 #include <thrust/binary_search.h>
@@ -121,17 +122,14 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
                                          size_type const stop,
                                          size_type* d_results)
 {
-  auto const idx = cudf::detail::grid_1d::global_thread_id();
-
-  auto const str_idx = idx / cudf::detail::warp_size;
-  if (str_idx >= d_strings.size()) { return; }
-  auto const lane_idx = idx % cudf::detail::warp_size;
-
-  if (d_strings.is_null(str_idx)) { return; }
+  namespace cg        = cooperative_groups;
+  auto const warp     = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+  auto const lane_idx = warp.thread_rank();
 
-  // initialize the output for the atomicMin/Max
-  if (lane_idx == 0) { d_results[str_idx] = forward ? std::numeric_limits<size_type>::max() : -1; }
-  __syncwarp();
+  // one warp per string: the row index must come from the grid-wide thread id;
+  // warp.meta_group_rank() only numbers the warps inside a single thread block
+  auto const str_idx = cudf::detail::grid_1d::global_thread_id() / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size() or d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<string_view>(str_idx);
   auto const d_target = d_targets[str_idx];
@@ -158,16 +156,12 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
   }
 
   // find stores the minimum position while rfind stores the maximum position
-  // note that this was slightly faster than using cub::WarpReduce
-  cuda::atomic_ref<size_type, cuda::thread_scope_block> ref{*(d_results + str_idx)};
-  forward ? ref.fetch_min(position, cuda::std::memory_order_relaxed)
-          : ref.fetch_max(position, cuda::std::memory_order_relaxed);
-  __syncwarp();
+  auto const result = forward ? cg::reduce(warp, position, cg::less<size_type>())
+                              : cg::reduce(warp, position, cg::greater<size_type>());
 
   if (lane_idx == 0) {
     // the final result needs to be fixed up convert max() to -1
     // and a byte position to a character position
-    auto const result = d_results[str_idx];
     d_results[str_idx] =
       ((result < std::numeric_limits<size_type>::max()) && (result >= begin))
         ? start_char_pos + characters_in_string(d_str.data() + begin, result - begin)
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -129,7 +129,7 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
     itr += cudf::detail::warp_size;
   }
 
-  __syncwarp();
+  warp.sync();
 
   if (warp.thread_rank() == 0) {
     if (start >= char_count) {

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2019-2024, NVIDIA CORPORATION.`
	`2`	`+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.`
`3`	`3`	`*`
`4`	`4`	`* Licensed under the Apache License, Version 2.0 (the "License");`
`5`	`5`	`* you may not use this file except in compliance with the License.`
`@@ -129,7 +129,7 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,`
`129`	`129`	`itr += cudf::detail::warp_size;`
`130`	`130`	`}`
`131`	`131`
`132`		`- __syncwarp();`
	`132`	`+ warp.sync();`
`133`	`133`
`134`	`134`	`if (warp.thread_rank() == 0) {`
`135`	`135`	`if (start >= char_count) {`