Merge pull request #1107 from beomki-yeo/optimize-block-inclusive-scan

stephenswat · web-flow · commit a887f7f854d5 · 2025-07-31T13:48:53.000+02:00
Optimize the dimension of scanning kernels
diff --git a/device/cuda/src/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.cu b/device/cuda/src/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.cu
@@ -404,8 +404,27 @@ greedy_ambiguity_resolution_algorithm::operator()(
     unsigned int nThreads_full = 1024;
     unsigned int nBlocks_full = (n_tracks + 1023) / 1024;
 
-    unsigned int nThreads_scan = 1024;
-    unsigned int nBlocks_scan = (n_accepted + 1023) / 1024;
+    // Compute the threadblock dimension for scanning kernels: returns a
+    // (threads per block, number of blocks) pair. Starts at four warps and
+    // doubles the block size while more than 1024 blocks would be needed.
+    auto compute_scan_config = [&](unsigned int n_accepted) {
+        unsigned int nThreads_scan = m_warp_size * 4;
+        unsigned int nBlocks_scan =
+            (n_accepted + nThreads_scan - 1) / nThreads_scan;
+
+        // Only double while the result stays within 1024 threads per block;
+        // the old `<= 1024` test allowed one extra doubling to 2048.
+        while (nBlocks_scan > 1024 && nThreads_scan * 2 <= 1024) {
+            nThreads_scan *= 2;
+            nBlocks_scan = (n_accepted + nThreads_scan - 1) / nThreads_scan;
+        }
+
+        return std::make_pair(nThreads_scan, nBlocks_scan);
+    };
+
+    auto scan_dim = compute_scan_config(n_accepted);
+    unsigned int nThreads_scan = scan_dim.first;
+    unsigned int nBlocks_scan = scan_dim.second;
 
     assert(nBlocks_scan <= 1024 &&
            "nBlocks_scan larger than 1024 will cause invalid arguments in "
@@ -423,7 +442,10 @@ greedy_ambiguity_resolution_algorithm::operator()(
         nBlocks_adaptive =
             (n_accepted + nThreads_adaptive - 1) / nThreads_adaptive;
         nBlocks_warp = (n_accepted + nThreads_warp - 1) / nThreads_warp;
-        nBlocks_scan = (n_accepted + 1023) / 1024;
+
+        scan_dim = compute_scan_config(n_accepted);
+        nThreads_scan = scan_dim.first;
+        nBlocks_scan = scan_dim.second;
 
         // Make CUDA Graph
         cudaGraph_t graph;
diff --git a/device/cuda/src/ambiguity_resolution/kernels/block_inclusive_scan.cu b/device/cuda/src/ambiguity_resolution/kernels/block_inclusive_scan.cu
@@ -32,8 +32,6 @@ __global__ void block_inclusive_scan(
 
     auto globalIndex = threadIdx.x + blockIdx.x * blockDim.x;
     auto threadIndex = threadIdx.x;
-    auto blockIndex = blockIdx.x;
-    auto blockSize = blockDim.x;
 
     const unsigned int n_accepted = *(payload.n_accepted);
 
@@ -46,7 +44,7 @@ __global__ void block_inclusive_scan(
     __syncthreads();
 
     // inclusive scan in shared memory
-    for (int stride = 1; stride < blockSize; stride *= 2) {
+    for (int stride = 1; stride < blockDim.x; stride *= 2) {
         int val = 0;
         if (threadIndex >= stride) {
             val = shared_temp[threadIndex - stride];
@@ -64,8 +62,8 @@ __global__ void block_inclusive_scan(
 
     __syncthreads();
 
-    if (threadIndex == blockSize - 1) {
-        block_offsets[blockIndex] = shared_temp[threadIndex];
+    if (threadIndex == blockDim.x - 1) {
+        block_offsets[blockIdx.x] = shared_temp[threadIndex];
     }
 }
 
diff --git a/device/cuda/src/ambiguity_resolution/kernels/scan_block_offsets.cu b/device/cuda/src/ambiguity_resolution/kernels/scan_block_offsets.cu
@@ -27,22 +27,20 @@ __global__ void scan_block_offsets(device::scan_block_offsets_payload payload) {
     vecmem::device_vector<int> scanned_block_offsets(
         payload.scanned_block_offsets_view);
 
-    int n_blocks = (*(payload.n_accepted) + 1023) / 1024;
-
-    __syncthreads();
-
+    // The number of blocks in the previous block_inclusive_scan = the number of
+    // threads of this kernel
+    int n_blocks_prev = blockDim.x;
     auto threadIndex = threadIdx.x;
 
     // 1. Load from global to shared
-    int value = 0;
-    if (threadIndex < n_blocks) {
-        value = block_offsets[threadIndex];
+    shared_temp[threadIndex] = 0;
+    if (threadIndex < n_blocks_prev) {
+        shared_temp[threadIndex] = block_offsets[threadIndex];
     }
-    shared_temp[threadIndex] = value;
     __syncthreads();
 
     // 2. Inclusive scan (Hillis-Steele style)
-    for (int offset = 1; offset < n_blocks; offset *= 2) {
+    for (int offset = 1; offset < n_blocks_prev; offset *= 2) {
         int temp = 0;
         if (threadIndex >= offset) {
             temp = shared_temp[threadIndex - offset];
@@ -53,7 +51,7 @@ __global__ void scan_block_offsets(device::scan_block_offsets_payload payload) {
     }
 
     // 3. Write back
-    if (threadIndex < n_blocks) {
+    if (threadIndex < n_blocks_prev) {
         scanned_block_offsets[threadIndex] = shared_temp[threadIndex];
     }
 }