|
15 | 15 |
|
16 | 16 | namespace traccc::cuda::kernels { |
17 | 17 |
|
18 | | -__global__ void sort_updated_tracks( |
19 | | - device::sort_updated_tracks_payload payload) { |
| 18 | +__launch_bounds__(512) __global__ |
| 19 | + void sort_updated_tracks(device::sort_updated_tracks_payload payload) { |
20 | 20 |
|
21 | 21 | if (*(payload.terminate) == 1 || *(payload.n_updated_tracks) == 0) { |
22 | 22 | return; |
23 | 23 | } |
24 | 24 |
|
25 | | - extern __shared__ unsigned int shared_mem_tracks[]; |
| 25 | + __shared__ unsigned int shared_mem_tracks[512]; |
26 | 26 |
|
27 | 27 | vecmem::device_vector<const traccc::scalar> rel_shared( |
28 | 28 | payload.rel_shared_view); |
29 | 29 | vecmem::device_vector<const traccc::scalar> pvals(payload.pvals_view); |
30 | 30 | vecmem::device_vector<unsigned int> updated_tracks( |
31 | 31 | payload.updated_tracks_view); |
32 | 32 |
|
33 | | - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; |
34 | | - const unsigned int N = *(payload.n_updated_tracks); |
| 33 | + const unsigned int tid = threadIdx.x; |
35 | 34 |
|
36 | 35 | // Load to shared memory |
37 | | - if (tid < N) { |
| 36 | +    shared_mem_tracks[tid] = std::numeric_limits<unsigned int>::max();  // sentinel
| 37 | + |
| 38 | + if (tid < *(payload.n_updated_tracks)) { |
38 | 39 | shared_mem_tracks[tid] = updated_tracks[tid]; |
39 | 40 | } |
40 | 41 |
|
41 | 42 | __syncthreads(); |
42 | 43 |
|
43 | | - for (int iter = 0; iter < N; ++iter) { |
44 | | - bool is_even = (iter % 2 == 0); |
45 | | - int i = tid; |
| 44 | +    // Pad the number of tracks up to the next power of two, as required by bitonic sort
| 45 | + const unsigned int N = 1 << (32 - __clz(*(payload.n_updated_tracks) - 1)); |
| 46 | + |
| 47 | + traccc::scalar rel_i; |
| 48 | + traccc::scalar rel_j; |
| 49 | + traccc::scalar pval_i; |
| 50 | + traccc::scalar pval_j; |
| 51 | + |
| 52 | +    // Bitonic sort: ascending in rel_shared, ties broken by larger pval
| 53 | + for (int k = 2; k <= N; k <<= 1) { |
| 54 | + |
| 55 | +        bool ascending = ((tid & k) == 0);  // per-thread sort direction
46 | 56 |
|
47 | | - if (i < N / 2) { |
48 | | - int idx = 2 * i + (is_even ? 0 : 1); |
49 | | - if (idx + 1 < N) { |
50 | | - unsigned int a = shared_mem_tracks[idx]; |
51 | | - unsigned int b = shared_mem_tracks[idx + 1]; |
| 57 | + for (int j = k >> 1; j > 0; j >>= 1) { |
| 58 | +            int ixj = tid ^ j;  // compare-exchange partner index
52 | 59 |
|
53 | | - traccc::scalar rel_a = rel_shared[a]; |
54 | | - traccc::scalar rel_b = rel_shared[b]; |
55 | | - traccc::scalar pv_a = pvals[a]; |
56 | | - traccc::scalar pv_b = pvals[b]; |
| 60 | + if (ixj > tid && ixj < N && tid < N) { |
| 61 | + unsigned int trk_i = shared_mem_tracks[tid]; |
| 62 | + unsigned int trk_j = shared_mem_tracks[ixj]; |
57 | 63 |
|
58 | | - bool swap = false; |
59 | | - if (rel_a != rel_b) { |
60 | | - swap = rel_a > rel_b; |
| 64 | + if (trk_i == std::numeric_limits<unsigned int>::max()) { |
| 65 | + rel_i = std::numeric_limits<traccc::scalar>::max(); |
| 66 | + pval_i = 0.f; |
61 | 67 | } else { |
62 | | - swap = pv_a < pv_b; |
| 68 | + rel_i = rel_shared[trk_i]; |
| 69 | + pval_i = pvals[trk_i]; |
63 | 70 | } |
64 | 71 |
|
65 | | - if (swap) { |
66 | | - shared_mem_tracks[idx] = b; |
67 | | - shared_mem_tracks[idx + 1] = a; |
| 72 | + if (trk_j == std::numeric_limits<unsigned int>::max()) { |
| 73 | + rel_j = std::numeric_limits<traccc::scalar>::max(); |
| 74 | + pval_j = 0.f; |
| 75 | + } else { |
| 76 | + rel_j = rel_shared[trk_j]; |
| 77 | + pval_j = pvals[trk_j]; |
| 78 | + } |
| 79 | + |
| 80 | + bool should_swap = |
| 81 | + (rel_i > rel_j || (rel_i == rel_j && pval_i < pval_j)) == |
| 82 | + ascending; |
| 83 | + |
| 84 | + if (should_swap) { |
| 85 | + shared_mem_tracks[tid] = trk_j; |
| 86 | + shared_mem_tracks[ixj] = trk_i; |
68 | 87 | } |
69 | 88 | } |
| 89 | + __syncthreads(); |
70 | 90 | } |
71 | | - __syncthreads(); |
72 | 91 | } |
73 | 92 |
|
74 | | - if (tid < N) { |
| 93 | + if (tid < *(payload.n_updated_tracks)) { |
75 | 94 | updated_tracks[tid] = shared_mem_tracks[tid]; |
76 | 95 | } |
77 | 96 | } |
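
The new kernel is a single-block bitonic sort: each thread owns one shared-memory slot, slots beyond n_updated_tracks are filled with UINT_MAX sentinels so they sink to the end, the element count is rounded up to the next power of two with __clz, and the usual tid ^ j compare-exchange network runs over the padded range. Because the shared buffer is now a fixed 512-entry array indexed only by threadIdx.x, the kernel presumably runs as a single block of at most 512 threads with no dynamic shared-memory size at launch. In the kernel above the shared array holds track indices, so the sentinel is detected when fetching rel_shared and pvals; the sketch below shows the same pattern on a plain array of unsigned int keys, where the sentinel is the key itself. Every name in it (file name, kBlockSize, kernel, host driver) is an illustrative assumption, not traccc code.

```cuda
// bitonic_sketch.cu -- illustrative sketch only; every name here is an
// assumption for demonstration, none of it is traccc code.
// Build: nvcc -o bitonic_sketch bitonic_sketch.cu
#include <cstdio>

#include <cuda_runtime.h>

constexpr unsigned int kBlockSize = 512;  // mirrors __launch_bounds__(512)

// Sort the first n keys ascending with a single-block bitonic sort.
// Requires n <= kBlockSize; slots [n, N) hold UINT_MAX sentinels that
// sink to the end and never displace a real entry.
__global__ void bitonic_sort_single_block(unsigned int* keys, unsigned int n) {

    __shared__ unsigned int smem[kBlockSize];
    const unsigned int tid = threadIdx.x;

    // Load real entries; everything else becomes a sentinel.
    smem[tid] = (tid < n) ? keys[tid] : 0xFFFFFFFFu;
    __syncthreads();

    // Round n up to the next power of two: the network needs one.
    const unsigned int N = 1u << (32 - __clz(n - 1));

    for (unsigned int k = 2; k <= N; k <<= 1) {          // subsequence size
        const bool ascending = ((tid & k) == 0);         // per-thread direction
        for (unsigned int j = k >> 1; j > 0; j >>= 1) {  // compare distance
            const unsigned int ixj = tid ^ j;            // partner index
            if (ixj > tid && ixj < N && tid < N) {
                const unsigned int a = smem[tid];
                const unsigned int b = smem[ixj];
                // Exchange when the pair violates this thread's direction.
                if ((a > b) == ascending) {
                    smem[tid] = b;
                    smem[ixj] = a;
                }
            }
            __syncthreads();  // uniform loop bounds: every thread gets here
        }
    }

    if (tid < n) {
        keys[tid] = smem[tid];
    }
}

int main() {
    const unsigned int n = 300;  // deliberately not a power of two
    unsigned int host[n];
    for (unsigned int i = 0; i < n; ++i) {
        host[i] = (1103515245u * i + 12345u) % 10000u;  // pseudo-random keys
    }

    unsigned int* dev = nullptr;
    cudaMalloc(&dev, n * sizeof(unsigned int));
    cudaMemcpy(dev, host, n * sizeof(unsigned int), cudaMemcpyHostToDevice);

    bitonic_sort_single_block<<<1, kBlockSize>>>(dev, n);
    cudaMemcpy(host, dev, n * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaFree(dev);

    bool sorted = true;
    for (unsigned int i = 1; i < n; ++i) {
        sorted = sorted && (host[i - 1] <= host[i]);
    }
    std::printf("sorted: %s\n", sorted ? "yes" : "no");
    return 0;
}
```

The sentinel padding is what lets a power-of-two network sort an arbitrary n (up to the block size) without special-casing the tail: padded slots always compare as largest, so they stay behind every real entry in the ascending result.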
|