|
15 | 15 |
|
16 | 16 | namespace traccc::cuda::kernels { |
17 | 17 |
|
18 | | -__global__ void sort_updated_tracks( |
19 | | - device::sort_updated_tracks_payload payload) { |
| 18 | +__launch_bounds__(512) __global__ |
| 19 | + void sort_updated_tracks(device::sort_updated_tracks_payload payload) { |
20 | 20 |
|
21 | 21 | if (*(payload.terminate) == 1 || *(payload.n_updated_tracks) == 0) { |
22 | 22 | return; |
23 | 23 | } |
24 | 24 |
|
25 | | - extern __shared__ unsigned int shared_mem_tracks[]; |
| 25 | + __shared__ unsigned int shared_mem_tracks[512]; |
26 | 26 |
|
27 | 27 | vecmem::device_vector<const traccc::scalar> rel_shared( |
28 | 28 | payload.rel_shared_view); |
29 | 29 | vecmem::device_vector<const traccc::scalar> pvals(payload.pvals_view); |
30 | 30 | vecmem::device_vector<unsigned int> updated_tracks( |
31 | 31 | payload.updated_tracks_view); |
32 | 32 |
|
33 | | - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; |
34 | | - const unsigned int N = *(payload.n_updated_tracks); |
| 33 | + const unsigned int tid = threadIdx.x; |
35 | 34 |
|
36 | 35 | // Load to shared memory |
37 | | - if (tid < N) { |
| 36 | +    shared_mem_tracks[tid] = std::numeric_limits<unsigned int>::max();  // sentinel
| 37 | + |
| 38 | + if (tid < *(payload.n_updated_tracks)) { |
38 | 39 | shared_mem_tracks[tid] = updated_tracks[tid]; |
39 | 40 | } |
40 | 41 |
|
41 | 42 | __syncthreads(); |
42 | 43 |
|
43 | | - for (int iter = 0; iter < N; ++iter) { |
44 | | - bool is_even = (iter % 2 == 0); |
45 | | - int i = tid; |
| 44 | +    // Pad the number of tracks up to the next power of two, as required by bitonic sort
| 45 | + const unsigned int N = 1 << (32 - __clz(*(payload.n_updated_tracks) - 1)); |
| 46 | + |
| 47 | + traccc::scalar rel_i; |
| 48 | + traccc::scalar rel_j; |
| 49 | + traccc::scalar pval_i; |
| 50 | + traccc::scalar pval_j; |
| 51 | + |
| 52 | +    // Bitonic sort: ascending in rel_shared, ties broken by larger pval
| 53 | + for (int k = 2; k <= N; k <<= 1) { |
| 54 | + |
| 55 | +        bool ascending = ((tid & k) == 0);  // per-thread sort direction
46 | 56 |
|
47 | | - if (i < N / 2) { |
48 | | - int idx = 2 * i + (is_even ? 0 : 1); |
49 | | - if (idx + 1 < N) { |
50 | | - unsigned int a = shared_mem_tracks[idx]; |
51 | | - unsigned int b = shared_mem_tracks[idx + 1]; |
| 57 | + for (int j = k >> 1; j > 0; j >>= 1) { |
| 58 | +            int ixj = tid ^ j;  // compare-exchange partner index
52 | 59 |
|
53 | | - traccc::scalar rel_a = rel_shared[a]; |
54 | | - traccc::scalar rel_b = rel_shared[b]; |
55 | | - traccc::scalar pv_a = pvals[a]; |
56 | | - traccc::scalar pv_b = pvals[b]; |
| 60 | + if (ixj > tid && ixj < N && tid < N) { |
| 61 | + unsigned int trk_i = shared_mem_tracks[tid]; |
| 62 | + unsigned int trk_j = shared_mem_tracks[ixj]; |
57 | 63 |
|
58 | | - bool swap = false; |
59 | | - if (rel_a != rel_b) { |
60 | | - swap = rel_a > rel_b; |
| 64 | + if (trk_i == std::numeric_limits<unsigned int>::max()) { |
| 65 | + rel_i = std::numeric_limits<traccc::scalar>::max(); |
| 66 | + pval_i = 0.f; |
61 | 67 | } else { |
62 | | - swap = pv_a < pv_b; |
| 68 | + rel_i = rel_shared[trk_i]; |
| 69 | + pval_i = pvals[trk_i]; |
63 | 70 | } |
64 | 71 |
|
65 | | - if (swap) { |
66 | | - shared_mem_tracks[idx] = b; |
67 | | - shared_mem_tracks[idx + 1] = a; |
| 72 | + if (trk_j == std::numeric_limits<unsigned int>::max()) { |
| 73 | + rel_j = std::numeric_limits<traccc::scalar>::max(); |
| 74 | + pval_j = 0.f; |
| 75 | + } else { |
| 76 | + rel_j = rel_shared[trk_j]; |
| 77 | + pval_j = pvals[trk_j]; |
| 78 | + } |
| 79 | + |
| 80 | + bool should_swap = |
| 81 | + (rel_i > rel_j || (rel_i == rel_j && pval_i < pval_j)) == |
| 82 | + ascending; |
| 83 | + |
| 84 | + if (should_swap) { |
| 85 | + shared_mem_tracks[tid] = trk_j; |
| 86 | + shared_mem_tracks[ixj] = trk_i; |
68 | 87 | } |
69 | 88 | } |
| 89 | + __syncthreads(); |
70 | 90 | } |
71 | | - __syncthreads(); |
72 | 91 | } |
73 | 92 |
|
74 | | - if (tid < N) { |
| 93 | + if (tid < *(payload.n_updated_tracks)) { |
75 | 94 | updated_tracks[tid] = shared_mem_tracks[tid]; |
76 | 95 | } |
77 | 96 | } |
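
The new kernel is a single-block bitonic sort: each thread owns one shared-memory slot, slots beyond n_updated_tracks are filled with UINT_MAX sentinels so they sink to the end, the element count is rounded up to the next power of two with __clz, and the usual tid ^ j compare-exchange network runs over the padded range. Because the shared buffer is now a fixed 512-entry array indexed only by threadIdx.x, the kernel presumably runs as a single block of at most 512 threads with no dynamic shared-memory size at launch. In the kernel above the shared array holds track indices, so the sentinel is detected when fetching rel_shared and pvals; the sketch below shows the same pattern on a plain array of unsigned int keys, where the sentinel is the key itself. Every name in it (file name, kBlockSize, kernel, host driver) is an illustrative assumption, not traccc code.

```cuda
// bitonic_sketch.cu -- illustrative sketch only; every name here is an
// assumption for demonstration, none of it is traccc code.
// Build: nvcc -o bitonic_sketch bitonic_sketch.cu
#include <cstdio>

#include <cuda_runtime.h>

constexpr unsigned int kBlockSize = 512;  // mirrors __launch_bounds__(512)

// Sort the first n keys ascending with a single-block bitonic sort.
// Requires n <= kBlockSize; slots [n, N) hold UINT_MAX sentinels that
// sink to the end and never displace a real entry.
__global__ void bitonic_sort_single_block(unsigned int* keys, unsigned int n) {

    __shared__ unsigned int smem[kBlockSize];
    const unsigned int tid = threadIdx.x;

    // Load real entries; everything else becomes a sentinel.
    smem[tid] = (tid < n) ? keys[tid] : 0xFFFFFFFFu;
    __syncthreads();

    // Round n up to the next power of two: the network needs one.
    const unsigned int N = 1u << (32 - __clz(n - 1));

    for (unsigned int k = 2; k <= N; k <<= 1) {          // subsequence size
        const bool ascending = ((tid & k) == 0);         // per-thread direction
        for (unsigned int j = k >> 1; j > 0; j >>= 1) {  // compare distance
            const unsigned int ixj = tid ^ j;            // partner index
            if (ixj > tid && ixj < N && tid < N) {
                const unsigned int a = smem[tid];
                const unsigned int b = smem[ixj];
                // Exchange when the pair violates this thread's direction.
                if ((a > b) == ascending) {
                    smem[tid] = b;
                    smem[ixj] = a;
                }
            }
            __syncthreads();  // uniform loop bounds: every thread gets here
        }
    }

    if (tid < n) {
        keys[tid] = smem[tid];
    }
}

int main() {
    const unsigned int n = 300;  // deliberately not a power of two
    unsigned int host[n];
    for (unsigned int i = 0; i < n; ++i) {
        host[i] = (1103515245u * i + 12345u) % 10000u;  // pseudo-random keys
    }

    unsigned int* dev = nullptr;
    cudaMalloc(&dev, n * sizeof(unsigned int));
    cudaMemcpy(dev, host, n * sizeof(unsigned int), cudaMemcpyHostToDevice);

    bitonic_sort_single_block<<<1, kBlockSize>>>(dev, n);
    cudaMemcpy(host, dev, n * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaFree(dev);

    bool sorted = true;
    for (unsigned int i = 1; i < n; ++i) {
        sorted = sorted && (host[i - 1] <= host[i]);
    }
    std::printf("sorted: %s\n", sorted ? "yes" : "no");
    return 0;
}
```

The sentinel padding is what lets a power-of-two network sort an arbitrary n (up to the block size) without special-casing the tail: padded slots always compare as largest, so they stay behind every real entry in the ascending result.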
|