Skip to content

Commit 00cef6f

Browse files
committed
Merge find_max_shared and gather_tracks kernel
1 parent d59aba4 commit 00cef6f

File tree

9 files changed

+106
-186
lines changed

9 files changed

+106
-186
lines changed

device/common/include/traccc/ambiguity_resolution/device/find_max_shared.hpp

Lines changed: 0 additions & 45 deletions
This file was deleted.

device/common/include/traccc/ambiguity_resolution/device/gather_tracks.hpp renamed to device/common/include/traccc/ambiguity_resolution/device/update_status.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
namespace traccc::device {
1414

1515
/// (Event Data) Payload for the @c
16-
/// traccc::device::gather_tracks function
17-
struct gather_tracks_payload {
16+
/// traccc::device::update_status function
17+
struct update_status_payload {
1818

1919
/**
2020
* @brief Whether to terminate the calculation
@@ -50,6 +50,16 @@ struct gather_tracks_payload {
5050
* @brief View object to the whether track id is updated
5151
*/
5252
vecmem::data::vector_view<int> is_updated_view;
53+
54+
/**
55+
* @brief View object to the vector of number of shared measurements
56+
*/
57+
vecmem::data::vector_view<const unsigned int> n_shared_view;
58+
59+
/**
60+
* @brief The number of max shared
61+
*/
62+
unsigned int* max_shared;
5363
};
5464

5565
} // namespace traccc::device

device/cuda/CMakeLists.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,6 @@ traccc_add_library( traccc_cuda cuda TYPE SHARED
9191
"src/ambiguity_resolution/kernels/block_inclusive_scan.cuh"
9292
"src/ambiguity_resolution/kernels/count_shared_measurements.cu"
9393
"src/ambiguity_resolution/kernels/count_shared_measurements.cuh"
94-
"src/ambiguity_resolution/kernels/find_max_shared.cu"
95-
"src/ambiguity_resolution/kernels/find_max_shared.cuh"
9694
"src/ambiguity_resolution/kernels/fill_inverted_ids.cu"
9795
"src/ambiguity_resolution/kernels/fill_inverted_ids.cuh"
9896
"src/ambiguity_resolution/kernels/fill_tracks_per_measurement.cu"
@@ -103,8 +101,8 @@ traccc_add_library( traccc_cuda cuda TYPE SHARED
103101
"src/ambiguity_resolution/kernels/fill_track_candidates.cuh"
104102
"src/ambiguity_resolution/kernels/fill_vectors.cu"
105103
"src/ambiguity_resolution/kernels/fill_vectors.cuh"
106-
"src/ambiguity_resolution/kernels/gather_tracks.cu"
107-
"src/ambiguity_resolution/kernels/gather_tracks.cuh"
104+
"src/ambiguity_resolution/kernels/update_status.cu"
105+
"src/ambiguity_resolution/kernels/update_status.cuh"
108106
"src/ambiguity_resolution/kernels/rearrange_tracks.cu"
109107
"src/ambiguity_resolution/kernels/rearrange_tracks.cuh"
110108
"src/ambiguity_resolution/kernels/scan_block_offsets.cu"

device/cuda/src/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.cu

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@
1616
#include "./kernels/fill_tracks_per_measurement.cuh"
1717
#include "./kernels/fill_unique_meas_id_map.cuh"
1818
#include "./kernels/fill_vectors.cuh"
19-
#include "./kernels/find_max_shared.cuh"
20-
#include "./kernels/gather_tracks.cuh"
2119
#include "./kernels/rearrange_tracks.cuh"
2220
#include "./kernels/remove_tracks.cuh"
2321
#include "./kernels/scan_block_offsets.cuh"
2422
#include "./kernels/sort_tracks_per_measurement.cuh"
2523
#include "./kernels/sort_updated_tracks.cuh"
24+
#include "./kernels/update_status.cuh"
2625
#include "traccc/cuda/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.hpp"
2726
#include "traccc/definitions/math.hpp"
2827

@@ -406,20 +405,21 @@ greedy_ambiguity_resolution_algorithm::operator()(
406405
int terminate = 0;
407406
vecmem::unique_alloc_ptr<int> terminate_device =
408407
vecmem::make_unique_alloc<int>(m_mr.main);
408+
auto max_shared = thrust::max_element(thrust::device, n_shared_buffer.ptr(),
409+
n_shared_buffer.ptr() + n_tracks);
409410
vecmem::unique_alloc_ptr<unsigned int> max_shared_device =
410411
vecmem::make_unique_alloc<unsigned int>(m_mr.main);
412+
cudaMemcpyAsync(max_shared_device.get(), max_shared, sizeof(unsigned int),
413+
cudaMemcpyHostToDevice, stream);
414+
411415
vecmem::unique_alloc_ptr<unsigned int> n_updated_tracks_device =
412416
vecmem::make_unique_alloc<unsigned int>(m_mr.main);
413417

414418
// Thread block size
415-
unsigned int nThreads_adaptive = m_warp_size * 4;
419+
unsigned int nThreads_adaptive = m_warp_size;
416420
unsigned int nBlocks_adaptive =
417421
(n_accepted + nThreads_adaptive - 1) / nThreads_adaptive;
418422

419-
unsigned int nThreads_warp = m_warp_size;
420-
unsigned int nBlocks_warp =
421-
(n_accepted + nThreads_warp - 1) / nThreads_warp;
422-
423423
unsigned int nThreads_rearrange = 1024;
424424
unsigned int nBlocks_rearrange =
425425
(n_accepted + (nThreads_rearrange / kernels::nThreads_per_track) - 1) /
@@ -462,7 +462,6 @@ greedy_ambiguity_resolution_algorithm::operator()(
462462
while (!terminate && n_accepted > 0) {
463463
nBlocks_adaptive =
464464
(n_accepted + nThreads_adaptive - 1) / nThreads_adaptive;
465-
nBlocks_warp = (n_accepted + nThreads_warp - 1) / nThreads_warp;
466465

467466
scan_dim = compute_scan_config(n_accepted);
468467
nThreads_scan = scan_dim.first;
@@ -478,14 +477,6 @@ greedy_ambiguity_resolution_algorithm::operator()(
478477

479478
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
480479

481-
kernels::find_max_shared<<<nBlocks_warp, nThreads_warp, 0, stream>>>(
482-
device::find_max_shared_payload{
483-
.sorted_ids_view = sorted_ids_buffer,
484-
.n_accepted = n_accepted_device.get(),
485-
.n_shared_view = n_shared_buffer,
486-
.terminate = terminate_device.get(),
487-
.max_shared = max_shared_device.get()});
488-
489480
kernels::remove_tracks<<<1, 512, 0, stream>>>(
490481
device::remove_tracks_payload{
491482
.sorted_ids_view = sorted_ids_buffer,
@@ -589,15 +580,17 @@ greedy_ambiguity_resolution_algorithm::operator()(
589580
});
590581

591582
kernels::
592-
gather_tracks<<<nBlocks_adaptive, nThreads_adaptive, 0, stream>>>(
593-
device::gather_tracks_payload{
583+
update_status<<<nBlocks_adaptive, nThreads_adaptive, 0, stream>>>(
584+
device::update_status_payload{
594585
.terminate = terminate_device.get(),
595586
.n_accepted = n_accepted_device.get(),
596587
.n_updated_tracks = n_updated_tracks_device.get(),
597588
.temp_sorted_ids_view = temp_sorted_ids_buffer,
598589
.sorted_ids_view = sorted_ids_buffer,
599590
.updated_tracks_view = updated_tracks_buffer,
600-
.is_updated_view = is_updated_buffer});
591+
.is_updated_view = is_updated_buffer,
592+
.n_shared_view = n_shared_buffer,
593+
.max_shared = max_shared_device.get()});
601594

602595
cudaStreamEndCapture(stream, &graph);
603596
cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0);

device/cuda/src/ambiguity_resolution/kernels/find_max_shared.cu

Lines changed: 0 additions & 53 deletions
This file was deleted.

device/cuda/src/ambiguity_resolution/kernels/find_max_shared.cuh

Lines changed: 0 additions & 16 deletions
This file was deleted.

device/cuda/src/ambiguity_resolution/kernels/gather_tracks.cu

Lines changed: 0 additions & 45 deletions
This file was deleted.
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/** TRACCC library, part of the ACTS project (R&D line)
2+
*
3+
* (c) 2025 CERN for the benefit of the ACTS project
4+
*
5+
* Mozilla Public License Version 2.0
6+
*/
7+
8+
// Local include(s).
9+
#include "../../utils/global_index.hpp"
10+
#include "update_status.cuh"
11+
12+
// VecMem include(s).
13+
#include <vecmem/containers/device_vector.hpp>
14+
15+
namespace traccc::cuda::kernels {
16+
17+
__global__ void update_status(device::update_status_payload payload) {
18+
19+
if (*(payload.terminate) == 1) {
20+
return;
21+
}
22+
23+
vecmem::device_vector<const unsigned int> temp_sorted_ids(
24+
payload.temp_sorted_ids_view);
25+
vecmem::device_vector<unsigned int> sorted_ids(payload.sorted_ids_view);
26+
vecmem::device_vector<unsigned int> updated_tracks(
27+
payload.updated_tracks_view);
28+
vecmem::device_vector<int> is_updated(payload.is_updated_view);
29+
vecmem::device_vector<const unsigned int> n_shared(payload.n_shared_view);
30+
31+
auto globalIndex = threadIdx.x + blockIdx.x * blockDim.x;
32+
const unsigned int n_accepted = *(payload.n_accepted);
33+
const unsigned int n_updated = *(payload.n_updated_tracks);
34+
35+
/***********************
36+
* Update Max Shared
37+
***********************/
38+
39+
unsigned int shared = 0;
40+
41+
if (globalIndex < n_accepted) {
42+
auto tid = sorted_ids[globalIndex];
43+
shared = n_shared[tid];
44+
}
45+
46+
for (int offset = 16; offset > 0; offset >>= 1) {
47+
unsigned int other_shared =
48+
__shfl_down_sync(0xffffffff, shared, offset);
49+
50+
if (other_shared > shared) {
51+
shared = other_shared;
52+
}
53+
}
54+
55+
if (threadIdx.x == 0) {
56+
atomicMax(payload.max_shared, shared);
57+
}
58+
59+
/***********************
60+
* Update Sorted Ids
61+
***********************/
62+
63+
if (n_updated == 0) {
64+
return;
65+
}
66+
67+
// Reset is_updated vector
68+
if (globalIndex < n_updated) {
69+
is_updated[updated_tracks[globalIndex]] = 0;
70+
}
71+
72+
if (globalIndex < n_accepted) {
73+
auto tid = temp_sorted_ids[globalIndex];
74+
sorted_ids[globalIndex] = tid;
75+
}
76+
}
77+
78+
} // namespace traccc::cuda::kernels

device/cuda/src/ambiguity_resolution/kernels/gather_tracks.cuh renamed to device/cuda/src/ambiguity_resolution/kernels/update_status.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
#pragma once
99

1010
// Project include(s).
11-
#include "traccc/ambiguity_resolution/device/gather_tracks.hpp"
11+
#include "traccc/ambiguity_resolution/device/update_status.hpp"
1212

1313
namespace traccc::cuda::kernels {
1414

15-
__global__ void gather_tracks(device::gather_tracks_payload payload);
15+
__global__ void update_status(device::update_status_payload payload);
1616
}

0 commit comments

Comments
 (0)