Skip to content

Commit 891c9bb

Browse files
CrossRstephenswat
andauthored
Alpaka: Parallel Algorithm Unification (#1074)
* Add thrust/oneDPL abstraction, for Alpaka. Hides the selection of a execution policy, as well as the specific algorithm in use. * Update algorithms to use new wrapper. * Fixes for SYCL + CPU backend. The memory resource isn't used for SYCL, and neither are used for CPU. * Fix template deduction fail with older HIP compiler. * Address PR comments. * Convert missed thrust call. --------- Co-authored-by: Stephen Nicholas Swatman <[email protected]>
1 parent a887f7f commit 891c9bb

File tree

6 files changed

+166
-63
lines changed

6 files changed

+166
-63
lines changed

device/alpaka/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ traccc_add_alpaka_library( traccc_alpaka alpaka TYPE SHARED
3535
"src/utils/get_queue.cpp"
3636
"include/traccc/alpaka/utils/vecmem_objects.hpp"
3737
"src/utils/vecmem_objects.cpp"
38+
"src/utils/oneDPL.hpp"
39+
"src/utils/parallel_algorithms.hpp"
3840
# Clusterization
3941
"include/traccc/alpaka/clusterization/clusterization_algorithm.hpp"
4042
"src/clusterization/clusterization_algorithm.cpp"

device/alpaka/src/clusterization/measurement_sorting_algorithm.cpp

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,7 @@
99
#include "traccc/alpaka/clusterization/measurement_sorting_algorithm.hpp"
1010

1111
#include "../utils/get_queue.hpp"
12-
13-
// Thrust include(s).
14-
#include <thrust/execution_policy.h>
15-
#include <thrust/sort.h>
12+
#include "../utils/parallel_algorithms.hpp"
1613

1714
// System include(s).
1815
#include <memory_resource>
@@ -33,29 +30,14 @@ measurement_sorting_algorithm::operator()(
3330

3431
// Get the number of measurements. This is necessary because the input
3532
// container may not be fixed sized. And we can't give invalid pointers /
36-
// iterators to Thrust.
33+
// iterators to Thrust / oneDPL.
3734
const measurement_collection_types::view::size_type n_measurements =
3835
m_copy.get().get_size(measurements_view);
3936

4037
// Sort the measurements in place
41-
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
42-
auto stream = ::alpaka::getNativeHandle(queue);
43-
auto execPolicy =
44-
thrust::cuda::par_nosync(std::pmr::polymorphic_allocator(&(m_mr.main)))
45-
.on(stream);
46-
#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
47-
auto stream = ::alpaka::getNativeHandle(queue);
48-
auto execPolicy =
49-
thrust::hip_rocprim::par_nosync(
50-
std::pmr::polymorphic_allocator<std::byte>(&(m_mr.main)))
51-
.on(stream);
52-
#else
53-
auto execPolicy = thrust::host;
54-
#endif
55-
56-
thrust::sort(execPolicy, measurements_view.ptr(),
57-
measurements_view.ptr() + n_measurements,
58-
measurement_sort_comp());
38+
details::sort(queue, m_mr, measurements_view.ptr(),
39+
measurements_view.ptr() + n_measurements,
40+
measurement_sort_comp());
5941

6042
// Return the view of the sorted measurements.
6143
return measurements_view;

device/alpaka/src/finding/combinatorial_kalman_filter.hpp

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// Local include(s).
1111
#include "../utils/barrier.hpp"
12+
#include "../utils/parallel_algorithms.hpp"
1213
#include "../utils/thread_id.hpp"
1314
#include "../utils/utils.hpp"
1415

@@ -36,14 +37,6 @@
3637
// VecMem include(s).
3738
#include <vecmem/utils/copy.hpp>
3839

39-
// Thrust include(s).
40-
#include <thrust/copy.h>
41-
#include <thrust/execution_policy.h>
42-
#include <thrust/fill.h>
43-
#include <thrust/scan.h>
44-
#include <thrust/sort.h>
45-
#include <thrust/unique.h>
46-
4740
namespace traccc::alpaka::details {
4841
namespace kernels {
4942

@@ -219,13 +212,6 @@ combinatorial_kalman_filter(
219212
// Create a logger.
220213
auto logger = [&log]() -> const Logger& { return log; };
221214

222-
/// Thrust policy to use.
223-
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
224-
auto thrustExecPolicy = thrust::device;
225-
#else
226-
auto thrustExecPolicy = thrust::host;
227-
#endif
228-
229215
// Number of threads per block to use.
230216
const Idx threadsPerBlock = getWarpSize<Acc>() * 2;
231217

@@ -243,9 +229,9 @@ combinatorial_kalman_filter(
243229
measurement_collection_types::device uniques(uniques_buffer);
244230

245231
measurement_collection_types::device::iterator uniques_end =
246-
thrust::unique_copy(thrustExecPolicy, measurements.ptr(),
247-
measurements.ptr() + n_measurements,
248-
uniques.begin(), measurement_equal_comp());
232+
details::unique_copy(queue, mr, measurements.ptr(),
233+
measurements.ptr() + n_measurements,
234+
uniques.begin(), measurement_equal_comp());
249235
const unsigned int n_modules =
250236
static_cast<unsigned int>(uniques_end - uniques.begin());
251237

@@ -255,10 +241,10 @@ combinatorial_kalman_filter(
255241
copy.setup(upper_bounds_buffer)->wait();
256242
vecmem::device_vector<unsigned int> upper_bounds(upper_bounds_buffer);
257243

258-
thrust::upper_bound(thrustExecPolicy, measurements.ptr(),
259-
measurements.ptr() + n_measurements, uniques.begin(),
260-
uniques.begin() + n_modules, upper_bounds.begin(),
261-
measurement_sort_comp());
244+
details::upper_bound(queue, mr, measurements.ptr(),
245+
measurements.ptr() + n_measurements, uniques.begin(),
246+
uniques.begin() + n_modules, upper_bounds.begin(),
247+
measurement_sort_comp());
262248

263249
/*****************************************************************
264250
* Kernel1: Create barcode sequence
@@ -479,8 +465,8 @@ combinatorial_kalman_filter(
479465
link_last_measurement_buffer);
480466
vecmem::device_vector<unsigned int> param_ids_device(
481467
param_ids_buffer);
482-
thrust::sort_by_key(thrustExecPolicy, keys_device.begin(),
483-
keys_device.end(), param_ids_device.begin());
468+
details::sort_by_key(queue, mr, keys_device.begin(),
469+
keys_device.end(), param_ids_device.begin());
484470

485471
/*
486472
* Then, we run the actual duplicate removal kernel.
@@ -542,9 +528,9 @@ combinatorial_kalman_filter(
542528
keys_buffer);
543529
vecmem::device_vector<unsigned int> param_ids_device(
544530
param_ids_buffer);
545-
thrust::sort_by_key(thrustExecPolicy, keys_device.begin(),
546-
keys_device.end(),
547-
param_ids_device.begin());
531+
details::sort_by_key(queue, mr, keys_device.begin(),
532+
keys_device.end(),
533+
param_ids_device.begin());
548534
}
549535

550536
/*****************************************************************

device/alpaka/src/fitting/kalman_fitting.hpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
// Local include(s).
11+
#include "../utils/parallel_algorithms.hpp"
1112
#include "../utils/utils.hpp"
1213

1314
// Project include(s).
@@ -26,10 +27,6 @@
2627
// VecMem include(s).
2728
#include <vecmem/utils/copy.hpp>
2829

29-
// Thrust include(s).
30-
#include <thrust/execution_policy.h>
31-
#include <thrust/sort.h>
32-
3330
namespace traccc::alpaka::details {
3431
namespace kernels {
3532

@@ -122,13 +119,6 @@ track_state_container_types::buffer kalman_fitting(
122119
const fitting_config& config, const memory_resource& mr, vecmem::copy& copy,
123120
Queue& queue) {
124121

125-
/// Thrust policy to use.
126-
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
127-
auto thrustExecPolicy = thrust::device;
128-
#else
129-
auto thrustExecPolicy = thrust::host;
130-
#endif
131-
132122
// Number of threads per block to use.
133123
const Idx threadsPerBlock = getWarpSize<Acc>() * 2;
134124

@@ -199,8 +189,8 @@ track_state_container_types::buffer kalman_fitting(
199189
// Sort the key to get the sorted parameter ids
200190
vecmem::device_vector<device::sort_key> keys_device(keys_buffer);
201191
vecmem::device_vector<unsigned int> param_ids_device(param_ids_buffer);
202-
thrust::sort_by_key(thrustExecPolicy, keys_device.begin(),
203-
keys_device.end(), param_ids_device.begin());
192+
details::sort_by_key(queue, mr, keys_device.begin(), keys_device.end(),
193+
param_ids_device.begin());
204194

205195
// Run the fitting, using the sorted parameter IDs.
206196
track_state_container_types::view track_states_view = track_states_buffer;

device/alpaka/src/utils/oneDPL.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/** TRACCC library, part of the ACTS project (R&D line)
2+
*
3+
* (c) 2025 CERN for the benefit of the ACTS project
4+
*
5+
* Mozilla Public License Version 2.0
6+
*/
7+
8+
#pragma once
9+
10+
// Mark this as a "system header". To suppress all warnings from oneDPL.
11+
// This is needed because at the time of writing we cannot provide oneDPL with
12+
// "-isystem" to the oneAPI compiler.
13+
#pragma clang system_header
14+
15+
// oneDPL include(s).
16+
#include <oneapi/dpl/algorithm>
17+
#include <oneapi/dpl/execution>
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2025 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#pragma once
10+
11+
// Local include(s).
12+
#include "utils.hpp"
13+
14+
// Project include(s).
15+
#include "traccc/utils/memory_resource.hpp"
16+
17+
// Thrust include(s).
18+
#if !defined(ALPAKA_ACC_SYCL_ENABLED)
19+
#include <thrust/binary_search.h>
20+
#include <thrust/copy.h>
21+
#include <thrust/execution_policy.h>
22+
#include <thrust/fill.h>
23+
#include <thrust/scan.h>
24+
#include <thrust/sort.h>
25+
#include <thrust/unique.h>
26+
#endif
27+
28+
// OneDPL include.
29+
//
30+
// This is left to a separate file to turn off warnings from oneDPL.
31+
#if defined(ALPAKA_ACC_SYCL_ENABLED)
32+
#include "oneDPL.hpp"
33+
#endif
34+
35+
namespace traccc::alpaka::details {
36+
37+
inline auto getExecutionPolicy([[maybe_unused]] Queue &q,
38+
[[maybe_unused]] const memory_resource &mr) {
39+
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
40+
auto stream = ::alpaka::getNativeHandle(q);
41+
return thrust::cuda::par_nosync(std::pmr::polymorphic_allocator(&(mr.main)))
42+
.on(stream);
43+
#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
44+
auto stream = ::alpaka::getNativeHandle(q);
45+
return thrust::hip_rocprim::par_nosync(
46+
std::pmr::polymorphic_allocator<std::byte>(&(mr.main)))
47+
.on(stream);
48+
#elif defined(ALPAKA_ACC_SYCL_ENABLED)
49+
auto queue = ::alpaka::getNativeHandle(q);
50+
return oneapi::dpl::execution::device_policy{queue};
51+
#else
52+
return thrust::host;
53+
#endif
54+
}
55+
56+
template <typename RandomAccessIterator, typename Compare>
57+
void sort(Queue &q, const memory_resource mr, RandomAccessIterator first,
58+
RandomAccessIterator last, Compare comp) {
59+
auto execPolicy = getExecutionPolicy(q, mr);
60+
61+
#if defined(ALPAKA_ACC_SYCL_ENABLED)
62+
oneapi::dpl::sort(execPolicy, first, last, comp);
63+
#else
64+
thrust::sort(execPolicy, first, last, comp);
65+
#endif
66+
}
67+
68+
template <typename RandomAccessIterator1, typename RandomAccessIterator2,
69+
typename Compare>
70+
void sort_by_key(Queue &q, const memory_resource &mr,
71+
RandomAccessIterator1 keys_first,
72+
RandomAccessIterator1 keys_last,
73+
RandomAccessIterator2 values_first, Compare comp) {
74+
auto execPolicy = getExecutionPolicy(q, mr);
75+
76+
#if defined(ALPAKA_ACC_SYCL_ENABLED)
77+
oneapi::dpl::sort_by_key(execPolicy, keys_first, keys_last, values_first,
78+
comp);
79+
#else
80+
thrust::sort_by_key(execPolicy, keys_first, keys_last, values_first, comp);
81+
#endif
82+
}
83+
84+
template <typename RandomAccessIterator1, typename RandomAccessIterator2>
85+
void sort_by_key(Queue &q, const memory_resource &mr,
86+
RandomAccessIterator1 keys_first,
87+
RandomAccessIterator1 keys_last,
88+
RandomAccessIterator2 values_first) {
89+
auto execPolicy = getExecutionPolicy(q, mr);
90+
91+
#if defined(ALPAKA_ACC_SYCL_ENABLED)
92+
oneapi::dpl::sort_by_key(execPolicy, keys_first, keys_last, values_first);
93+
#else
94+
thrust::sort_by_key(execPolicy, keys_first, keys_last, values_first);
95+
#endif
96+
}
97+
98+
template <typename ForwardIt1, typename ForwardIt2, typename OutputIt,
99+
typename Compare>
100+
void upper_bound(Queue &q, const memory_resource &mr, ForwardIt1 first1,
101+
ForwardIt1 last1, ForwardIt2 first2, ForwardIt2 last2,
102+
OutputIt d_first, Compare comp) {
103+
104+
auto execPolicy = getExecutionPolicy(q, mr);
105+
#if defined(ALPAKA_ACC_SYCL_ENABLED)
106+
oneapi::dpl::upper_bound(execPolicy, first1, last1, first2, last2, d_first,
107+
comp);
108+
#else
109+
thrust::upper_bound(execPolicy, first1, last1, first2, last2, d_first,
110+
comp);
111+
#endif
112+
}
113+
114+
template <typename InputIt, typename OutputIt, typename Compare>
115+
OutputIt unique_copy(Queue &q, const memory_resource &mr, InputIt first,
116+
InputIt last, OutputIt d_first, Compare comp) {
117+
auto execPolicy = getExecutionPolicy(q, mr);
118+
119+
#if defined(ALPAKA_ACC_SYCL_ENABLED)
120+
return oneapi::dpl::unique_copy(execPolicy, first, last, d_first, comp);
121+
#else
122+
return thrust::unique_copy(execPolicy, first, last, d_first, comp);
123+
#endif
124+
}
125+
126+
} // namespace traccc::alpaka::details

0 commit comments

Comments
 (0)