Skip to content

Commit e746fdf

Browse files
committed
Fix a race condition in splitVertices
Add alpaka::syncBlockThreads(acc); at the end of the loop on the vertices to ensure that all threads are properly synchronised before resetting the shared memory. Clean up the kernel to use the SoA accessors and the cms::alpakatools utilities.
1 parent f6e0fc4 commit e746fdf

File tree

1 file changed

+33
-48
lines changed

1 file changed

+33
-48
lines changed

RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
1919
using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
2020
template <typename TAcc>
2121
ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc,
22-
VtxSoAView& pdata,
23-
WsSoAView& pws,
22+
VtxSoAView& data,
23+
WsSoAView& ws,
2424
float maxChi2) {
2525
constexpr bool verbose = false; // in principle the compiler should optmize out if false
26-
const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
27-
28-
auto& __restrict__ data = pdata;
29-
auto& __restrict__ ws = pws;
30-
auto nt = ws.ntrks();
31-
float const* __restrict__ zt = ws.zt();
32-
float const* __restrict__ ezt2 = ws.ezt2();
33-
float* __restrict__ zv = data.zv();
34-
float* __restrict__ wv = data.wv();
35-
float const* __restrict__ chi2 = data.chi2();
36-
uint32_t& nvFinal = data.nvFinal();
37-
38-
int32_t const* __restrict__ nn = data.ndof();
39-
int32_t* __restrict__ iv = ws.iv();
40-
41-
ALPAKA_ASSERT_ACC(zt);
42-
ALPAKA_ASSERT_ACC(wv);
43-
ALPAKA_ASSERT_ACC(chi2);
44-
ALPAKA_ASSERT_ACC(nn);
45-
4626
constexpr uint32_t MAXTK = 512;
4727

4828
auto& it = alpaka::declareSharedVar<uint32_t[MAXTK], __COUNTER__>(acc); // track index
@@ -51,32 +31,33 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
5131
auto& ww = alpaka::declareSharedVar<float[MAXTK], __COUNTER__>(acc); // z weight
5232
auto& nq = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc); // number of track for this vertex
5333

54-
const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
55-
const uint32_t gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
56-
5734
// one vertex per block
58-
for (auto kv = blockIdx; kv < nvFinal; kv += gridDimension) {
59-
if (nn[kv] < 4)
35+
for (auto kv : cms::alpakatools::independent_groups(acc, data.nvFinal())) {
36+
int32_t ndof = data[kv].ndof();
37+
if (ndof < 4)
6038
continue;
61-
if (chi2[kv] < maxChi2 * float(nn[kv]))
39+
if (data[kv].chi2() < maxChi2 * float(ndof))
6240
continue;
6341

64-
ALPAKA_ASSERT_ACC(nn[kv] < int32_t(MAXTK));
42+
ALPAKA_ASSERT_ACC(ndof < int32_t(MAXTK));
6543

66-
if ((uint32_t)nn[kv] >= MAXTK)
44+
if ((uint32_t)ndof >= MAXTK)
6745
continue; // too bad FIXME
6846

69-
nq = 0u;
47+
if (cms::alpakatools::once_per_block(acc)) {
48+
// reset the number of tracks for the current vertex
49+
nq = 0u;
50+
}
7051
alpaka::syncBlockThreads(acc);
7152

72-
// copy to local
73-
for (auto k : cms::alpakatools::independent_group_elements(acc, nt)) {
74-
if (iv[k] == int(kv)) {
75-
auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{});
76-
zz[old] = zt[k] - zv[kv];
77-
newV[old] = zz[old] < 0 ? 0 : 1;
78-
ww[old] = 1.f / ezt2[k];
79-
it[old] = k;
53+
// cache the data of the tracks associated to the current vertex into shared memory
54+
for (auto k : cms::alpakatools::independent_group_elements(acc, ws.ntrks())) {
55+
if (ws[k].iv() == int(kv)) {
56+
auto index = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{});
57+
it[index] = k;
58+
zz[index] = ws[k].zt() - data[kv].zv();
59+
newV[index] = zz[index] < 0 ? 0 : 1;
60+
ww[index] = 1.f / ws[k].ezt2();
8061
}
8162
}
8263

@@ -85,14 +66,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
8566
auto& wnew = alpaka::declareSharedVar<float[2], __COUNTER__>(acc);
8667
alpaka::syncBlockThreads(acc);
8768

88-
ALPAKA_ASSERT_ACC(int(nq) == nn[kv] + 1);
69+
ALPAKA_ASSERT_ACC(int(nq) == ndof + 1);
8970

9071
int maxiter = 20;
9172
// kt-min....
9273
bool more = true;
9374
while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
9475
more = false;
95-
if (0 == threadIdxLocal) {
76+
if (cms::alpakatools::once_per_block(acc)) {
9677
znew[0] = 0;
9778
znew[1] = 0;
9879
wnew[0] = 0;
@@ -107,7 +88,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
10788
}
10889
alpaka::syncBlockThreads(acc);
10990

110-
if (0 == threadIdxLocal) {
91+
if (cms::alpakatools::once_per_block(acc)) {
11192
znew[0] /= wnew[0];
11293
znew[1] /= wnew[1];
11394
}
@@ -134,30 +115,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
134115

135116
auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
136117

137-
if (verbose && 0 == threadIdxLocal)
138-
printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
118+
if constexpr (verbose) {
119+
if (cms::alpakatools::once_per_block(acc))
120+
printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * data[kv].wv());
121+
}
139122

140123
if (chi2Dist < 4)
141124
continue;
142125

143126
// get a new global vertex
144127
auto& igv = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
145-
if (0 == threadIdxLocal)
128+
if (cms::alpakatools::once_per_block(acc))
146129
igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{});
147130
alpaka::syncBlockThreads(acc);
148131
for (auto k : cms::alpakatools::uniform_elements(acc, nq)) {
149132
if (1 == newV[k])
150-
iv[it[k]] = igv;
133+
ws[it[k]].iv() = igv;
151134
}
152135

136+
// synchronise the threads before starting the next iteration of the loop over the vertices and resetting the shared memory
137+
alpaka::syncBlockThreads(acc);
153138
} // loop on vertices
154139
}
155140

156141
class SplitVerticesKernel {
157142
public:
158143
template <typename TAcc>
159-
ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const {
160-
splitVertices(acc, pdata, pws, maxChi2);
144+
ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView data, WsSoAView ws, float maxChi2) const {
145+
splitVertices(acc, data, ws, maxChi2);
161146
}
162147
};
163148

0 commit comments

Comments
 (0)