@@ -19,30 +19,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
   using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
   template <typename TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc,
-                                                                                   VtxSoAView& pdata,
-                                                                                   WsSoAView& pws,
+                                                                                   VtxSoAView& data,
+                                                                                   WsSoAView& ws,
                                                                                    float maxChi2) {
     constexpr bool verbose = false;  // in principle the compiler should optimize this out when false
-    const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-
-    auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = pws;
-    auto nt = ws.ntrks();
-    float const* __restrict__ zt = ws.zt();
-    float const* __restrict__ ezt2 = ws.ezt2();
-    float* __restrict__ zv = data.zv();
-    float* __restrict__ wv = data.wv();
-    float const* __restrict__ chi2 = data.chi2();
-    uint32_t& nvFinal = data.nvFinal();
-
-    int32_t const* __restrict__ nn = data.ndof();
-    int32_t* __restrict__ iv = ws.iv();
-
-    ALPAKA_ASSERT_ACC(zt);
-    ALPAKA_ASSERT_ACC(wv);
-    ALPAKA_ASSERT_ACC(chi2);
-    ALPAKA_ASSERT_ACC(nn);
-
     constexpr uint32_t MAXTK = 512;
 
     auto& it = alpaka::declareSharedVar<uint32_t[MAXTK], __COUNTER__>(acc);  // track index
@@ -51,32 +31,33 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
     auto& ww = alpaka::declareSharedVar<float[MAXTK], __COUNTER__>(acc);  // z weight
     auto& nq = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);  // number of tracks for this vertex
 
-    const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
-    const uint32_t gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
-
     // one vertex per block
-    for (auto kv = blockIdx; kv < nvFinal; kv += gridDimension) {
-      if (nn[kv] < 4)
+    for (auto kv : cms::alpakatools::independent_groups(acc, data.nvFinal())) {
+      int32_t ndof = data[kv].ndof();
+      if (ndof < 4)
         continue;
-      if (chi2[kv] < maxChi2 * float(nn[kv]))
+      if (data[kv].chi2() < maxChi2 * float(ndof))
         continue;
 
-      ALPAKA_ASSERT_ACC(nn[kv] < int32_t(MAXTK));
+      ALPAKA_ASSERT_ACC(ndof < int32_t(MAXTK));
 
-      if ((uint32_t)nn[kv] >= MAXTK)
+      if ((uint32_t)ndof >= MAXTK)
         continue;  // too bad FIXME
 
-      nq = 0u;
+      if (cms::alpakatools::once_per_block(acc)) {
+        // reset the number of tracks for the current vertex
+        nq = 0u;
+      }
       alpaka::syncBlockThreads(acc);
 
-      // copy to local
-      for (auto k : cms::alpakatools::independent_group_elements(acc, nt)) {
-        if (iv[k] == int(kv)) {
-          auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{});
-          zz[old] = zt[k] - zv[kv];
-          newV[old] = zz[old] < 0 ? 0 : 1;
-          ww[old] = 1.f / ezt2[k];
-          it[old] = k;
+      // cache the data of the tracks associated to the current vertex into shared memory
+      for (auto k : cms::alpakatools::independent_group_elements(acc, ws.ntrks())) {
+        if (ws[k].iv() == int(kv)) {
+          auto index = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{});
+          it[index] = k;
+          zz[index] = ws[k].zt() - data[kv].zv();
+          newV[index] = zz[index] < 0 ? 0 : 1;
+          ww[index] = 1.f / ws[k].ezt2();
         }
       }
 
@@ -85,14 +66,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
       auto& wnew = alpaka::declareSharedVar<float[2], __COUNTER__>(acc);
       alpaka::syncBlockThreads(acc);
 
-      ALPAKA_ASSERT_ACC(int(nq) == nn[kv] + 1);
+      ALPAKA_ASSERT_ACC(int(nq) == ndof + 1);
 
       int maxiter = 20;
       // kt-min....
       bool more = true;
       while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
         more = false;
-        if (0 == threadIdxLocal) {
+        if (cms::alpakatools::once_per_block(acc)) {
           znew[0] = 0;
           znew[1] = 0;
           wnew[0] = 0;
@@ -107,7 +88,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
         }
         alpaka::syncBlockThreads(acc);
 
-        if (0 == threadIdxLocal) {
+        if (cms::alpakatools::once_per_block(acc)) {
           znew[0] /= wnew[0];
           znew[1] /= wnew[1];
         }
@@ -134,30 +115,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder {
 
       auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
 
-      if (verbose && 0 == threadIdxLocal)
-        printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
+      if constexpr (verbose) {
+        if (cms::alpakatools::once_per_block(acc))
+          printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * data[kv].wv());
+      }
 
       if (chi2Dist < 4)
         continue;
 
       // get a new global vertex
       auto& igv = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
-      if (0 == threadIdxLocal)
+      if (cms::alpakatools::once_per_block(acc))
         igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{});
       alpaka::syncBlockThreads(acc);
       for (auto k : cms::alpakatools::uniform_elements(acc, nq)) {
         if (1 == newV[k])
-          iv[it[k]] = igv;
+          ws[it[k]].iv() = igv;
       }
 
+      // synchronise the threads before the next iteration of the loop over the vertices resets the shared memory
+      alpaka::syncBlockThreads(acc);
     }  // loop on vertices
   }
 
   class SplitVerticesKernel {
   public:
     template <typename TAcc>
-    ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const {
-      splitVertices(acc, pdata, pws, maxChi2);
+    ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView data, WsSoAView ws, float maxChi2) const {
+      splitVertices(acc, data, ws, maxChi2);
     }
   };
 
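Note: the helpers adopted in this diff come from HeterogeneousCore/AlpakaInterface/interface/workdivision.h: cms::alpakatools::independent_groups(acc, N) makes each block stride over N independent work groups, independent_group_elements(acc, M) makes the threads of one block stride over the M elements relevant to the current group, and once_per_block(acc) selects a single thread per block. The sketch below illustrates the same block-cooperative idiom on its own; it is an illustration only, and the kernel name, the assoc array, and the counts output are hypothetical, not part of this PR.

#include <cstdint>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

// Count, per group, how many elements are associated to it, using one block
// per group and a block-shared counter, mirroring the splitVertices pattern.
// (hypothetical example; assoc maps each element to a group index)
struct CountPerGroupKernel {
  template <typename TAcc>
  ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                int32_t const* assoc,
                                uint32_t nElements,
                                uint32_t* counts,
                                uint32_t nGroups) const {
    auto& count = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
    // each block strides over the groups, handling one group at a time
    for (auto group : cms::alpakatools::independent_groups(acc, nGroups)) {
      if (cms::alpakatools::once_per_block(acc)) {
        count = 0u;  // reset the shared counter exactly once per block
      }
      alpaka::syncBlockThreads(acc);
      // the block's threads cooperatively scan all the elements
      for (auto k : cms::alpakatools::independent_group_elements(acc, nElements)) {
        if (assoc[k] == int32_t(group)) {
          alpaka::atomicInc(acc, &count, nElements, alpaka::hierarchy::Threads{});
        }
      }
      alpaka::syncBlockThreads(acc);
      if (cms::alpakatools::once_per_block(acc)) {
        counts[group] = count;
      }
      // keep the counter alive until every thread is done with this group
      alpaka::syncBlockThreads(acc);
    }
  }
};

As in splitVertices above, the shared counter is reset by a single thread, filled through a block-level atomic, and fenced with syncBlockThreads before being reused for the next group; that is also why the PR adds the trailing synchronisation at the end of the loop over the vertices.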