@@ -71,6 +71,14 @@ struct VectorAddBlockKernel {
7171 // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
7272 // the inner loop is needed for backends that use more than one element per thread
7373 for (auto block : cms::alpakatools::blocks_with_stride (acc, size)) {
74+ // only one thread per block: initialise the shared memory
75+ if (cms::alpakatools::once_per_block (acc)) {
76+ // not really necessary, just to show how to use "once_per_block"
77+ for (Idx local = 0 ; local < blockSize; ++local)
78+ buffer[local] = 0 .;
79+ }
80+ // synchronise all threads in the block
81+ alpaka::syncBlockThreads (acc);
7482 // read the first set of data into shared memory
7583 for (auto index : cms::alpakatools::elements_in_block (acc, block, size)) {
7684 buffer[index.local ] = in1[index.global ];
@@ -91,6 +99,49 @@ struct VectorAddBlockKernel {
9199 }
92100};
93101
102+ /* Run all operations in a single thread.
103+ * Written in an inefficient way to test "once_per_grid".
104+ */
105+
106+ struct VectorAddKernelSerial {
107+ template <typename TAcc, typename T>
108+ ALPAKA_FN_ACC void operator ()(
109+ TAcc const & acc, T const * __restrict__ in1, T const * __restrict__ in2, T* __restrict__ out, size_t size) const {
110+ // the operations are performed by a single thread
111+ if (cms::alpakatools::once_per_grid (acc)) {
112+ for (Idx index = 0 ; index < size; ++index) {
113+ out[index] += in1[index];
114+ out[index] += in2[index];
115+ }
116+ }
117+ }
118+ };
119+
120+ /* Run all operations in one thread per block.
121+ * Written in an inefficient way to test "once_per_block".
122+ */
123+
124+ struct VectorAddKernelBlockSerial {
125+ template <typename TAcc, typename T>
126+ ALPAKA_FN_ACC void operator ()(
127+ TAcc const & acc, T const * __restrict__ in1, T const * __restrict__ in2, T* __restrict__ out, size_t size) const {
128+ // block size
129+ auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u ];
130+ // the loop is used to repeat the "block" as many times as needed to cover the whole problem space
131+ for (auto block : cms::alpakatools::blocks_with_stride (acc, size)) {
132+ // the operations are performed by a single thread in each "logical" block
133+ const auto first = blockSize * block;
134+ const auto range = std::min<size_t >(first + blockSize, size);
135+ if (cms::alpakatools::once_per_block (acc)) {
136+ for (Idx index = first; index < range; ++index) {
137+ out[index] += in1[index];
138+ out[index] += in2[index];
139+ }
140+ }
141+ }
142+ }
143+ };
144+
94145namespace alpaka ::trait {
95146 // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
96147 // block shared dynamic memory for the VectorAddBlockKernel kernel
@@ -296,5 +347,25 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP
296347 // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
297348 std::cout << " Test 1D vector block-level addition with large block size, using scalar dimensions\n " ;
298349 testVectorAddKernel (100 , 1 , 1024 , VectorAddBlockKernel{});
350+
351+ // launch the 1-dimensional kernel with a small block size and a small number of blocks;
352+ // this relies on the kernel to loop over the "problem space" and do more work per block
353+ std::cout << " Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n " ;
354+ testVectorAddKernel (10000 , 32 , 32 , VectorAddKernelSerial{});
355+
356+ // launch the 1-dimensional kernel with a large block size and a single block;
357+ // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
358+ std::cout << " Test 1D vector single-threaded seria addition with large block size, using scalar dimensions\n " ;
359+ testVectorAddKernel (100 , 1 , 1024 , VectorAddKernelSerial{});
360+
361+ // launch the 1-dimensional kernel with a small block size and a small number of blocks;
362+ // this relies on the kernel to loop over the "problem space" and do more work per block
363+ std::cout << " Test 1D vector block-level serial addition with small block size, using scalar dimensions\n " ;
364+ testVectorAddKernel (10000 , 32 , 32 , VectorAddKernelBlockSerial{});
365+
366+ // launch the 1-dimensional kernel with a large block size and a single block;
367+ // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
368+ std::cout << " Test 1D vector block-level serial addition with large block size, using scalar dimensions\n " ;
369+ testVectorAddKernel (100 , 1 , 1024 , VectorAddKernelBlockSerial{});
299370 }
300371}
0 commit comments