@@ -56,6 +56,60 @@ struct VectorAddKernel3D {
   }
 };
 
+/* This is not an efficient approach; it is only a test of using dynamic shared memory,
+ * split block and element loops, and block-level synchronisation
+ */
+
+struct VectorAddBlockKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // block size, in elements: threads per block times elements per thread
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // get the dynamic shared memory buffer
+    T* buffer = alpaka::getDynSharedMem<T>(acc);
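+    // the size of this buffer is not known at compile time: it is specified at
+    // kernel launch time, through the BlockSharedMemDynSizeBytes trait specialised below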
+    // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
+    // the inner loop is needed for backends that use more than one element per thread
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // read the first set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] = in1[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // add the second set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] += in2[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // store the results into global memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        out[index.global] = buffer[index.local];
+      }
+    }
+  }
+};
+
+namespace alpaka::trait {
+  // specialise the BlockSharedMemDynSizeBytes trait to specify the amount of
+  // block shared dynamic memory for the VectorAddBlockKernel kernel
+  template <typename TAcc>
+  struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
+    // the size in bytes of the shared memory allocated for a block
+    template <typename T>
+    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
+                                                                        Vec1D threads,
+                                                                        Vec1D elements,
+                                                                        T const* __restrict__ /* in1 */,
+                                                                        T const* __restrict__ /* in2 */,
+                                                                        T* __restrict__ /* out */,
+                                                                        size_t size) {
+      return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
+    }
+  };
+}  // namespace alpaka::trait
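+
+/* A minimal launch sketch, for illustration only (it assumes an Acc1D accelerator, a
+ * queue, and device buffers in1, in2, out are already set up, as testVectorAddKernel()
+ * below does). Before launching the kernel, alpaka invokes the
+ * getBlockSharedMemDynSizeBytes() trait above with the work division and the kernel
+ * arguments, to size the dynamic shared memory:
+ *
+ *   auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(32, 32);  // 32 blocks of 32 elements
+ *   alpaka::exec<Acc1D>(queue, workDiv, VectorAddBlockKernel{}, in1, in2, out, size);
+ *
+ * With T = float and 32 elements per block (however they are split between threads and
+ * elements per thread on a given backend), this reserves 32 * sizeof(float) = 128 bytes
+ * of dynamic shared memory per block.
+ */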
+
 // test the 1-dimensional kernel on all devices
 template <typename TKernel>
 void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
@@ -232,5 +286,15 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP
   // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
   std::cout << "Test 3D vector addition with large block size\n";
   testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
+
+  // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+  // this relies on the kernel to loop over the "problem space" and do more work per block
+  std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
+  testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
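+  // (a grid of 32 blocks of 32 elements covers only 1024 elements per pass, so each
+  // block runs the outer blocks_with_stride loop about 10 times to span all 10000)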
+
+  // launch the 1-dimensional kernel with a large block size and a single block;
+  // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+  std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
+  testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
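+  // (here the block is larger than the problem, 1024 vs 100 elements: elements_in_block()
+  // clamps each loop at size, so only the first 100 shared memory slots are actually used)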
   }
 }