Skip to content

Commit 42360ab

Browse files
committed
Add a test for blocks_with_stride and elements_in_block
1 parent 7272683 commit 42360ab

File tree

1 file changed

+64
-0
lines changed

1 file changed

+64
-0
lines changed

HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,60 @@ struct VectorAddKernel3D {
5656
}
5757
};
5858

59+
/* This is not an efficient approach; it is only a test of using dynamic shared memory,
60+
* split block and element loops, and block-level synchronisation
61+
*/
62+
63+
struct VectorAddBlockKernel {
64+
template <typename TAcc, typename T>
65+
ALPAKA_FN_ACC void operator()(
66+
TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
67+
// block size
68+
auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
69+
// get the dynamic shared memory buffer
70+
T* buffer = alpaka::getDynSharedMem<T>(acc);
71+
// the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
72+
// the inner loop is needed for backends that use more than one element per thread
73+
for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
74+
// read the first set of data into shared memory
75+
for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
76+
buffer[index.local] = in1[index.global];
77+
}
78+
// synchronise all threads in the block
79+
alpaka::syncBlockThreads(acc);
80+
// add the second set of data into shared memory
81+
for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
82+
buffer[index.local] += in2[index.global];
83+
}
84+
// synchronise all threads in the block
85+
alpaka::syncBlockThreads(acc);
86+
// store the results into global memory
87+
for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
88+
out[index.global] = buffer[index.local];
89+
}
90+
}
91+
}
92+
};
93+
94+
namespace alpaka::trait {
95+
// specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
96+
// block shared dynamic memory for the VectorAddBlockKernel kernel
97+
template <typename TAcc>
98+
struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
99+
// the size in bytes of the shared memory allocated for a block
100+
template <typename T>
101+
ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
102+
Vec1D threads,
103+
Vec1D elements,
104+
T const* __restrict__ /* in1 */,
105+
T const* __restrict__ /* in2 */,
106+
T* __restrict__ /* out */,
107+
size_t size) {
108+
return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
109+
}
110+
};
111+
} // namespace alpaka::trait
112+
59113
// test the 1-dimensional kernel on all devices
60114
template <typename TKernel>
61115
void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
@@ -232,5 +286,15 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP
232286
// this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
233287
std::cout << "Test 3D vector addition with large block size\n";
234288
testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
289+
290+
// launch the 1-dimensional kernel with a small block size and a small number of blocks;
291+
// this relies on the kernel to loop over the "problem space" and do more work per block
292+
std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
293+
testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
294+
295+
// launch the 1-dimensional kernel with a large block size and a single block;
296+
// this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
297+
std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
298+
testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
235299
}
236300
}

0 commit comments

Comments
 (0)