Skip to content

Commit 8c859bc

Browse files
committed
Add a test for once_per_grid and once_per_block
1 parent 6462fcf commit 8c859bc

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed

HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ struct VectorAddBlockKernel {
7171
// the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
7272
// the inner loop is needed for backends that use more than one element per thread
7373
for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
74+
// only one thread per block: initialise the shared memory
75+
if (cms::alpakatools::once_per_block(acc)) {
76+
// not really necessary, just to show how to use "once_per_block"
77+
for (Idx local = 0; local < blockSize; ++local)
78+
buffer[local] = 0.;
79+
}
80+
// synchronise all threads in the block
81+
alpaka::syncBlockThreads(acc);
7482
// read the first set of data into shared memory
7583
for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
7684
buffer[index.local] = in1[index.global];
@@ -91,6 +99,49 @@ struct VectorAddBlockKernel {
9199
}
92100
};
93101

102+
/* Run all operations in a single thread.
103+
* Written in an inefficient way to test "once_per_grid".
104+
*/
105+
106+
struct VectorAddKernelSerial {
107+
template <typename TAcc, typename T>
108+
ALPAKA_FN_ACC void operator()(
109+
TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
110+
// the operations are performed by a single thread
111+
if (cms::alpakatools::once_per_grid(acc)) {
112+
for (Idx index = 0; index < size; ++index) {
113+
out[index] += in1[index];
114+
out[index] += in2[index];
115+
}
116+
}
117+
}
118+
};
119+
120+
/* Run all operations in one thread per block.
121+
* Written in an inefficient way to test "once_per_block".
122+
*/
123+
124+
struct VectorAddKernelBlockSerial {
125+
template <typename TAcc, typename T>
126+
ALPAKA_FN_ACC void operator()(
127+
TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
128+
// block size
129+
auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
130+
// the loop is used to repeat the "block" as many times as needed to cover the whole problem space
131+
for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
132+
// the operations are performed by a single thread in each "logical" block
133+
const auto first = blockSize * block;
134+
const auto range = std::min<size_t>(first + blockSize, size);
135+
if (cms::alpakatools::once_per_block(acc)) {
136+
for (Idx index = first; index < range; ++index) {
137+
out[index] += in1[index];
138+
out[index] += in2[index];
139+
}
140+
}
141+
}
142+
}
143+
};
144+
94145
namespace alpaka::trait {
95146
// specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
96147
// block shared dynamic memory for the VectorAddBlockKernel kernel
@@ -296,5 +347,25 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP
296347
// this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
297348
std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
298349
testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
350+
351+
// launch the 1-dimensional kernel with a small block size and a small number of blocks;
352+
// this relies on the kernel to loop over the "problem space" and do more work per block
353+
std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
354+
testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});
355+
356+
// launch the 1-dimensional kernel with a large block size and a single block;
357+
// this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
358+
std::cout << "Test 1D vector single-threaded serial addition with large block size, using scalar dimensions\n";
359+
testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});
360+
361+
// launch the 1-dimensional kernel with a small block size and a small number of blocks;
362+
// this relies on the kernel to loop over the "problem space" and do more work per block
363+
std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
364+
testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});
365+
366+
// launch the 1-dimensional kernel with a large block size and a single block;
367+
// this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
368+
std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
369+
testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
299370
}
300371
}

0 commit comments

Comments
 (0)