Skip to content

Commit 80c43fa

Browse files
authored
Merge pull request #46629 from fwyzard/testPrefixScan_fix_warpSize
Update testPrefixScan to work with different warp sizes
2 parents e773726 + 2d77f78 commit 80c43fa

File tree

1 file changed

+23
-7
lines changed

1 file changed

+23
-7
lines changed

HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,17 @@ template <typename T>
3434
struct testPrefixScan {
3535
template <typename TAcc>
3636
ALPAKA_FN_ACC void operator()(const TAcc& acc, unsigned int size) const {
37+
// alpaka::warp::getSize(acc) is a runtime value, but the extent of the shared array below must be a compile-time constant
38+
#if defined(__CUDA_ARCH__)
39+
// CUDA always has a warp size of 32
3740
auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
41+
#elif defined(__HIP_DEVICE_COMPILE__)
42+
// HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
43+
auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
44+
#else
45+
// CPU back-ends always have a warp size of 1
46+
auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
47+
#endif
3848
auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
3949
auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
4050

@@ -78,7 +88,7 @@ struct testWarpPrefixScan {
7888
template <typename TAcc>
7989
ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const {
8090
if constexpr (!requires_single_thread_per_block_v<TAcc>) {
81-
ALPAKA_ASSERT_ACC(size <= 32);
91+
ALPAKA_ASSERT_ACC(size <= static_cast<uint32_t>(alpaka::warp::getSize(acc)));
8292
auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
8393
auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
8494

@@ -87,7 +97,8 @@ struct testWarpPrefixScan {
8797
auto i = blockThreadIdx;
8898
c[i] = 1;
8999
alpaka::syncBlockThreads(acc);
90-
auto laneId = blockThreadIdx & 0x1f;
100+
// a compile-time constant would be faster, but this is more portable
101+
auto laneId = blockThreadIdx % alpaka::warp::getSize(acc);
91102

92103
warpPrefixScan(acc, laneId, c, co, i);
93104
warpPrefixScan(acc, laneId, c, i);
@@ -152,21 +163,26 @@ int main() {
152163
if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
153164
std::cout << "warp level" << std::endl;
154165

155-
const auto threadsPerBlockOrElementsPerThread = 32;
166+
const auto threadsPerBlockOrElementsPerThread = warpSize;
156167
const auto blocksPerGrid = 1;
157168
const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
158169

159-
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
160-
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 16));
161-
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
170+
if (warpSize >= 64)
171+
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 64));
172+
if (warpSize >= 32)
173+
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
174+
if (warpSize >= 16)
175+
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 12));
176+
if (warpSize >= 8)
177+
alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
162178
}
163179

164180
// PORTABLE BLOCK PREFIXSCAN
165181
std::cout << "block level" << std::endl;
166182

167183
// Run the kernel with 1 block, and bs threads per block (or bs elements per thread).
168184
// NB: these block sizes are for testing only; for performance the GPU version would need to use bs = 1024.
169-
for (int bs = 32; bs <= 1024; bs += 32) {
185+
for (int bs = warpSize; bs <= 1024; bs += warpSize) {
170186
const auto blocksPerGrid2 = 1;
171187
const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);
172188

0 commit comments

Comments
 (0)