@@ -34,7 +34,17 @@ template <typename T>
3434struct testPrefixScan {
3535 template <typename TAcc>
3636 ALPAKA_FN_ACC void operator ()(const TAcc& acc, unsigned int size) const {
37+ // alpaka::warp::getSize(acc) is only available at runtime, but the shared-memory array size passed to alpaka::declareSharedVar must be a compile-time constant
38+ #if defined(__CUDA_ARCH__)
39+ // CUDA always has a warp size of 32
3740 auto & ws = alpaka::declareSharedVar<T[32 ], __COUNTER__>(acc);
41+ #elif defined(__HIP_DEVICE_COMPILE__)
42+ // HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
43+ auto & ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
44+ #else
45+ // CPU back-ends always have a warp size of 1
46+ auto & ws = alpaka::declareSharedVar<T[1 ], __COUNTER__>(acc);
47+ #endif
3848 auto & c = alpaka::declareSharedVar<T[1024 ], __COUNTER__>(acc);
3949 auto & co = alpaka::declareSharedVar<T[1024 ], __COUNTER__>(acc);
4050
@@ -78,7 +88,7 @@ struct testWarpPrefixScan {
7888 template <typename TAcc>
7989 ALPAKA_FN_ACC void operator ()(const TAcc& acc, uint32_t size) const {
8090 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
81- ALPAKA_ASSERT_ACC (size <= 32 );
91+ ALPAKA_ASSERT_ACC (size <= static_cast < uint32_t >( alpaka::warp::getSize (acc)) );
8292 auto & c = alpaka::declareSharedVar<T[1024 ], __COUNTER__>(acc);
8393 auto & co = alpaka::declareSharedVar<T[1024 ], __COUNTER__>(acc);
8494
@@ -87,7 +97,8 @@ struct testWarpPrefixScan {
8797 auto i = blockThreadIdx;
8898 c[i] = 1 ;
8999 alpaka::syncBlockThreads (acc);
90- auto laneId = blockThreadIdx & 0x1f ;
100+ // a compile-time constant would be faster, but this is more portable
101+ auto laneId = blockThreadIdx % alpaka::warp::getSize (acc);
91102
92103 warpPrefixScan (acc, laneId, c, co, i);
93104 warpPrefixScan (acc, laneId, c, i);
@@ -152,21 +163,26 @@ int main() {
152163 if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
153164 std::cout << " warp level" << std::endl;
154165
155- const auto threadsPerBlockOrElementsPerThread = 32 ;
166+ const auto threadsPerBlockOrElementsPerThread = warpSize ;
156167 const auto blocksPerGrid = 1 ;
157168 const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
158169
159- alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 32 ));
160- alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 16 ));
161- alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 5 ));
170+ if (warpSize >= 64 )
171+ alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 64 ));
172+ if (warpSize >= 32 )
173+ alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 32 ));
174+ if (warpSize >= 16 )
175+ alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 12 ));
176+ if (warpSize >= 8 )
177+ alpaka::enqueue (queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int >(), 5 ));
162178 }
163179
164180 // PORTABLE BLOCK PREFIXSCAN
165181 std::cout << " block level" << std::endl;
166182
167183 // Running kernel with 1 block, and bs threads per block or elements per thread.
168184 // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version.
169- for (int bs = 32 ; bs <= 1024 ; bs += 32 ) {
185+ for (int bs = warpSize ; bs <= 1024 ; bs += warpSize ) {
170186 const auto blocksPerGrid2 = 1 ;
171187 const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);
172188
0 commit comments