Skip to content

Commit 8a0294f

Browse files
authored
Merge pull request #48947 from fwyzard/update_warpsize_for_ROCm_7.0
Update the compile-time warp size constant for ROCm 7.0
2 parents 041a321 + 8d51780 commit 8a0294f

File tree

2 files changed

+17
-13
lines changed

2 files changed

+17
-13
lines changed

HeterogeneousCore/AlpakaInterface/interface/warpsize.h

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,20 @@ namespace cms::alpakatools {
1212
// CUDA always has a warp size of 32
1313
inline constexpr int warpSize = 32;
1414
#elif defined(__HIP_DEVICE_COMPILE__)
15-
// HIP/ROCm defines warpSize as a constant expression in device code, with value 32 or 64 depending on the target device
16-
inline constexpr int warpSize = ::warpSize;
15+
// HIP/ROCm may have a warp size of 32 or 64 depending on the target device
16+
#if defined(__gfx900__) or defined(__gfx902__) or defined(__gfx903__) or defined(__gfx906__) or defined(__gfx908__) or \
17+
defined(__gfx909__) or defined(__gfx90a__) or defined(__gfx90c__) or defined(__gfx942__) or defined(__gfx950__)
18+
inline constexpr int warpSize = 64;
19+
#elif defined(__gfx1010__) or defined(__gfx1011__) or defined(__gfx1012__) or defined(__gfx1013__) or \
20+
defined(__gfx1030__) or defined(__gfx1031__) or defined(__gfx1032__) or defined(__gfx1033__) or \
21+
defined(__gfx1034__) or defined(__gfx1035__) or defined(__gfx1036__) or defined(__gfx1100__) or \
22+
defined(__gfx1101__) or defined(__gfx1102__) or defined(__gfx1103__) or defined(__gfx1150__) or \
23+
defined(__gfx1151__) or defined(__gfx1152__) or defined(__gfx1153__) or defined(__gfx1200__) or \
24+
defined(__gfx1201__) or defined(__gfx1250__) or defined(__gfx1251__)
25+
inline constexpr int warpSize = 32;
26+
#else
27+
#error "Unknown AMDGCN architecture"
28+
#endif
1729
#else
1830
// CPU back-ends always have a warp size of 1
1931
inline constexpr int warpSize = 1;

HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc

Lines changed: 3 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
1313
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
1414
#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
15+
#include "HeterogeneousCore/AlpakaInterface/interface/warpsize.h"
1516

1617
using namespace cms::alpakatools;
1718
using namespace ALPAKA_ACCELERATOR_NAMESPACE;
@@ -33,17 +34,8 @@ struct format_traits<float> {
3334
template <typename T>
3435
struct testPrefixScan {
3536
ALPAKA_FN_ACC void operator()(Acc1D const& acc, unsigned int size) const {
36-
// alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value
37-
#if defined(__CUDA_ARCH__)
38-
// CUDA always has a warp size of 32
39-
auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
40-
#elif defined(__HIP_DEVICE_COMPILE__)
41-
// HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
42-
auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
43-
#else
44-
// CPU back-ends always have a warp size of 1
45-
auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
46-
#endif
37+
// alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value, so we use cms::alpakatools::warpSize
38+
auto& ws = alpaka::declareSharedVar<T[cms::alpakatools::warpSize], __COUNTER__>(acc);
4739
auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
4840
auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
4941

0 commit comments

Comments
 (0)