Skip to content

Commit c9bc05d

Browse files
Kewen12 authored and ronlieb committed
[OpenMP][Offload][AMDGPU] Updated the method to get the wavefront size
This patch changes how the wavefront size is obtained in the occupancy computation. Instead of using the hard-coded wavefront-size constants (which are removed), the code now queries the device via getWarpSize(), so the computation adapts to the device it runs on. Tested locally. Change-Id: I06cbff8eda63f3047e738e05b27b322c44ccfc0b
1 parent e7876a3 commit c9bc05d

File tree

2 files changed

+8
-11
lines changed

2 files changed

+8
-11
lines changed

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,6 @@ constexpr unsigned SIMDPerCU = 4;
336336
// Max waves each SIMD supports
337337
constexpr unsigned MaxWavesPerEU8 = 8;
338338
constexpr unsigned MaxWavesPerEU10 = 10;
339-
// Wavefront size
340-
constexpr unsigned WaveFrontSize32 = 32;
341-
constexpr unsigned WaveFrontSize64 = 64;
342339
// Number of VGPR for each thread
343340
constexpr unsigned VGPRNumPerThread = 512;
344341
// flat work group size

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11701170
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
11711171
if (isSPMDMode() && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0) {
11721172
unsigned NumWavesPerTeam =
1173-
divideCeil(NumThreads, llvm::omp::amdgpu_arch::WaveFrontSize64);
1173+
divideCeil(NumThreads, GenericDevice.getWarpSize());
11741174
unsigned TotalWavesPerCU =
11751175
MaxOccupancy * llvm::omp::amdgpu_arch::SIMDPerCU;
11761176
// Per device
@@ -1268,7 +1268,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12681268
/// Follow the logic on the backend
12691269
/// Ref:
12701270
/// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithLocalMemSize
1271-
unsigned getOccupancyWithLDS(uint32_t GroupSegmentSize,
1271+
unsigned getOccupancyWithLDS(GenericDeviceTy &GenericDevice,
1272+
uint32_t GroupSegmentSize,
12721273
unsigned MaxWavesPerEU,
12731274
uint32_t MaxFlatWorkgroupSize) const {
12741275

@@ -1277,8 +1278,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12771278

12781279
// workgroup size
12791280
unsigned ThreadsPerWorkgroup = MaxFlatWorkgroupSize;
1280-
unsigned WavesPerWorkgroup = divideCeil(
1281-
ThreadsPerWorkgroup, llvm::omp::amdgpu_arch::WaveFrontSize64);
1281+
unsigned WavesPerWorkgroup =
1282+
divideCeil(ThreadsPerWorkgroup, GenericDevice.getWarpSize());
12821283

12831284
unsigned MaxWavesPerCU = MaxWavesPerEU * llvm::omp::amdgpu_arch::SIMDPerCU;
12841285

@@ -5287,8 +5288,8 @@ unsigned AMDGPUKernelTy::computeMaxOccupancy(GenericDeviceTy &Device) const {
52875288

52885289
// Constraint on LDS
52895290
if (GroupSegmentSize) {
5290-
unsigned WaveNumByLDS = getOccupancyWithLDS(GroupSegmentSize, MaxWavesPerEU,
5291-
MaxFlatWorkgroupSize);
5291+
unsigned WaveNumByLDS = getOccupancyWithLDS(
5292+
Device, GroupSegmentSize, MaxWavesPerEU, MaxFlatWorkgroupSize);
52925293
Occupancy = std::min(Occupancy, WaveNumByLDS);
52935294
} else {
52945295
// If 0 LDS required by the kernel
@@ -5326,8 +5327,7 @@ unsigned AMDGPUKernelTy::computeAchievedOccupancy(GenericDeviceTy &Device,
53265327
unsigned MaxNumWaves = MaxOccupancy * llvm::omp::amdgpu_arch::SIMDPerCU;
53275328
// Get the number of waves from the kernel launch parameters.
53285329
unsigned AchievedNumWaves =
5329-
divideCeil(numThreads, llvm::omp::amdgpu_arch::WaveFrontSize64) *
5330-
numTeams;
5330+
divideCeil(numThreads, AMDDevice.getWarpSize()) * numTeams;
53315331
// Get the number of waves per CU.
53325332
AchievedNumWaves = divideCeil(AchievedNumWaves, Device.getNumComputeUnits());
53335333
// Get the min waves.

0 commit comments

Comments
 (0)