@@ -1170,7 +1170,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11701170 int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
11711171 if (isSPMDMode () && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
11721172 unsigned NumWavesPerTeam =
1173- divideCeil (NumThreads, llvm::omp::amdgpu_arch::WaveFrontSize64 );
1173+ divideCeil (NumThreads, GenericDevice. getWarpSize () );
11741174 unsigned TotalWavesPerCU =
11751175 MaxOccupancy * llvm::omp::amdgpu_arch::SIMDPerCU;
11761176 // Per device
@@ -1268,7 +1268,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12681268 /// Follows the logic used by the backend.
12691269 /// Ref:
12701270 /// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithLocalMemSize
1271- unsigned getOccupancyWithLDS (uint32_t GroupSegmentSize,
1271+ unsigned getOccupancyWithLDS (GenericDeviceTy &GenericDevice,
1272+ uint32_t GroupSegmentSize,
12721273 unsigned MaxWavesPerEU,
12731274 uint32_t MaxFlatWorkgroupSize) const {
12741275
@@ -1277,8 +1278,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12771278
12781279 // workgroup size
12791280 unsigned ThreadsPerWorkgroup = MaxFlatWorkgroupSize;
1280- unsigned WavesPerWorkgroup = divideCeil (
1281- ThreadsPerWorkgroup, llvm::omp::amdgpu_arch::WaveFrontSize64 );
1281+ unsigned WavesPerWorkgroup =
1282+ divideCeil ( ThreadsPerWorkgroup, GenericDevice. getWarpSize () );
12821283
12831284 unsigned MaxWavesPerCU = MaxWavesPerEU * llvm::omp::amdgpu_arch::SIMDPerCU;
12841285
@@ -5287,8 +5288,8 @@ unsigned AMDGPUKernelTy::computeMaxOccupancy(GenericDeviceTy &Device) const {
52875288
52885289 // Constraint on LDS
52895290 if (GroupSegmentSize) {
5290- unsigned WaveNumByLDS = getOccupancyWithLDS (GroupSegmentSize, MaxWavesPerEU,
5291- MaxFlatWorkgroupSize);
5291+ unsigned WaveNumByLDS = getOccupancyWithLDS (
5292+ Device, GroupSegmentSize, MaxWavesPerEU, MaxFlatWorkgroupSize);
52925293 Occupancy = std::min (Occupancy, WaveNumByLDS);
52935294 } else {
52945295 // If 0 LDS required by the kernel
@@ -5326,8 +5327,7 @@ unsigned AMDGPUKernelTy::computeAchievedOccupancy(GenericDeviceTy &Device,
53265327 unsigned MaxNumWaves = MaxOccupancy * llvm::omp::amdgpu_arch::SIMDPerCU;
53275328 // Get the number of waves from the kernel launch parameters.
53285329 unsigned AchievedNumWaves =
5329- divideCeil (numThreads, llvm::omp::amdgpu_arch::WaveFrontSize64) *
5330- numTeams;
5330+ divideCeil (numThreads, AMDDevice.getWarpSize ()) * numTeams;
53315331 // Get the number of waves per CU.
53325332 AchievedNumWaves = divideCeil (AchievedNumWaves, Device.getNumComputeUnits ());
53335333 // Get the min waves.
0 commit comments