[OpenMP][Offload][AMDGPU] Updated the method to get the wavefront size

Kewen12 · ronlieb · commit c9bc05d9ba94 · 2024-10-18T21:08:45.000-04:00
This patch updates how the wavefront size is obtained in the occupancy computation. It removes the hardcoded WaveFrontSize32 and WaveFrontSize64 constants
and queries the wavefront size from the device via getWarpSize() instead, so the computation adapts to the device it runs on. Tested locally.

Change-Id: I06cbff8eda63f3047e738e05b27b322c44ccfc0b
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -336,9 +336,6 @@ constexpr unsigned SIMDPerCU = 4;
 // Max waves each SIMD supports
 constexpr unsigned MaxWavesPerEU8 = 8;
 constexpr unsigned MaxWavesPerEU10 = 10;
-// Wavefront size
-constexpr unsigned WaveFrontSize32 = 32;
-constexpr unsigned WaveFrontSize64 = 64;
 // Number of VGPR for each thread
 constexpr unsigned VGPRNumPerThread = 512;
 // flat work group size
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1170,7 +1170,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
     if (isSPMDMode() && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0) {
       unsigned NumWavesPerTeam =
-          divideCeil(NumThreads, llvm::omp::amdgpu_arch::WaveFrontSize64);
+          divideCeil(NumThreads, GenericDevice.getWarpSize());
       unsigned TotalWavesPerCU =
           MaxOccupancy * llvm::omp::amdgpu_arch::SIMDPerCU;
       // Per device
@@ -1268,7 +1268,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   /// Follow the logic on the backend
   /// Ref:
   /// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithLocalMemSize
-  unsigned getOccupancyWithLDS(uint32_t GroupSegmentSize,
+  unsigned getOccupancyWithLDS(GenericDeviceTy &GenericDevice,
+                               uint32_t GroupSegmentSize,
                                unsigned MaxWavesPerEU,
                                uint32_t MaxFlatWorkgroupSize) const {
 
@@ -1277,8 +1278,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
 
     // workgroup size
     unsigned ThreadsPerWorkgroup = MaxFlatWorkgroupSize;
-    unsigned WavesPerWorkgroup = divideCeil(
-        ThreadsPerWorkgroup, llvm::omp::amdgpu_arch::WaveFrontSize64);
+    unsigned WavesPerWorkgroup =
+        divideCeil(ThreadsPerWorkgroup, GenericDevice.getWarpSize());
 
     unsigned MaxWavesPerCU = MaxWavesPerEU * llvm::omp::amdgpu_arch::SIMDPerCU;
 
@@ -5287,8 +5288,8 @@ unsigned AMDGPUKernelTy::computeMaxOccupancy(GenericDeviceTy &Device) const {
 
   // Constraint on LDS
   if (GroupSegmentSize) {
-    unsigned WaveNumByLDS = getOccupancyWithLDS(GroupSegmentSize, MaxWavesPerEU,
-                                                MaxFlatWorkgroupSize);
+    unsigned WaveNumByLDS = getOccupancyWithLDS(
+        Device, GroupSegmentSize, MaxWavesPerEU, MaxFlatWorkgroupSize);
     Occupancy = std::min(Occupancy, WaveNumByLDS);
   } else {
     // If 0 LDS required by the kernel
@@ -5326,8 +5327,7 @@ unsigned AMDGPUKernelTy::computeAchievedOccupancy(GenericDeviceTy &Device,
   unsigned MaxNumWaves = MaxOccupancy * llvm::omp::amdgpu_arch::SIMDPerCU;
   // Get the number of waves from the kernel launch parameters.
   unsigned AchievedNumWaves =
-      divideCeil(numThreads, llvm::omp::amdgpu_arch::WaveFrontSize64) *
-      numTeams;
+      divideCeil(numThreads, AMDDevice.getWarpSize()) * numTeams;
   // Get the number of waves per CU.
   AchievedNumWaves = divideCeil(AchievedNumWaves, Device.getNumComputeUnits());
   // Get the min waves.