[OpenMP] [amdgpu] Scale up the number of teams for Xteam Reduction under an env-var.

dhruvachak · ronlieb · commit d9efd049a38c · 2024-02-10T18:11:53.000-05:00
Added an integer env-var LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS to
control the number of teams for Xteam Reduction. The default is 0
(disabled). If set to 1, the plugin will use occupancy as the goal to
scale up the number of teams when the blocksize is lower than the
default blocksize. If >1, that number will be directly used to scale up
the number of teams. The default number of teams in the absence of any
user input remains at the number of CUs.

Change-Id: I622d26f1f4f7d12665eaa907be93f7dbd6418175
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -284,6 +284,9 @@ constexpr int16_t MaxCUMultiplier = 32;
 // Maximum number of threads allowed per CU.
 constexpr int16_t MaxThreadsPerCU = 2048;
 
+// Desired number of wavefronts per CU.
+constexpr int16_t DesiredWavesPerCU = 16;
+
 // Default block size, currently different from other kernel types.
 constexpr int16_t DefaultBlockSize = 1024;
 
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -963,6 +963,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
           XteamRedBlockSize > 0
               ? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
               : llvm::omp::xteam_red::MaxCUMultiplier;
+      if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
+        CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
 
       // Here's the default we use
       uint64_t NumGroups = DeviceNumCUs;
@@ -994,6 +996,29 @@ struct AMDGPUKernelTy : public GenericKernelTy {
         if (LoopTripCount > 0)
           NumGroupsFromTripCount =
               getNumGroupsFromThreadsAndTripCount(LoopTripCount, NumThreads);
+
+        // Compute desired number of groups in the absence of user input
+        // based on a factor controlled by an integer env-var.
+        // 0: disabled (default)
+        // 1: If the number of waves is lower than the default, increase
+        // the number of teams proportionally. Ideally, this would be the
+        // default behavior.
+        // > 1: Use as the scaling factor for the number of teams.
+        // Note that the upper bound is MaxNumGroups.
+        uint32_t AdjustFactor =
+            GenericDevice.getOMPXAdjustNumTeamsForXteamRedSmallBlockSize();
+        if (NumThreads > 0 && AdjustFactor > 0) {
+          uint64_t DesiredNumGroups = NumGroups;
+          if (AdjustFactor == 1) {
+            DesiredNumGroups =
+                DeviceNumCUs *
+                (llvm::omp::xteam_red::DesiredWavesPerCU / NumWavesInGroup);
+          } else {
+            DesiredNumGroups = DeviceNumCUs * AdjustFactor;
+          }
+          NumGroups = DesiredNumGroups;
+        }
+        NumGroups = std::min(NumGroups, MaxNumGroups);
         NumGroups = std::min(NumGroups, NumGroupsFromTripCount);
 
         // If the user specifies a number of teams for low trip count loops,
@@ -2469,6 +2494,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
             "LIBOMPTARGET_WAVES_PER_CU_FOR_LOW_TRIP_COUNT", 0),
         OMPX_AdjustNumTeamsForSmallBlockSize("LIBOMPTARGET_AMDGPU_ADJUST_TEAMS",
                                              0),
+        OMPX_AdjustNumTeamsForXteamRedSmallBlockSize(
+            "LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS", 0),
         OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
                                1 * 1024 * 1024), // 1MB
         OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -2564,6 +2591,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   virtual uint32_t getOMPXAdjustNumTeamsForSmallBlockSize() const override {
     return OMPX_AdjustNumTeamsForSmallBlockSize;
   }
+  virtual uint32_t
+  getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const override {
+    return OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
+  }
 
   /// Initialize the device, its resources and get its properties.
   Error initImpl(GenericPluginTy &Plugin) override {
@@ -3892,6 +3923,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// done.
   UInt32Envar OMPX_AdjustNumTeamsForSmallBlockSize;
 
+  /// Envar to allow scaling up the number of teams for Xteam-Reduction
+  /// whenever the blocksize has been reduced from the default. The env-var
+  /// default of 0 means that the scaling is not done by default.
+  UInt32Envar OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
+
   /// Envar specifying the maximum size in bytes where the memory copies are
   /// asynchronous operations. Up to this transfer size, the memory copies are
   /// asychronous operations pushed to the corresponding stream. For larger
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -903,6 +903,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual uint32_t getOMPXAdjustNumTeamsForSmallBlockSize() const {
     llvm_unreachable("Unimplemented");
   }
+  virtual uint32_t getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const {
+    llvm_unreachable("Unimplemented");
+  }
 
   /// Get target compute unit kind (e.g., sm_80, or gfx908).
   virtual std::string getComputeUnitKind() const { return "unknown"; }

Original file line number	Diff line number	Diff line change
`@@ -903,6 +903,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {`
`903`	`903`	`virtual uint32_t getOMPXAdjustNumTeamsForSmallBlockSize() const {`
`904`	`904`	`llvm_unreachable("Unimplemented");`
`905`	`905`	`}`
	`906`	`+ virtual uint32_t getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const {`
	`907`	`+ llvm_unreachable("Unimplemented");`
	`908`	`+ }`
`906`	`909`
`907`	`910`	`/// Get target compute unit kind (e.g., sm_80, or gfx908).`
`908`	`911`	`virtual std::string getComputeUnitKind() const { return "unknown"; }`