[OpenMP][Offload][AMDGPU] Add envar for setting CU multiplier (llvm#1143)

ronlieb · web-flow · commit fbc3f7b6bdd3 · 2025-03-14T11:16:12.000-04:00
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1095,11 +1095,18 @@ struct AMDGPUKernelTy : public GenericKernelTy {
       // Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
       // type, if possible.
       int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
+      // CU multiplier from envar.
+      uint32_t EnvarCUMultiplier = GenericDevice.getXTeamRedTeamsPerCU();
 
       if (GenericDevice.isFastReductionEnabled()) {
         // When fast reduction is enabled, the number of teams is capped by
         // the MaxCUMultiplier constant.
-        MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
+        // When the envar is set (non-zero), use it to compute MaxNumGroups.
+        if (EnvarCUMultiplier > 0)
+          MaxNumGroups = DeviceNumCUs * EnvarCUMultiplier;
+        else
+          MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
+
       } else {
         // When fast reduction is not enabled, the number of teams is capped
         // by the metadata that clang CodeGen created. The number of teams
@@ -1110,7 +1117,13 @@ struct AMDGPUKernelTy : public GenericKernelTy {
         // ConstWGSize is the block size that CodeGen used.
         uint32_t CUMultiplier =
             llvm::omp::xteam_red::getXteamRedCUMultiplier(ConstWGSize);
-        MaxNumGroups = DeviceNumCUs * CUMultiplier;
+
+        if (EnvarCUMultiplier > 0) {
+          MaxNumGroups =
+              DeviceNumCUs * std::min(CUMultiplier, EnvarCUMultiplier);
+        } else {
+          MaxNumGroups = DeviceNumCUs * CUMultiplier;
+        }
       }
 
       // If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -2915,6 +2928,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
             "LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU", 6),
         OMPX_BigJumpLoopTeamsPerCU(
             "LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU", 0),
+        OMPX_XTeamRedTeamsPerCU("LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU",
+                                0),
         OMPX_BigJumpLoopMaxTotalTeams(
             "LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_MAX_TOTAL_TEAMS", 1024 * 1024),
         OMPX_LowTripCount("LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT", 9000),
@@ -2980,6 +2995,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   virtual uint32_t getOMPXBigJumpLoopTeamsPerCU() const override {
     return OMPX_BigJumpLoopTeamsPerCU;
   }
+  virtual uint32_t getXTeamRedTeamsPerCU() const override {
+    return OMPX_XTeamRedTeamsPerCU;
+  }
   virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams() const override {
     return OMPX_BigJumpLoopMaxTotalTeams;
   }
@@ -4427,6 +4445,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// OMPX_BigJumpLoopTeamsPerCU * #CUs.
   UInt32Envar OMPX_BigJumpLoopTeamsPerCU;
 
+  /// Envar for controlling the maximum number of teams per compute unit (CU)
+  /// for cross-team-reduction kernels. 0 indicates that this value is not
+  /// specified. If non-zero, the number of teams is capped at no more than
+  /// OMPX_XTeamRedTeamsPerCU * #CUs.
+  UInt32Envar OMPX_XTeamRedTeamsPerCU;
+
   /// Envar controlling the maximum number of teams per device for
   /// Big-Jump-Loop kernels.
   UInt32Envar OMPX_BigJumpLoopMaxTotalTeams;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1017,6 +1017,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual uint32_t getOMPXBigJumpLoopTeamsPerCU() const {
     llvm_unreachable("Unimplemented");
   }
+  virtual uint32_t getXTeamRedTeamsPerCU() const {
+    llvm_unreachable("Unimplemented");
+  }
   virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams() const {
     llvm_unreachable("Unimplemented");
   }

Original file line number	Diff line number	Diff line change
`@@ -1017,6 +1017,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {`
`1017`	`1017`	`virtual uint32_t getOMPXBigJumpLoopTeamsPerCU() const {`
`1018`	`1018`	`llvm_unreachable("Unimplemented");`
`1019`	`1019`	`}`
	`1020`	`+ virtual uint32_t getXTeamRedTeamsPerCU() const {`
	`1021`	`+ llvm_unreachable("Unimplemented");`
	`1022`	`+ }`
`1020`	`1023`	`virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams() const {`
`1021`	`1024`	`llvm_unreachable("Unimplemented");`
`1022`	`1025`	`}`