@@ -1095,11 +1095,19 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10951095 // Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
10961096 // type, if possible.
10971097 int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
// CU multiplier from the LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU envar.
1099+ uint32_t EnvarCUMultiplier = GenericDevice.getXTeamRedTeamsPerCU ();
1100+ // Disabled if the value is 0.
1101+ if (EnvarCUMultiplier == 0 ) {
1102+ EnvarCUMultiplier = UINT_MAX;
1103+ }
10981104
10991105 if (GenericDevice.isFastReductionEnabled ()) {
11001106 // When fast reduction is enabled, the number of teams is capped by
11011107 // the MaxCUMultiplier constant.
1102- MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1108+ MaxNumGroups =
1109+ DeviceNumCUs * std::min (llvm::omp::xteam_red::MaxCUMultiplier,
1110+ static_cast <int16_t >(EnvarCUMultiplier));
11031111 } else {
11041112 // When fast reduction is not enabled, the number of teams is capped
11051113 // by the metadata that clang CodeGen created. The number of teams
@@ -1110,7 +1118,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11101118 // ConstWGSize is the block size that CodeGen used.
11111119 uint32_t CUMultiplier =
11121120 llvm::omp::xteam_red::getXteamRedCUMultiplier (ConstWGSize);
1113- MaxNumGroups = DeviceNumCUs * CUMultiplier;
1121+ MaxNumGroups = DeviceNumCUs * std::min ( CUMultiplier, EnvarCUMultiplier) ;
11141122 }
11151123
11161124 // If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -2915,6 +2923,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29152923 " LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU" , 6 ),
29162924 OMPX_BigJumpLoopTeamsPerCU (
29172925 " LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU" , 0 ),
2926+ OMPX_XTeamRedTeamsPerCU (" LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU" ,
2927+ 0 ),
29182928 OMPX_BigJumpLoopMaxTotalTeams (
29192929 " LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_MAX_TOTAL_TEAMS" , 1024 * 1024 ),
29202930 OMPX_LowTripCount (" LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT" , 9000 ),
@@ -2980,6 +2990,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29802990 virtual uint32_t getOMPXBigJumpLoopTeamsPerCU () const override {
29812991 return OMPX_BigJumpLoopTeamsPerCU;
29822992 }
2993+ virtual uint32_t getXTeamRedTeamsPerCU () const override {
2994+ return OMPX_XTeamRedTeamsPerCU;
2995+ }
29832996 virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams () const override {
29842997 return OMPX_BigJumpLoopMaxTotalTeams;
29852998 }
@@ -4427,6 +4440,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
44274440 // / OMPX_BigJumpLoopTeamsPerCU * #CUs.
44284441 UInt32Envar OMPX_BigJumpLoopTeamsPerCU;
44294442
/// Envar capping the number of teams relative to the number of compute
/// units (CUs) for cross-team-reduction kernels. 0 indicates that this
/// value is not specified. If non-zero, the per-CU team multiplier used to
/// compute the maximum number of teams is limited to this value, i.e. the
/// maximum is at most OMPX_XTeamRedTeamsPerCU * #CUs.
4447+ UInt32Envar OMPX_XTeamRedTeamsPerCU;
4448+
44304449 // / Envar controlling the maximum number of teams per device for
44314450 // / Big-Jump-Loop kernels.
44324451 UInt32Envar OMPX_BigJumpLoopMaxTotalTeams;
0 commit comments