@@ -1101,12 +1101,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11011101 if (GenericDevice.isFastReductionEnabled ()) {
11021102 // When fast reduction is enabled, the number of teams is capped by
11031103 // the MaxCUMultiplier constant.
1104- // When envar is enabled, use it for computing MaxNumGroup.
1105- if (EnvarCUMultiplier > 0 )
1106- MaxNumGroups = DeviceNumCUs * EnvarCUMultiplier;
1107- else
1108- MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1109-
1104+ MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
11101105 } else {
11111106 // When fast reduction is not enabled, the number of teams is capped
11121107 // by the metadata that clang CodeGen created. The number of teams
@@ -1117,13 +1112,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11171112 // ConstWGSize is the block size that CodeGen used.
11181113 uint32_t CUMultiplier =
11191114 llvm::omp::xteam_red::getXteamRedCUMultiplier (ConstWGSize);
1120-
1121- if (EnvarCUMultiplier > 0 ) {
1122- MaxNumGroups =
1123- DeviceNumCUs * std::min (CUMultiplier, EnvarCUMultiplier);
1124- } else {
1125- MaxNumGroups = DeviceNumCUs * CUMultiplier;
1126- }
1115+ MaxNumGroups = DeviceNumCUs * CUMultiplier;
11271116 }
11281117
11291118 // If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -1178,6 +1167,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11781167 }
11791168 NumGroups = DesiredNumGroups;
11801169 }
1170+
1171+ // Prefer OMPX_AdjustNumTeamsForXteamRedSmallBlockSize over
1172+ // OMPX_XTeamRedTeamsPerCU.
1173+ if (AdjustFactor == 0 && EnvarCUMultiplier > 0 )
1174+ NumGroups = DeviceNumCUs * EnvarCUMultiplier;
1175+
11811176 NumGroups = std::min (NumGroups, MaxNumGroups);
11821177 NumGroups = std::min (NumGroups, NumGroupsFromTripCount);
11831178
0 commit comments