@@ -745,6 +745,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
745745 AMDGPUKernelTy (const char *Name, GenericGlobalHandlerTy &Handler)
746746 : GenericKernelTy(Name),
747747 OMPX_SPMDOccupancyBasedOpt (" OMPX_SPMD_OCCUPANCY_BASED_OPT" , false ),
748+ OMPX_GenericSPMDOccupancyBasedOpt(
749+ " OMPX_GENERIC_SPMD_OCCUPANCY_BASED_OPT" , false ),
748750 OMPX_BigJumpLoopOccupancyBasedOpt(
749751 " OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT" , false ),
750752 OMPX_XTeamReductionOccupancyBasedOpt(
@@ -888,6 +890,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
888890 /// Envar to enable occupancy-based optimization for SPMD kernel.
889891 BoolEnvar OMPX_SPMDOccupancyBasedOpt;
890892
893+ /// Envar to enable occupancy-based optimization for generic SPMD kernel.
894+ BoolEnvar OMPX_GenericSPMDOccupancyBasedOpt;
895+
891896 /// Envar to enable occupancy-based optimization for big jump loop.
892897 BoolEnvar OMPX_BigJumpLoopOccupancyBasedOpt;
893898
@@ -1060,14 +1065,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10601065
10611066 if (isBigJumpLoopMode ()) {
10621067 int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1063-
1064- // If envar OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT is set and no
1065- // OMP_NUM_TEAMS is specified, optimize the num of teams based on
1066- // occupancy value.
1067- if (OMPX_BigJumpLoopOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1068- return OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1069- }
1070-
10711068 uint64_t NumGroups = 1 ;
10721069 // Cannot assert a non-zero tripcount. Instead, launch with 1 team if the
10731070 // tripcount is indeed zero.
@@ -1111,6 +1108,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11111108 NumGroups = LowTripCountBlocks;
11121109 }
11131110 }
1111+ // If envar OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT is set and no num_teams
1112+ // clause or OMP_NUM_TEAMS is specified, optimize the number of teams
1113+ // based on occupancy value.
1114+ if (OMPX_BigJumpLoopOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1115+ NumTeamsClause[0 ] == 0 ) {
1116+ return std::min (NumGroups, OptimizeNumTeamsBaseOccupancy (GenericDevice,
1117+ NumThreads));
1118+ }
11141119 return std::min (NumGroups,
11151120 static_cast <uint64_t >(GenericDevice.getBlockLimit ()));
11161121 }
@@ -1144,12 +1149,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11441149 }
11451150
11461151 // If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
1147- // OMP_NUM_TEAMS is specified, optimize the num of teams based on
1148- // occupancy value.
1149- if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1152+ // OMP_NUM_TEAMS or num_teams clause is specified, optimize the num of
1153+ // teams based on occupancy value.
1154+ if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1155+ NumTeamsClause[0 ] == 0 ) {
11501156 uint64_t newNumTeams =
11511157 OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1152-
11531158 return std::min (newNumTeams, MaxNumGroups);
11541159 }
11551160
@@ -1222,10 +1227,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12221227 // If envar OMPX_SPMD_OCCUPANCY_BASED_OPT is set and neither OMP_NUM_TEAMS nor
12231228 // a num_teams clause is specified, the team count is capped by occupancy below.
12241229 int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1225- if (isSPMDMode () && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1226- return OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1227- }
1228-
12291230 uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t >::max ();
12301231 if (LoopTripCount > 0 ) {
12311232 if (isSPMDMode ()) {
@@ -1253,6 +1254,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12531254 }
12541255 }
12551256
1257+ if (isSPMDMode () && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1258+ NumTeamsClause[0 ] == 0 ) {
1259+ return std::min (TripCountNumBlocks,
1260+ OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads));
1261+ }
1262+
12561263 auto getAdjustedDefaultNumBlocks =
12571264 [this ](GenericDeviceTy &GenericDevice,
12581265 uint64_t DeviceNumCUs) -> uint64_t {
@@ -1286,9 +1293,17 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12861293 }
12871294
12881295 uint64_t PreferredNumBlocks = TripCountNumBlocks;
1289- // If the loops are long running we rather reuse blocks than spawn too many.
1290- if (GenericDevice.getReuseBlocksForHighTripCount ())
1296+ // Occupancy-based setting overrides block reuse.
1297+ if (OMPX_GenericSPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 && NumTeamsClause[0 ] == 0 ) {
1298+ PreferredNumBlocks =
1299+ std::min (PreferredNumBlocks,
1300+ OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads));
1301+ } else if (GenericDevice.getReuseBlocksForHighTripCount ()) {
1302+ // If the loops are long running we rather reuse blocks than spawn too
1303+ // many.
12911304 PreferredNumBlocks = std::min (TripCountNumBlocks, AdjustedNumBlocks);
1305+ }
1306+
12921307 return std::min (PreferredNumBlocks,
12931308 (uint64_t )GenericDevice.getBlockLimit ());
12941309 }
0 commit comments