@@ -745,6 +745,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
745
745
AMDGPUKernelTy (const char *Name, GenericGlobalHandlerTy &Handler)
746
746
: GenericKernelTy(Name),
747
747
OMPX_SPMDOccupancyBasedOpt (" OMPX_SPMD_OCCUPANCY_BASED_OPT" , false ),
748
+ OMPX_GenericSPMDOccupancyBasedOpt(
749
+ " OMPX_GENERIC_SPMD_OCCUPANCY_BASED_OPT" , false ),
748
750
OMPX_BigJumpLoopOccupancyBasedOpt(
749
751
" OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT" , false ),
750
752
OMPX_XTeamReductionOccupancyBasedOpt(
@@ -888,6 +890,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
888
890
/// Envar to enable occupancy-based optimization for SPMD kernel.
889
891
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
890
892
893
+ /// Envar to enable occupancy-based optimization for generic SPMD kernel.
894
+ BoolEnvar OMPX_GenericSPMDOccupancyBasedOpt;
895
+
891
896
/// Envar to enable occupancy-based optimization for big jump loop.
892
897
BoolEnvar OMPX_BigJumpLoopOccupancyBasedOpt;
893
898
@@ -1060,14 +1065,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1060
1065
1061
1066
if (isBigJumpLoopMode ()) {
1062
1067
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1063
-
1064
- // If envar OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT is set and no
1065
- // OMP_NUM_TEAMS is specified, optimize the num of teams based on
1066
- // occupancy value.
1067
- if (OMPX_BigJumpLoopOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1068
- return OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1069
- }
1070
-
1071
1068
uint64_t NumGroups = 1 ;
1072
1069
// Cannot assert a non-zero tripcount. Instead, launch with 1 team if the
1073
1070
// tripcount is indeed zero.
@@ -1111,6 +1108,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1111
1108
NumGroups = LowTripCountBlocks;
1112
1109
}
1113
1110
}
1111
+ // If envar OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT is set and no num_teams
1112
+ // clause or OMP_NUM_TEAMS is specified, optimize the number of teams
1113
+ // based on occupancy value.
1114
+ if (OMPX_BigJumpLoopOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1115
+ NumTeamsClause[0 ] == 0 ) {
1116
+ return std::min (NumGroups, OptimizeNumTeamsBaseOccupancy (GenericDevice,
1117
+ NumThreads));
1118
+ }
1114
1119
return std::min (NumGroups,
1115
1120
static_cast <uint64_t >(GenericDevice.getBlockLimit ()));
1116
1121
}
@@ -1144,12 +1149,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1144
1149
}
1145
1150
1146
1151
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
1147
- // OMP_NUM_TEAMS is specified, optimize the num of teams based on
1148
- // occupancy value.
1149
- if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1152
+ // OMP_NUM_TEAMS or num_teams clause is specified, optimize the num of
1153
+ // teams based on occupancy value.
1154
+ if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1155
+ NumTeamsClause[0 ] == 0 ) {
1150
1156
uint64_t newNumTeams =
1151
1157
OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1152
-
1153
1158
return std::min (newNumTeams, MaxNumGroups);
1154
1159
}
1155
1160
@@ -1222,10 +1227,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1222
1227
// If envar OMPX_SPMD_OCCUPANCY_BASED_OPT is set and no OMP_NUM_TEAMS is
1223
1228
// specified, optimize the num of teams based on occupancy value.
1224
1229
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1225
- if (isSPMDMode () && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1226
- return OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1227
- }
1228
-
1229
1230
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t >::max ();
1230
1231
if (LoopTripCount > 0 ) {
1231
1232
if (isSPMDMode ()) {
@@ -1253,6 +1254,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1253
1254
}
1254
1255
}
1255
1256
1257
+ if (isSPMDMode () && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1258
+ NumTeamsClause[0 ] == 0 ) {
1259
+ return std::min (TripCountNumBlocks,
1260
+ OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads));
1261
+ }
1262
+
1256
1263
auto getAdjustedDefaultNumBlocks =
1257
1264
[this ](GenericDeviceTy &GenericDevice,
1258
1265
uint64_t DeviceNumCUs) -> uint64_t {
@@ -1286,9 +1293,17 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1286
1293
}
1287
1294
1288
1295
uint64_t PreferredNumBlocks = TripCountNumBlocks;
1289
- // If the loops are long running we rather reuse blocks than spawn too many.
1290
- if (GenericDevice.getReuseBlocksForHighTripCount ())
1296
+ // Occupancy-based setting overrides block reuse.
1297
+ if (OMPX_GenericSPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 && NumTeamsClause[0 ] == 0 ) {
1298
+ PreferredNumBlocks =
1299
+ std::min (PreferredNumBlocks,
1300
+ OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads));
1301
+ } else if (GenericDevice.getReuseBlocksForHighTripCount ()) {
1302
+ // If the loops are long running, we would rather reuse blocks than spawn
1303
+ // too many.
1291
1304
PreferredNumBlocks = std::min (TripCountNumBlocks, AdjustedNumBlocks);
1305
+ }
1306
+
1292
1307
return std::min (PreferredNumBlocks,
1293
1308
(uint64_t )GenericDevice.getBlockLimit ());
1294
1309
}
0 commit comments