Skip to content

Commit 39be8c6

Browse files
authored
[OpenMP] [amdgpu] Implement occupancy-based num_teams selection for generic-SPMD mode kernels. (llvm#4060)
2 parents 56f1b63 + f2be466 commit 39be8c6

File tree

1 file changed

+33
-18
lines changed
  • offload/plugins-nextgen/amdgpu/src

1 file changed

+33
-18
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
745745
AMDGPUKernelTy(const char *Name, GenericGlobalHandlerTy &Handler)
746746
: GenericKernelTy(Name),
747747
OMPX_SPMDOccupancyBasedOpt("OMPX_SPMD_OCCUPANCY_BASED_OPT", false),
748+
OMPX_GenericSPMDOccupancyBasedOpt(
749+
"OMPX_GENERIC_SPMD_OCCUPANCY_BASED_OPT", false),
748750
OMPX_BigJumpLoopOccupancyBasedOpt(
749751
"OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT", false),
750752
OMPX_XTeamReductionOccupancyBasedOpt(
@@ -888,6 +890,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
888890
/// Envar to enable occupancy-based optimization for SPMD kernel.
889891
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
890892

893+
/// Envar to enable occupancy-based optimization for generic SPMD kernel.
894+
BoolEnvar OMPX_GenericSPMDOccupancyBasedOpt;
895+
891896
/// Envar to enable occupancy-based optimization for big jump loop.
892897
BoolEnvar OMPX_BigJumpLoopOccupancyBasedOpt;
893898

@@ -1060,14 +1065,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10601065

10611066
if (isBigJumpLoopMode()) {
10621067
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
1063-
1064-
// If envar OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT is set and no
1065-
// OMP_NUM_TEAMS is specified, optimize the num of teams based on
1066-
// occupancy value.
1067-
if (OMPX_BigJumpLoopOccupancyBasedOpt && NumTeamsEnvVar == 0) {
1068-
return OptimizeNumTeamsBaseOccupancy(GenericDevice, NumThreads);
1069-
}
1070-
10711068
uint64_t NumGroups = 1;
10721069
// Cannot assert a non-zero tripcount. Instead, launch with 1 team if the
10731070
// tripcount is indeed zero.
@@ -1111,6 +1108,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11111108
NumGroups = LowTripCountBlocks;
11121109
}
11131110
}
1111+
// If envar OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT is set and no num_teams
1112+
// clause or OMP_NUM_TEAMS is specified, optimize the number of teams
1113+
// based on occupancy value.
1114+
if (OMPX_BigJumpLoopOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1115+
NumTeamsClause[0] == 0) {
1116+
return std::min(NumGroups, OptimizeNumTeamsBaseOccupancy(GenericDevice,
1117+
NumThreads));
1118+
}
11141119
return std::min(NumGroups,
11151120
static_cast<uint64_t>(GenericDevice.getBlockLimit()));
11161121
}
@@ -1144,12 +1149,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11441149
}
11451150

11461151
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
1147-
// OMP_NUM_TEAMS is specified, optimize the num of teams based on
1148-
// occupancy value.
1149-
if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0) {
1152+
// OMP_NUM_TEAMS or num_teams clause is specified, optimize the number of
1153+
// teams based on occupancy value.
1154+
if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1155+
NumTeamsClause[0] == 0) {
11501156
uint64_t newNumTeams =
11511157
OptimizeNumTeamsBaseOccupancy(GenericDevice, NumThreads);
1152-
11531158
return std::min(newNumTeams, MaxNumGroups);
11541159
}
11551160

@@ -1222,10 +1227,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12221227
// If envar OMPX_SPMD_OCCUPANCY_BASED_OPT is set and neither OMP_NUM_TEAMS
12231228
// nor a num_teams clause is specified, choose num_teams based on occupancy.
12241229
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
1225-
if (isSPMDMode() && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0) {
1226-
return OptimizeNumTeamsBaseOccupancy(GenericDevice, NumThreads);
1227-
}
1228-
12291230
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
12301231
if (LoopTripCount > 0) {
12311232
if (isSPMDMode()) {
@@ -1253,6 +1254,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12531254
}
12541255
}
12551256

1257+
if (isSPMDMode() && OMPX_SPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 &&
1258+
NumTeamsClause[0] == 0) {
1259+
return std::min(TripCountNumBlocks,
1260+
OptimizeNumTeamsBaseOccupancy(GenericDevice, NumThreads));
1261+
}
1262+
12561263
auto getAdjustedDefaultNumBlocks =
12571264
[this](GenericDeviceTy &GenericDevice,
12581265
uint64_t DeviceNumCUs) -> uint64_t {
@@ -1286,9 +1293,17 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12861293
}
12871294

12881295
uint64_t PreferredNumBlocks = TripCountNumBlocks;
1289-
// If the loops are long running we rather reuse blocks than spawn too many.
1290-
if (GenericDevice.getReuseBlocksForHighTripCount())
1296+
// Occupancy-based setting overrides block reuse.
1297+
if (OMPX_GenericSPMDOccupancyBasedOpt && NumTeamsEnvVar == 0 && NumTeamsClause[0] == 0) {
1298+
PreferredNumBlocks =
1299+
std::min(PreferredNumBlocks,
1300+
OptimizeNumTeamsBaseOccupancy(GenericDevice, NumThreads));
1301+
} else if (GenericDevice.getReuseBlocksForHighTripCount()) {
1302+
// If the loops are long running we rather reuse blocks than spawn too
1303+
// many.
12911304
PreferredNumBlocks = std::min(TripCountNumBlocks, AdjustedNumBlocks);
1305+
}
1306+
12921307
return std::min(PreferredNumBlocks,
12931308
(uint64_t)GenericDevice.getBlockLimit());
12941309
}

0 commit comments

Comments
 (0)