Skip to content

Commit 524abee

Browse files
Kewen12 authored and ronlieb committed
[OpenMP][Offload][AMDGPU] Enable occupancy-based optimization for cross-team reduction kernel
This patch introduces a new runtime flag, the environment variable OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT (disabled by default), to enable the occupancy-based optimization on the cross-team reduction kernel (SGN=8). A new test covering this case will be added in a PR to the AOMP repo. Change-Id: Ifb22be855d7b14d199784117351b891e55723e0c
1 parent 4c46129 commit 524abee

File tree

1 file changed

+19
-4
lines changed
  • offload/plugins-nextgen/amdgpu/src

1 file changed

+19
-4
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -747,7 +747,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
747747
HostServiceBufferHandler(Handler),
748748
OMPX_SPMDOccupancyBasedOpt("OMPX_SPMD_OCCUPANCY_BASED_OPT", false),
749749
OMPX_BigJumpLoopOccupancyBasedOpt(
750-
"OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT", false) {}
750+
"OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT", false),
751+
OMPX_XTeamReductionOccupancyBasedOpt(
752+
"OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT", false) {}
751753

752754
/// Initialize the AMDGPU kernel.
753755
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -880,6 +882,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
880882
/// Envar to enable occupancy-based optimization for big jump loop.
881883
BoolEnvar OMPX_BigJumpLoopOccupancyBasedOpt;
882884

885+
/// Envar to enable occupancy-based optimization for cross team reduction.
886+
BoolEnvar OMPX_XTeamReductionOccupancyBasedOpt;
887+
883888
private:
884889
/// The kernel object to execute.
885890
uint64_t KernelObject;
@@ -1104,6 +1109,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11041109
uint64_t NumGroups = DeviceNumCUs;
11051110
// The number of teams must not exceed this upper limit.
11061111
uint64_t MaxNumGroups = NumGroups;
1112+
// Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
1113+
// type, if possible.
1114+
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
1115+
11071116
if (GenericDevice.isFastReductionEnabled()) {
11081117
// When fast reduction is enabled, the number of teams is capped by
11091118
// the MaxCUMultiplier constant.
@@ -1121,9 +1130,15 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11211130
MaxNumGroups = DeviceNumCUs * CUMultiplier;
11221131
}
11231132

1124-
// Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
1125-
// type, if possible.
1126-
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
1133+
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
1134+
// OMP_NUM_TEAMS is specified, optimize the num of teams based on
1135+
// occupancy value.
1136+
if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0) {
1137+
uint64_t newNumTeams =
1138+
OptimizeNumTeamsBaseOccupancy(GenericDevice, NumThreads);
1139+
1140+
return std::min(newNumTeams, MaxNumGroups);
1141+
}
11271142

11281143
// Prefer num_teams clause over environment variable. There is a corner
11291144
// case where inspite of the presence of a num_teams clause, CodeGen

0 commit comments

Comments
 (0)