@@ -747,7 +747,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
747747 HostServiceBufferHandler(Handler),
748748 OMPX_SPMDOccupancyBasedOpt(" OMPX_SPMD_OCCUPANCY_BASED_OPT" , false ),
749749 OMPX_BigJumpLoopOccupancyBasedOpt(
750- " OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT" , false ) {}
750+ " OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT" , false ),
751+ OMPX_XTeamReductionOccupancyBasedOpt(
752+ " OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT" , false ) {}
751753
752754 // / Initialize the AMDGPU kernel.
753755 Error initImpl (GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -880,6 +882,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
880882 // / Envar to enable occupancy-based optimization for big jump loop.
881883 BoolEnvar OMPX_BigJumpLoopOccupancyBasedOpt;
882884
885+ // / Envar to enable occupancy-based optimization for cross-team reduction.
886+ BoolEnvar OMPX_XTeamReductionOccupancyBasedOpt;
887+
883888private:
884889 // / The kernel object to execute.
885890 uint64_t KernelObject;
@@ -1104,6 +1109,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11041109 uint64_t NumGroups = DeviceNumCUs;
11051110 // The number of teams must not exceed this upper limit.
11061111 uint64_t MaxNumGroups = NumGroups;
1112+ // Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
1113+ // type, if possible.
1114+ int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1115+
11071116 if (GenericDevice.isFastReductionEnabled ()) {
11081117 // When fast reduction is enabled, the number of teams is capped by
11091118 // the MaxCUMultiplier constant.
@@ -1121,9 +1130,15 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11211130 MaxNumGroups = DeviceNumCUs * CUMultiplier;
11221131 }
11231132
1124- // Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
1125- // type, if possible.
1126- int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1133+ // If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
1134+ // OMP_NUM_TEAMS is specified, optimize the number of teams based on
1135+ // occupancy value.
1136+ if (OMPX_XTeamReductionOccupancyBasedOpt && NumTeamsEnvVar == 0 ) {
1137+ uint64_t newNumTeams =
1138+ OptimizeNumTeamsBaseOccupancy (GenericDevice, NumThreads);
1139+
1140+ return std::min (newNumTeams, MaxNumGroups);
1141+ }
11271142
11281143 // Prefer num_teams clause over environment variable. There is a corner
11291144 // case where in spite of the presence of a num_teams clause, CodeGen
0 commit comments