@@ -963,6 +963,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
963
963
XteamRedBlockSize > 0
964
964
? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
965
965
: llvm::omp::xteam_red::MaxCUMultiplier;
966
+ if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
967
+ CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
966
968
967
969
// Here's the default we use
968
970
uint64_t NumGroups = DeviceNumCUs;
@@ -994,6 +996,29 @@ struct AMDGPUKernelTy : public GenericKernelTy {
994
996
if (LoopTripCount > 0 )
995
997
NumGroupsFromTripCount =
996
998
getNumGroupsFromThreadsAndTripCount (LoopTripCount, NumThreads);
999
+
1000
+ // Compute desired number of groups in the absence of user input
1001
+ // based on a factor controlled by an integer env-var.
1002
+ // 0: disabled (default)
1003
+ // 1: If the number of waves is lower than the default, increase
1004
+ // the number of teams proportionally. Ideally, this would be the
1005
+ // default behavior.
1006
+ // > 1: Use as the scaling factor for the number of teams.
1007
+ // Note that the upper bound is MaxNumGroups.
1008
+ uint32_t AdjustFactor =
1009
+ GenericDevice.getOMPXAdjustNumTeamsForXteamRedSmallBlockSize ();
1010
+ if (NumThreads > 0 && AdjustFactor > 0 ) {
1011
+ uint64_t DesiredNumGroups = NumGroups;
1012
+ if (AdjustFactor == 1 ) {
1013
+ DesiredNumGroups =
1014
+ DeviceNumCUs *
1015
+ (llvm::omp::xteam_red::DesiredWavesPerCU / NumWavesInGroup);
1016
+ } else {
1017
+ DesiredNumGroups = DeviceNumCUs * AdjustFactor;
1018
+ }
1019
+ NumGroups = DesiredNumGroups;
1020
+ }
1021
+ NumGroups = std::min (NumGroups, MaxNumGroups);
997
1022
NumGroups = std::min (NumGroups, NumGroupsFromTripCount);
998
1023
999
1024
// If the user specifies a number of teams for low trip count loops,
@@ -2469,6 +2494,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2469
2494
" LIBOMPTARGET_WAVES_PER_CU_FOR_LOW_TRIP_COUNT" , 0 ),
2470
2495
OMPX_AdjustNumTeamsForSmallBlockSize (" LIBOMPTARGET_AMDGPU_ADJUST_TEAMS" ,
2471
2496
0 ),
2497
+ OMPX_AdjustNumTeamsForXteamRedSmallBlockSize (
2498
+ " LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS" , 0 ),
2472
2499
OMPX_MaxAsyncCopyBytes (" LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" ,
2473
2500
1 * 1024 * 1024 ), // 1MB
2474
2501
OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
@@ -2564,6 +2591,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2564
2591
/// Accessor for the OMPX_AdjustNumTeamsForSmallBlockSize envar member, which
/// is initialized from LIBOMPTARGET_AMDGPU_ADJUST_TEAMS with a default of 0.
/// Returns the member's current value as a uint32_t.
virtual uint32_t getOMPXAdjustNumTeamsForSmallBlockSize () const override {
2565
2592
return OMPX_AdjustNumTeamsForSmallBlockSize;
2566
2593
}
2594
/// Accessor for the OMPX_AdjustNumTeamsForXteamRedSmallBlockSize envar
/// member, which is initialized from
/// LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS with a default of 0.
/// The kernel launch code reads this value as the team-count adjustment
/// factor for Xteam reductions: 0 leaves the adjustment disabled, 1 scales
/// the number of teams based on waves per group, and values greater than 1
/// are used directly as the per-CU scaling factor.
+ virtual uint32_t
2595
+ getOMPXAdjustNumTeamsForXteamRedSmallBlockSize () const override {
2596
+ return OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
2597
+ }
2567
2598
2568
2599
// / Initialize the device, its resources and get its properties.
2569
2600
Error initImpl (GenericPluginTy &Plugin) override {
@@ -3892,6 +3923,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3892
3923
// / done.
3893
3924
UInt32Envar OMPX_AdjustNumTeamsForSmallBlockSize;
3894
3925
3926
+ // / Envar to allow scaling up the number of teams for Xteam-Reduction
3927
+ // / whenever the blocksize has been reduced from the default. The env-var
3928
+ // / default of 0 means that the scaling is not done by default.
3929
+ UInt32Envar OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
3930
+
3895
3931
// / Envar specifying the maximum size in bytes where the memory copies are
3896
3932
// / asynchronous operations. Up to this transfer size, the memory copies are
3897
3933
// / asynchronous operations pushed to the corresponding stream. For larger
0 commit comments