Skip to content

Commit d9efd04

Browse files
dhruvachak authored and ronlieb committed
[OpenMP] [amdgpu] Scale up the number of teams for Xteam Reduction under an env-var.
Added an integer env-var LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS to control the number of teams for Xteam Reduction. The default is 0 (disabled). If set to 1, the plugin uses occupancy as the goal and scales up the number of teams when the blocksize is lower than the default blocksize. If set to a value greater than 1, that value is used directly as the factor by which the number of teams is scaled up. The default number of teams in the absence of any user input remains the number of CUs.

Change-Id: I622d26f1f4f7d12665eaa907be93f7dbd6418175
1 parent ad92c0d commit d9efd04

File tree

3 files changed

+42
-0
lines changed

3 files changed

+42
-0
lines changed

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,9 @@ constexpr int16_t MaxCUMultiplier = 32;
284284
// Maximum number of threads allowed per CU.
285285
constexpr int16_t MaxThreadsPerCU = 2048;
286286

287+
// Desired number of wavefronts per CU. Used as the occupancy target when
// the AMDGPU plugin scales up the number of teams for Xteam reduction.
288+
constexpr int16_t DesiredWavesPerCU = 16;
289+
287290
// Default block size, currently different from other kernel types.
288291
constexpr int16_t DefaultBlockSize = 1024;
289292

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
963963
XteamRedBlockSize > 0
964964
? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
965965
: llvm::omp::xteam_red::MaxCUMultiplier;
966+
if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
967+
CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
966968

967969
// Here's the default we use
968970
uint64_t NumGroups = DeviceNumCUs;
@@ -994,6 +996,29 @@ struct AMDGPUKernelTy : public GenericKernelTy {
994996
if (LoopTripCount > 0)
995997
NumGroupsFromTripCount =
996998
getNumGroupsFromThreadsAndTripCount(LoopTripCount, NumThreads);
999+
1000+
// Compute desired number of groups in the absence of user input
1001+
// based on a factor controlled by an integer env-var.
1002+
// 0: disabled (default)
1003+
// 1: If the number of waves is lower than the default, increase
1004+
// the number of teams proportionally. Ideally, this would be the
1005+
// default behavior.
1006+
// > 1: Use as the scaling factor for the number of teams.
1007+
// Note that the upper bound is MaxNumGroups.
1008+
uint32_t AdjustFactor =
1009+
GenericDevice.getOMPXAdjustNumTeamsForXteamRedSmallBlockSize();
1010+
if (NumThreads > 0 && AdjustFactor > 0) {
1011+
uint64_t DesiredNumGroups = NumGroups;
1012+
if (AdjustFactor == 1) {
1013+
DesiredNumGroups =
1014+
DeviceNumCUs *
1015+
(llvm::omp::xteam_red::DesiredWavesPerCU / NumWavesInGroup);
1016+
} else {
1017+
DesiredNumGroups = DeviceNumCUs * AdjustFactor;
1018+
}
1019+
NumGroups = DesiredNumGroups;
1020+
}
1021+
NumGroups = std::min(NumGroups, MaxNumGroups);
9971022
NumGroups = std::min(NumGroups, NumGroupsFromTripCount);
9981023

9991024
// If the user specifies a number of teams for low trip count loops,
@@ -2469,6 +2494,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
24692494
"LIBOMPTARGET_WAVES_PER_CU_FOR_LOW_TRIP_COUNT", 0),
24702495
OMPX_AdjustNumTeamsForSmallBlockSize("LIBOMPTARGET_AMDGPU_ADJUST_TEAMS",
24712496
0),
2497+
OMPX_AdjustNumTeamsForXteamRedSmallBlockSize(
2498+
"LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS", 0),
24722499
OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
24732500
1 * 1024 * 1024), // 1MB
24742501
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -2564,6 +2591,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25642591
virtual uint32_t getOMPXAdjustNumTeamsForSmallBlockSize() const override {
25652592
return OMPX_AdjustNumTeamsForSmallBlockSize;
25662593
}
2594+
/// Return the value of the LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS
/// envar, which defaults to 0 and controls how the number of teams is
/// scaled up for Xteam reduction.
virtual uint32_t
2595+
getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const override {
2596+
return OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
2597+
}
25672598

25682599
/// Initialize the device, its resources and get its properties.
25692600
Error initImpl(GenericPluginTy &Plugin) override {
@@ -3892,6 +3923,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
38923923
/// done.
38933924
UInt32Envar OMPX_AdjustNumTeamsForSmallBlockSize;
38943925

3926+
/// Envar to allow scaling up the number of teams for Xteam-Reduction
3927+
/// whenever the blocksize has been reduced from the default. The env-var
3928+
/// default of 0 means that the scaling is not done by default.
/// A value of 1 derives the number of teams from the desired number of
/// waves per CU and the number of waves in a group; a value greater than 1
/// is used directly as the multiplier on the number of CUs.
3929+
UInt32Envar OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
3930+
38953931
/// Envar specifying the maximum size in bytes where the memory copies are
38963932
/// asynchronous operations. Up to this transfer size, the memory copies are
38973933
/// asynchronous operations pushed to the corresponding stream. For larger

openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -903,6 +903,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
903903
virtual uint32_t getOMPXAdjustNumTeamsForSmallBlockSize() const {
904904
llvm_unreachable("Unimplemented");
905905
}
906+
/// Get the factor used to scale up the number of teams for Xteam
/// reduction. This base version is unimplemented and must not be reached;
/// a device that supports the setting overrides it.
virtual uint32_t getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const {
907+
llvm_unreachable("Unimplemented");
908+
}
906909

907910
/// Get target compute unit kind (e.g., sm_80, or gfx908).
908911
virtual std::string getComputeUnitKind() const { return "unknown"; }

0 commit comments

Comments
 (0)