Skip to content

Commit 441c5fc

Browse files
committed
[OpenMP] [Xteam Reduction] Compute number of teams based on whether fast reduction is enabled.
Clang will write a global variable indicating whether -fopenmp-target-fast-reduction was used during compile. If so, the number of teams allowed during kernel launch may be determined accordingly. Change-Id: Iba930f8d0cbfdb6a8ef376270a5c936c6f87d17e
1 parent d045641 commit 441c5fc

File tree

10 files changed

+159
-42
lines changed

10 files changed

+159
-42
lines changed

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9617,14 +9617,11 @@ static void emitTargetCallKernelLaunch(
96179617
OMPRTL_ompx_get_team_procs),
96189618
DevIdVal, "team_procs");
96199619

9620-
// Compute CUMultiplier = (Max threads per CU) / (Block size)
9621-
int64_t XteamRedBlockSize = CGF.CGM.getXteamRedBlockSize(D);
9622-
int64_t CUMultiplier =
9623-
XteamRedBlockSize > 0
9624-
? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
9625-
: llvm::omp::xteam_red::MaxCUMultiplier;
9626-
if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
9627-
CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
9620+
// Given the currently determined blocksize, compute the scaling
9621+
// factor for number of teams in terms of the number of CUs. This
9622+
// computation must stay in sync with the runtime.
9623+
uint32_t CUMultiplier = llvm::omp::xteam_red::getXteamRedCUMultiplier(
9624+
CGF.CGM.getXteamRedBlockSize(D));
96289625

96299626
llvm::Value *Int64CUMultiplier =
96309627
llvm::ConstantInt::get(CGF.Int64Ty, CUMultiplier);

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,19 @@ static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
10001000
CGM.addCompilerUsedGlobal(GVMode);
10011001
}
10021002

1003+
// Create a global variable to indicate whether fast reduction is enabled for
1004+
// this file. This variable is read by the runtime while determining the launch
1005+
// bounds.
1006+
static void setIsFastReduction(CodeGenModule &CGM) {
1007+
auto *GVFastReduction = new llvm::GlobalVariable(
1008+
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1009+
llvm::GlobalValue::WeakAnyLinkage,
1010+
llvm::ConstantInt::get(CGM.Int8Ty,
1011+
CGM.getLangOpts().OpenMPTargetFastReduction),
1012+
Twine("__omp_plugin_enable_fast_reduction"));
1013+
CGM.addCompilerUsedGlobal(GVFastReduction);
1014+
}
1015+
10031016
static OMPTgtExecModeFlags
10041017
computeExecutionMode(bool Mode, const Stmt *DirectiveStmt, CodeGenModule &CGM) {
10051018
if (!Mode)
@@ -1085,6 +1098,11 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
10851098
if (CGM.getLangOpts().OpenMPCUDAMode)
10861099
CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
10871100

1101+
// Write a global variable indicating whether fast reduction is enabled.
1102+
// This is done regardless of -nogpulib
1103+
if (!CGM.getLangOpts().OMPHostIRFile.empty())
1104+
setIsFastReduction(CGM);
1105+
10881106
llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
10891107
if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
10901108
return;

clang/test/OpenMP/declare_target_constexpr_codegen.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class A {
1818
//.
1919
// CHECK: @_ZN1A2piE = linkonce_odr constant double 0x400921FB54442D18, comdat, align 8
2020
// CHECK: @_ZL9anotherPi = internal constant double 3.140000e+00, align 8
21-
// CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
21+
// CHECK: @llvm.compiler.used = appending global [3 x ptr] [ptr @__omp_plugin_enable_fast_reduction, ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
2222
//.
2323
A() { ; }
2424
~A() { ; }

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,16 @@ constexpr int16_t DefaultBlockSize = 1024;
294294
// so that it is accessible for all targets.
295295
constexpr int16_t MaxBlockSize = 1024;
296296

297+
// Compute CUMultiplier = (Max threads per CU) / (Block size)
298+
static inline uint32_t getXteamRedCUMultiplier(uint32_t BlockSize) {
299+
uint32_t CUMultiplier =
300+
BlockSize > 0 ? llvm::omp::xteam_red::MaxThreadsPerCU / BlockSize
301+
: llvm::omp::xteam_red::MaxCUMultiplier;
302+
if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
303+
CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
304+
return CUMultiplier;
305+
}
306+
297307
} // end namespace xteam_red
298308

299309
/// A type of worksharing loop construct
@@ -306,6 +316,15 @@ enum class WorksharingLoopType {
306316
DistributeForStaticLoop
307317
};
308318

319+
static inline uint32_t getBlockSizeAsPowerOfTwo(uint32_t BlockSize) {
320+
uint32_t Tmp = BlockSize;
321+
do {
322+
BlockSize = Tmp;
323+
Tmp = BlockSize & (BlockSize - 1);
324+
} while (Tmp != 0);
325+
return BlockSize;
326+
}
327+
309328
} // end namespace omp
310329

311330
} // end namespace llvm

llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -120,15 +120,6 @@ static constexpr GV NVPTXGridValues = {
120120
128, // GV_Default_WG_Size
121121
};
122122

123-
static inline uint32_t getBlockSizeAsPowerOfTwo(uint32_t BlockSize) {
124-
uint32_t Tmp = BlockSize;
125-
do {
126-
BlockSize = Tmp;
127-
Tmp = BlockSize & (BlockSize - 1);
128-
} while (Tmp != 0);
129-
return BlockSize;
130-
}
131-
132123
} // namespace omp
133124
} // namespace llvm
134125

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -945,32 +945,26 @@ struct AMDGPUKernelTy : public GenericKernelTy {
945945
}
946946

947947
if (isXTeamReductionsMode()) {
948-
// Note: The plugin does not know whether XteamReduction is running in
949-
// fast mode. If fast mode, metadata is not used and the following
950-
// restrictions are not required. But since the plugin does not know, it
951-
// will assume that it is running in the default mode with constrained
952-
// metadata.
953-
954-
// The number of teams must not exceed the upper limit determined during
955-
// code generation. This upper limit is not currently communicated from
956-
// codegen to the plugin. So compute it here again, note that this must
957-
// be kept in sync with codegen.
958-
959-
// This is the block size that CodeGen used.
960-
uint32_t XteamRedBlockSize = ConstWGSize;
961-
962-
int32_t CUMultiplier =
963-
XteamRedBlockSize > 0
964-
? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
965-
: llvm::omp::xteam_red::MaxCUMultiplier;
966-
if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
967-
CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
968-
969-
// Here's the default we use
948+
// Here's the default number of teams.
970949
uint64_t NumGroups = DeviceNumCUs;
971-
972950
// The number of teams must not exceed this upper limit.
973-
uint64_t MaxNumGroups = DeviceNumCUs * CUMultiplier;
951+
uint64_t MaxNumGroups = NumGroups;
952+
if (GenericDevice.isFastReductionEnabled()) {
953+
// When fast reduction is enabled, the number of teams is capped by
954+
// the MaxCUMultiplier constant.
955+
MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
956+
} else {
957+
// When fast reduction is not enabled, the number of teams is capped
958+
// by the metadata that clang CodeGen created. The number of teams
959+
// used here must not exceed the upper limit determined during
960+
// CodeGen. This upper limit is not currently communicated from
961+
// CodeGen to the plugin. So it is re-computed here.
962+
963+
// ConstWGSize is the block size that CodeGen used.
964+
uint32_t CUMultiplier =
965+
llvm::omp::xteam_red::getXteamRedCUMultiplier(ConstWGSize);
966+
MaxNumGroups = DeviceNumCUs * CUMultiplier;
967+
}
974968

975969
// Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
976970
// type, if possible.
@@ -1029,6 +1023,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10291023
NumGroups = std::min(MaxNumGroups, LowTripCountBlocks);
10301024
}
10311025
}
1026+
DP("xteam-red:NumCUs=%lu xteam-red:NumGroups=%lu\n", DeviceNumCUs,
1027+
NumGroups);
10321028
return NumGroups;
10331029
}
10341030

openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
950950
bool useAutoZeroCopy();
951951
virtual bool useAutoZeroCopyImpl() { return false; }
952952

953+
bool isFastReductionEnabled() const { return IsFastReductionEnabled; }
954+
953955
private:
954956
/// Get and set the stack size and heap size for the device. If not used, the
955957
/// plugin can implement the setters as no-op and setting the output
@@ -1045,6 +1047,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
10451047

10461048
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
10471049
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
1050+
1051+
bool IsFastReductionEnabled = false;
10481052
};
10491053

10501054
/// Class implementing common functionalities of offload plugins. Each plugin

openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,17 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
10181018
DevEnvGlobal.getName().data());
10191019
consumeError(std::move(Err));
10201020
}
1021+
1022+
// From the image, read whether fast reduction is enabled.
1023+
StaticGlobalTy<int8_t> IsFastRedGlobal("__omp_plugin_enable_fast_reduction");
1024+
if (auto Err = GHandler.readGlobalFromImage(*this, Image, IsFastRedGlobal)) {
1025+
DP("Missing symbol %s, continue execution anyway.\n",
1026+
IsFastRedGlobal.getName().data());
1027+
consumeError(std::move(Err));
1028+
} else {
1029+
IsFastReductionEnabled = IsFastRedGlobal.getValue();
1030+
}
1031+
10211032
return Plugin::success();
10221033
}
10231034

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// clang-format off
2+
// This test verifies that the reduction kernel is of Xteam-reduction type
3+
// and is launched with 1920 teams and 8 threads in each team.
4+
//
5+
// RUN: %libomptarget-compile-generic -fopenmp-target-fast -fopenmp-target-fast-reduction
6+
// RUN: env LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT=15360 LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS=32 \
7+
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
8+
9+
// UNSUPPORTED: nvptx64-nvidia-cuda
10+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
11+
// UNSUPPORTED: aarch64-unknown-linux-gnu
12+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
13+
// UNSUPPORTED: x86_64-pc-linux-gnu
14+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
15+
16+
// clang-format on
17+
#include <stdio.h>
18+
19+
int main() {
20+
int N = 15360;
21+
22+
double a[N];
23+
24+
for (int i = 0; i < N; i++)
25+
a[i] = i;
26+
27+
double sum1;
28+
sum1 = 0;
29+
30+
#pragma omp target teams distribute parallel for map(tofrom:sum1) reduction(+:sum1)
31+
for (int j = 0; j < N; j = j + 1)
32+
sum1 += a[j];
33+
34+
printf("sum1=%f\n", sum1);
35+
36+
return 0;
37+
}
38+
// clang-format off
39+
/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8
40+
/// CHECK: teamsXthrds:(1920X 8)
41+
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// clang-format off
2+
// This test verifies that the reduction kernel is of Xteam reduction
3+
// type and is launched with as many teams as the number of CUs.
4+
// RUN: %libomptarget-compile-generic -fopenmp-target-fast
5+
// RUN: env LIBOMPTARGET_DEBUG=1 \
6+
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
7+
8+
// UNSUPPORTED: nvptx64-nvidia-cuda
9+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
10+
// UNSUPPORTED: aarch64-unknown-linux-gnu
11+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
12+
// UNSUPPORTED: x86_64-pc-linux-gnu
13+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
14+
15+
// clang-format on
16+
#include <stdio.h>
17+
18+
int main() {
19+
int N = 1000000;
20+
21+
double a[N];
22+
23+
for (int i = 0; i < N; i++)
24+
a[i] = i;
25+
26+
double sum1;
27+
sum1 = 0;
28+
29+
#pragma omp target teams distribute parallel for map(tofrom:sum1) reduction(+:sum1)
30+
for (int j = 0; j < N; j = j + 1)
31+
sum1 += a[j];
32+
33+
printf("sum1=%f\n", sum1);
34+
35+
return 0;
36+
}
37+
// clang-format off
38+
/// CHECK: xteam-red:NumCUs=[[CU_COUNT:[0-9]+]]
39+
/// CHECK: xteam-red:NumGroups=[[CU_COUNT]]
40+

0 commit comments

Comments
 (0)