Skip to content

Commit 0ba57c8

Browse files
authored
[OpenMP] Pass min/max thread and team count to the OMPIRBuilder (#70247)
We now provide the information about the min/max thread and team count to the OMPIRBuilder, no matter what the source was. That means we unify `thread_limit`, `num_teams`, `num_threads` handling with the target-specific attributes (`__launch_bounds__` and `amdgpu_flat_work_group_size`). This is in preparation to pass the values to the runtime, and to allow the middle-end (OpenMP-opt) to tighten the values if it seems appropriate. There is no "real" change after this commit.
1 parent 57cebc7 commit 0ba57c8

File tree

12 files changed

+1322
-1165
lines changed

12 files changed

+1322
-1165
lines changed

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6021,15 +6021,46 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
60216021
};
60226022

60236023
// Get NumTeams and ThreadLimit attributes
6024-
int32_t DefaultValTeams = -1;
6025-
uint32_t DefaultValThreads = UINT32_MAX;
6026-
getNumTeamsExprForTargetDirective(CGF, D, DefaultValTeams);
6027-
getNumThreadsExprForTargetDirective(CGF, D, DefaultValThreads,
6024+
int32_t DefaultValMinTeams = 1;
6025+
int32_t DefaultValMaxTeams = -1;
6026+
uint32_t DefaultValMinThreads = 1;
6027+
uint32_t DefaultValMaxThreads = UINT32_MAX;
6028+
6029+
getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams,
6030+
DefaultValMaxTeams);
6031+
getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads,
60286032
/*UpperBoundOnly=*/true);
60296033

6030-
OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
6031-
DefaultValTeams, DefaultValThreads,
6032-
IsOffloadEntry, OutlinedFn, OutlinedFnID);
6034+
for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
6035+
for (auto *A : C->getAttrs()) {
6036+
int32_t MinThreadsVal = 1, MaxThreadsVal = 0;
6037+
int32_t MinBlocksVal = 1, MaxBlocksVal = -1;
6038+
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
6039+
CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal,
6040+
&MinBlocksVal, &MaxBlocksVal);
6041+
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
6042+
CGM.handleAMDGPUFlatWorkGroupSizeAttr(
6043+
nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal);
6044+
else
6045+
continue;
6046+
6047+
DefaultValMinThreads =
6048+
std::max(DefaultValMinThreads, uint32_t(MinThreadsVal));
6049+
DefaultValMaxThreads =
6050+
DefaultValMaxThreads
6051+
? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal))
6052+
: MaxThreadsVal;
6053+
DefaultValMinTeams = DefaultValMinTeams
6054+
? std::max(DefaultValMinTeams, MinBlocksVal)
6055+
: MinBlocksVal;
6056+
DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal);
6057+
}
6058+
}
6059+
6060+
OMPBuilder.emitTargetRegionFunction(
6061+
EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams,
6062+
DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads,
6063+
IsOffloadEntry, OutlinedFn, OutlinedFnID);
60336064

60346065
if (!OutlinedFn)
60356066
return;
@@ -6038,14 +6069,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
60386069

60396070
for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
60406071
for (auto *A : C->getAttrs()) {
6041-
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
6042-
CGM.handleCUDALaunchBoundsAttr(OutlinedFn, Attr);
6043-
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
6044-
CGM.handleAMDGPUFlatWorkGroupSizeAttr(OutlinedFn, Attr);
6045-
else if (auto *Attr = dyn_cast<AMDGPUWavesPerEUAttr>(A))
6072+
if (auto *Attr = dyn_cast<AMDGPUWavesPerEUAttr>(A))
60466073
CGM.handleAMDGPUWavesPerEUAttr(OutlinedFn, Attr);
6047-
else
6048-
llvm_unreachable("Unexpected attribute kind");
60496074
}
60506075
}
60516076
}
@@ -6103,8 +6128,8 @@ const Stmt *CGOpenMPRuntime::getSingleCompoundChild(ASTContext &Ctx,
61036128
}
61046129

61056130
const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
6106-
CodeGenFunction &CGF, const OMPExecutableDirective &D,
6107-
int32_t &DefaultVal) {
6131+
CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &MinTeamsVal,
6132+
int32_t &MaxTeamsVal) {
61086133

61096134
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
61106135
assert(isOpenMPTargetExecutionDirective(DirectiveKind) &&
@@ -6125,22 +6150,22 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
61256150
if (NumTeams->isIntegerConstantExpr(CGF.getContext()))
61266151
if (auto Constant =
61276152
NumTeams->getIntegerConstantExpr(CGF.getContext()))
6128-
DefaultVal = Constant->getExtValue();
6153+
MinTeamsVal = MaxTeamsVal = Constant->getExtValue();
61296154
return NumTeams;
61306155
}
6131-
DefaultVal = 0;
6156+
MinTeamsVal = MaxTeamsVal = 0;
61326157
return nullptr;
61336158
}
61346159
if (isOpenMPParallelDirective(NestedDir->getDirectiveKind()) ||
61356160
isOpenMPSimdDirective(NestedDir->getDirectiveKind())) {
6136-
DefaultVal = 1;
6161+
MinTeamsVal = MaxTeamsVal = 1;
61376162
return nullptr;
61386163
}
6139-
DefaultVal = 1;
6164+
MinTeamsVal = MaxTeamsVal = 1;
61406165
return nullptr;
61416166
}
61426167
// A value of -1 is used to check if we need to emit no teams region
6143-
DefaultVal = -1;
6168+
MinTeamsVal = MaxTeamsVal = -1;
61446169
return nullptr;
61456170
}
61466171
case OMPD_target_teams_loop:
@@ -6154,18 +6179,18 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
61546179
D.getSingleClause<OMPNumTeamsClause>()->getNumTeams();
61556180
if (NumTeams->isIntegerConstantExpr(CGF.getContext()))
61566181
if (auto Constant = NumTeams->getIntegerConstantExpr(CGF.getContext()))
6157-
DefaultVal = Constant->getExtValue();
6182+
MinTeamsVal = MaxTeamsVal = Constant->getExtValue();
61586183
return NumTeams;
61596184
}
6160-
DefaultVal = 0;
6185+
MinTeamsVal = MaxTeamsVal = 0;
61616186
return nullptr;
61626187
}
61636188
case OMPD_target_parallel:
61646189
case OMPD_target_parallel_for:
61656190
case OMPD_target_parallel_for_simd:
61666191
case OMPD_target_parallel_loop:
61676192
case OMPD_target_simd:
6168-
DefaultVal = 1;
6193+
MinTeamsVal = MaxTeamsVal = 1;
61696194
return nullptr;
61706195
case OMPD_parallel:
61716196
case OMPD_for:
@@ -6240,8 +6265,9 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
62406265
"Clauses associated with the teams directive expected to be emitted "
62416266
"only for the host!");
62426267
CGBuilderTy &Bld = CGF.Builder;
6243-
int32_t DefaultNT = -1;
6244-
const Expr *NumTeams = getNumTeamsExprForTargetDirective(CGF, D, DefaultNT);
6268+
int32_t MinNT = -1, MaxNT = -1;
6269+
const Expr *NumTeams =
6270+
getNumTeamsExprForTargetDirective(CGF, D, MinNT, MaxNT);
62456271
if (NumTeams != nullptr) {
62466272
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
62476273

@@ -6271,7 +6297,8 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
62716297
}
62726298
}
62736299

6274-
return llvm::ConstantInt::get(CGF.Int32Ty, DefaultNT);
6300+
assert(MinNT == MaxNT && "Num threads ranges require handling here.");
6301+
return llvm::ConstantInt::get(CGF.Int32Ty, MinNT);
62756302
}
62766303

62776304
/// Check for a num threads constant value (stored in \p DefaultVal), or

clang/lib/CodeGen/CGOpenMPRuntime.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,8 @@ class CGOpenMPRuntime {
637637
/// Otherwise, return nullptr.
638638
const Expr *getNumTeamsExprForTargetDirective(CodeGenFunction &CGF,
639639
const OMPExecutableDirective &D,
640-
int32_t &DefaultVal);
640+
int32_t &MinTeamsVal,
641+
int32_t &MaxTeamsVal);
641642
llvm::Value *emitNumTeamsForTargetDirective(CodeGenFunction &CGF,
642643
const OMPExecutableDirective &D);
643644

clang/lib/CodeGen/CodeGenModule.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1543,15 +1543,23 @@ class CodeGenModule : public CodeGenTypeCache {
15431543
void moveLazyEmissionStates(CodeGenModule *NewBuilder);
15441544

15451545
/// Emit the IR encoding to attach the CUDA launch bounds attribute to \p F.
1546+
/// If \p MaxThreadsVal is not nullptr, the max threads value is stored in it,
1547+
/// if a valid one was found.
15461548
void handleCUDALaunchBoundsAttr(llvm::Function *F,
1547-
const CUDALaunchBoundsAttr *A);
1549+
const CUDALaunchBoundsAttr *A,
1550+
int32_t *MaxThreadsVal = nullptr,
1551+
int32_t *MinBlocksVal = nullptr,
1552+
int32_t *MaxClusterRankVal = nullptr);
15481553

15491554
/// Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute
15501555
/// to \p F. Alternatively, the work group size can be taken from a \p
1551-
/// ReqdWGS.
1556+
/// ReqdWGS. If \p MinThreadsVal is not nullptr, the min threads value is
1557+
/// stored in it, if a valid one was found. If \p MaxThreadsVal is not
1558+
/// nullptr, the max threads value is stored in it, if a valid one was found.
15521559
void handleAMDGPUFlatWorkGroupSizeAttr(
15531560
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A,
1554-
const ReqdWorkGroupSizeAttr *ReqdWGS = nullptr);
1561+
const ReqdWorkGroupSizeAttr *ReqdWGS = nullptr,
1562+
int32_t *MinThreadsVal = nullptr, int32_t *MaxThreadsVal = nullptr);
15551563

15561564
/// Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to \p F.
15571565
void handleAMDGPUWavesPerEUAttr(llvm::Function *F,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
594594

595595
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
596596
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
597-
const ReqdWorkGroupSizeAttr *ReqdWGS) {
597+
const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
598+
int32_t *MaxThreadsVal) {
598599
unsigned Min = 0;
599600
unsigned Max = 0;
600601
if (FlatWGS) {
@@ -607,8 +608,13 @@ void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
607608
if (Min != 0) {
608609
assert(Min <= Max && "Min must be less than or equal Max");
609610

611+
if (MinThreadsVal)
612+
*MinThreadsVal = Min;
613+
if (MaxThreadsVal)
614+
*MaxThreadsVal = Max;
610615
std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
611-
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
616+
if (F)
617+
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
612618
} else
613619
assert(Max == 0 && "Max must be zero");
614620
}

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -287,33 +287,52 @@ bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
287287
}
288288
}
289289

290-
void CodeGenModule::handleCUDALaunchBoundsAttr(
291-
llvm::Function *F, const CUDALaunchBoundsAttr *Attr) {
290+
void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
291+
const CUDALaunchBoundsAttr *Attr,
292+
int32_t *MaxThreadsVal,
293+
int32_t *MinBlocksVal,
294+
int32_t *MaxClusterRankVal) {
292295
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
293296
llvm::APSInt MaxThreads(32);
294297
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
295-
if (MaxThreads > 0)
296-
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
297-
MaxThreads.getExtValue());
298+
if (MaxThreads > 0) {
299+
if (MaxThreadsVal)
300+
*MaxThreadsVal = MaxThreads.getExtValue();
301+
if (F) {
302+
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
303+
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
304+
MaxThreads.getExtValue());
305+
}
306+
}
298307

299308
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
300309
// was not specified in __launch_bounds__ or if the user specified a 0 value,
301310
// we don't have to add a PTX directive.
302311
if (Attr->getMinBlocks()) {
303312
llvm::APSInt MinBlocks(32);
304313
MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(getContext());
305-
if (MinBlocks > 0)
306-
// Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
307-
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
308-
MinBlocks.getExtValue());
314+
if (MinBlocks > 0) {
315+
if (MinBlocksVal)
316+
*MinBlocksVal = MinBlocks.getExtValue();
317+
if (F) {
318+
// Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
319+
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
320+
MinBlocks.getExtValue());
321+
}
322+
}
309323
}
310324
if (Attr->getMaxBlocks()) {
311325
llvm::APSInt MaxBlocks(32);
312326
MaxBlocks = Attr->getMaxBlocks()->EvaluateKnownConstInt(getContext());
313-
if (MaxBlocks > 0)
314-
// Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
315-
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
316-
MaxBlocks.getExtValue());
327+
if (MaxBlocks > 0) {
328+
if (MaxClusterRankVal)
329+
*MaxClusterRankVal = MaxBlocks.getExtValue();
330+
if (F) {
331+
// Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
332+
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
333+
MaxBlocks.getExtValue());
334+
}
335+
}
317336
}
318337
}
319338

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
// REQUIRES: amdgpu-registered-target
22

33
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
4-
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
5-
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
4+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
5+
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
6+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
67
// expected-no-diagnostics
78

89

910
// Check that the target attributes are set on the generated kernel
1011
void func() {
11-
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l15() #0
12-
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l17()
13-
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l19() #4
12+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16() #0
13+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18()
14+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20() #4
1415

1516
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
1617
{}
@@ -21,11 +22,20 @@ void func() {
2122
{}
2223
}
2324

24-
// CHECK: attributes #0
25-
// CHECK-SAME: "amdgpu-flat-work-group-size"="10,20"
26-
// CHECK: attributes #4
27-
// CHECK-SAME: "amdgpu-flat-work-group-size"="3,17"
28-
// CHECK-SAME: "amdgpu-waves-per-eu"="3,7"
25+
// AMD: attributes #0
26+
// AMD-SAME: "amdgpu-flat-work-group-size"="10,20"
27+
// AMD-SAME: "omp_target_thread_limit"="20"
28+
// AMD: "omp_target_thread_limit"="45"
29+
// AMD: attributes #4
30+
// AMD-SAME: "amdgpu-flat-work-group-size"="3,17"
31+
// AMD-SAME: "amdgpu-waves-per-eu"="3,7"
32+
// AMD-SAME: "omp_target_thread_limit"="17"
2933

30-
// CHECK: !{ptr @__omp_offloading[[HASH]]_l17, !"maxntidx", i32 45}
31-
// CHECK: !{ptr @__omp_offloading[[HASH]]_l17, !"minctasm", i32 90}
34+
// It is unclear whether we should use the AMD annotations for other targets; we do for now.
35+
// NVIDIA: "omp_target_thread_limit"="20"
36+
// NVIDIA: "omp_target_thread_limit"="45"
37+
// NVIDIA: "omp_target_thread_limit"="17"
38+
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
39+
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"minctasm", i32 90}
40+
// NVIDIA: !{ptr @__omp_offloading[[HASH2]]_l18, !"maxntidx", i32 45}
41+
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}

0 commit comments

Comments
 (0)