Skip to content

Commit 83ef38a

Browse files
[Flang][OpenMP] Enable no-loop kernels (#155818)
Enable the generation of no-loop kernels for Fortran OpenMP code. Combined target teams distribute parallel do directives can be promoted to no-loop kernels if the user passes both the -fopenmp-assume-teams-oversubscription and -fopenmp-assume-threads-oversubscription flags. If the OpenMP kernel contains reduction or num_teams clauses, it is not promoted to no-loop mode. The global OpenMP device RTL oversubscription flags no longer force no-loop code generation for Fortran.
1 parent 42dd926 commit 83ef38a

File tree

8 files changed

+210
-38
lines changed

8 files changed

+210
-38
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3837,10 +3837,26 @@ let Visibility = [ClangOption, CC1Option, FC1Option, FlangOption] in {
38373837
let Group = f_Group in {
38383838

38393839
def fopenmp_target_debug_EQ : Joined<["-"], "fopenmp-target-debug=">;
3840-
def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">;
3841-
def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">;
3842-
def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">;
3843-
def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">;
3840+
def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">,
3841+
HelpText<"Allow the optimizer to discretely increase the number of "
3842+
"teams. May cause environment variables that set "
3843+
"the number of teams to be ignored. The combination of "
3844+
"-fopenmp-assume-teams-oversubscription "
3845+
"and -fopenmp-assume-threads-oversubscription "
3846+
"may allow the conversion of loops into sequential code by "
3847+
"ensuring that each team/thread executes at most one iteration.">;
3848+
def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">,
3849+
HelpText<"Allow the optimizer to discretely increase the number of "
3850+
"threads. May cause environment variables that set "
3851+
"the number of threads to be ignored. The combination of "
3852+
"-fopenmp-assume-teams-oversubscription "
3853+
"and -fopenmp-assume-threads-oversubscription "
3854+
"may allow the conversion of loops into sequential code by "
3855+
"ensuring that each team/thread executes at most one iteration.">;
3856+
def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">,
3857+
HelpText<"Do not assume teams oversubscription.">;
3858+
def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">,
3859+
HelpText<"Do not assume threads oversubscription.">;
38443860
def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">,
38453861
HelpText<"Assert no thread in a parallel region modifies an ICV">,
38463862
MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;

llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,11 +1085,13 @@ class OpenMPIRBuilder {
10851085
/// preheader of the loop.
10861086
/// \param LoopType Information about type of loop worksharing.
10871087
/// It corresponds to type of loop workshare OpenMP pragma.
1088+
/// \param NoLoop If true, no-loop code is generated.
10881089
///
10891090
/// \returns Point where to insert code after the workshare construct.
10901091
InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
10911092
InsertPointTy AllocaIP,
1092-
omp::WorksharingLoopType LoopType);
1093+
omp::WorksharingLoopType LoopType,
1094+
bool NoLoop);
10931095

10941096
/// Modifies the canonical loop to be a statically-scheduled workshare loop.
10951097
///
@@ -1209,6 +1211,7 @@ class OpenMPIRBuilder {
12091211
/// present.
12101212
/// \param LoopType Information about type of loop worksharing.
12111213
/// It corresponds to type of loop workshare OpenMP pragma.
1214+
/// \param NoLoop If true, no-loop code is generated.
12121215
///
12131216
/// \returns Point where to insert code after the workshare construct.
12141217
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
@@ -1219,7 +1222,8 @@ class OpenMPIRBuilder {
12191222
bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
12201223
bool HasOrderedClause = false,
12211224
omp::WorksharingLoopType LoopType =
1222-
omp::WorksharingLoopType::ForStaticLoop);
1225+
omp::WorksharingLoopType::ForStaticLoop,
1226+
bool NoLoop = false);
12231227

12241228
/// Tile a loop nest.
12251229
///

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4979,7 +4979,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
49794979
WorksharingLoopType LoopType,
49804980
BasicBlock *InsertBlock, Value *Ident,
49814981
Value *LoopBodyArg, Value *TripCount,
4982-
Function &LoopBodyFn) {
4982+
Function &LoopBodyFn, bool NoLoop) {
49834983
Type *TripCountTy = TripCount->getType();
49844984
Module &M = OMPBuilder->M;
49854985
IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -5007,16 +5007,18 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
50075007
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
50085008
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
50095009
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5010+
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5011+
} else {
5012+
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
50105013
}
5011-
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
50125014

50135015
Builder.CreateCall(RTLFn, RealArgs);
50145016
}
50155017

50165018
static void workshareLoopTargetCallback(
50175019
OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
50185020
Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5019-
WorksharingLoopType LoopType) {
5021+
WorksharingLoopType LoopType, bool NoLoop) {
50205022
IRBuilder<> &Builder = OMPIRBuilder->Builder;
50215023
BasicBlock *Preheader = CLI->getPreheader();
50225024
Value *TripCount = CLI->getTripCount();
@@ -5063,17 +5065,16 @@ static void workshareLoopTargetCallback(
50635065
OutlinedFnCallInstruction->eraseFromParent();
50645066

50655067
createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5066-
LoopBodyArg, TripCount, OutlinedFn);
5068+
LoopBodyArg, TripCount, OutlinedFn, NoLoop);
50675069

50685070
for (auto &ToBeDeletedItem : ToBeDeleted)
50695071
ToBeDeletedItem->eraseFromParent();
50705072
CLI->invalidate();
50715073
}
50725074

5073-
OpenMPIRBuilder::InsertPointTy
5074-
OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5075-
InsertPointTy AllocaIP,
5076-
WorksharingLoopType LoopType) {
5075+
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5076+
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5077+
WorksharingLoopType LoopType, bool NoLoop) {
50775078
uint32_t SrcLocStrSize;
50785079
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
50795080
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -5156,7 +5157,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
51565157
OI.PostOutlineCB = [=, ToBeDeletedVec =
51575158
std::move(ToBeDeleted)](Function &OutlinedFn) {
51585159
workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5159-
LoopType);
5160+
LoopType, NoLoop);
51605161
};
51615162
addOutlineInfo(std::move(OI));
51625163
return CLI->getAfterIP();
@@ -5167,9 +5168,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
51675168
bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
51685169
bool HasSimdModifier, bool HasMonotonicModifier,
51695170
bool HasNonmonotonicModifier, bool HasOrderedClause,
5170-
WorksharingLoopType LoopType) {
5171+
WorksharingLoopType LoopType, bool NoLoop) {
51715172
if (Config.isTargetDevice())
5172-
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5173+
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
51735174
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
51745175
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
51755176
HasNonmonotonicModifier, HasOrderedClause);

mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,14 +230,24 @@ def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">;
230230
def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>;
231231
def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>;
232232
def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>;
233+
def TargetRegionFlagsNoLoop : I32BitEnumAttrCaseBit<"no_loop", 3>;
233234

234235
def TargetRegionFlags : OpenMP_BitEnumAttr<
235236
"TargetRegionFlags",
236-
"target region property flags", [
237+
"These flags describe properties of the target kernel. "
238+
"TargetRegionFlagsGeneric - denotes generic kernel. "
239+
"TargetRegionFlagsSpmd - denotes SPMD kernel. "
240+
"TargetRegionFlagsNoLoop - denotes kernel where "
241+
"num_teams * num_threads >= loop_trip_count. It allows the conversion "
242+
"of loops into sequential code by ensuring that each team/thread "
243+
"executes at most one iteration. "
244+
"TargetRegionFlagsTripCount - checks if the loop trip count should be "
245+
"calculated.", [
237246
TargetRegionFlagsNone,
238247
TargetRegionFlagsGeneric,
239248
TargetRegionFlagsSpmd,
240-
TargetRegionFlagsTripCount
249+
TargetRegionFlagsTripCount,
250+
TargetRegionFlagsNoLoop
241251
]>;
242252

243253
//===----------------------------------------------------------------------===//

mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,31 @@ Operation *TargetOp::getInnermostCapturedOmpOp() {
21112111
});
21122112
}
21132113

2114+
/// Check if we can promote SPMD kernel to No-Loop kernel.
2115+
static bool canPromoteToNoLoop(Operation *capturedOp, TeamsOp teamsOp,
2116+
WsloopOp *wsLoopOp) {
2117+
// num_teams clause can break no-loop teams/threads assumption.
2118+
if (teamsOp.getNumTeamsUpper())
2119+
return false;
2120+
2121+
// Reduction kernels are slower in no-loop mode.
2122+
if (teamsOp.getNumReductionVars())
2123+
return false;
2124+
if (wsLoopOp->getNumReductionVars())
2125+
return false;
2126+
2127+
// Check if the user allows the promotion of kernels to no-loop mode.
2128+
OffloadModuleInterface offloadMod =
2129+
capturedOp->getParentOfType<omp::OffloadModuleInterface>();
2130+
if (!offloadMod)
2131+
return false;
2132+
auto ompFlags = offloadMod.getFlags();
2133+
if (!ompFlags)
2134+
return false;
2135+
return ompFlags.getAssumeTeamsOversubscription() &&
2136+
ompFlags.getAssumeThreadsOversubscription();
2137+
}
2138+
21142139
TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
21152140
// A non-null captured op is only valid if it resides inside of a TargetOp
21162141
// and is the result of calling getInnermostCapturedOmpOp() on it.
@@ -2139,7 +2164,8 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
21392164

21402165
// Detect target-teams-distribute-parallel-wsloop[-simd].
21412166
if (numWrappers == 2) {
2142-
if (!isa<WsloopOp>(innermostWrapper))
2167+
WsloopOp *wsloopOp = dyn_cast<WsloopOp>(innermostWrapper);
2168+
if (!wsloopOp)
21432169
return TargetRegionFlags::generic;
21442170

21452171
innermostWrapper = std::next(innermostWrapper);
@@ -2150,12 +2176,17 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
21502176
if (!isa_and_present<ParallelOp>(parallelOp))
21512177
return TargetRegionFlags::generic;
21522178

2153-
Operation *teamsOp = parallelOp->getParentOp();
2154-
if (!isa_and_present<TeamsOp>(teamsOp))
2179+
TeamsOp teamsOp = dyn_cast<TeamsOp>(parallelOp->getParentOp());
2180+
if (!teamsOp)
21552181
return TargetRegionFlags::generic;
21562182

2157-
if (teamsOp->getParentOp() == targetOp.getOperation())
2158-
return TargetRegionFlags::spmd | TargetRegionFlags::trip_count;
2183+
if (teamsOp->getParentOp() == targetOp.getOperation()) {
2184+
TargetRegionFlags result =
2185+
TargetRegionFlags::spmd | TargetRegionFlags::trip_count;
2186+
if (canPromoteToNoLoop(capturedOp, teamsOp, wsloopOp))
2187+
result = result | TargetRegionFlags::no_loop;
2188+
return result;
2189+
}
21592190
}
21602191
// Detect target-teams-distribute[-simd] and target-teams-loop.
21612192
else if (isa<DistributeOp, LoopOp>(innermostWrapper)) {

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2591,13 +2591,34 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
25912591
}
25922592

25932593
builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin());
2594+
2595+
// Check if we can generate no-loop kernel
2596+
bool noLoopMode = false;
2597+
omp::TargetOp targetOp = wsloopOp->getParentOfType<mlir::omp::TargetOp>();
2598+
if (targetOp) {
2599+
Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp();
2600+
// We need this check because, without it, noLoopMode would be set to true
2601+
// for every omp.wsloop nested inside a no-loop SPMD target region, even if
2602+
// that loop is not the top-level SPMD one.
2603+
if (loopOp == targetCapturedOp) {
2604+
omp::TargetRegionFlags kernelFlags =
2605+
targetOp.getKernelExecFlags(targetCapturedOp);
2606+
if (omp::bitEnumContainsAll(kernelFlags,
2607+
omp::TargetRegionFlags::spmd |
2608+
omp::TargetRegionFlags::no_loop) &&
2609+
!omp::bitEnumContainsAny(kernelFlags,
2610+
omp::TargetRegionFlags::generic))
2611+
noLoopMode = true;
2612+
}
2613+
}
2614+
25942615
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
25952616
ompBuilder->applyWorkshareLoop(
25962617
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
25972618
convertToScheduleKind(schedule), chunk, isSimd,
25982619
scheduleMod == omp::ScheduleModifier::monotonic,
25992620
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
2600-
workshareLoopType);
2621+
workshareLoopType, noLoopMode);
26012622

26022623
if (failed(handleError(wsloopIP, opInst)))
26032624
return failure();
@@ -5425,6 +5446,12 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp,
54255446
? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD
54265447
: llvm::omp::OMP_TGT_EXEC_MODE_GENERIC
54275448
: llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
5449+
if (omp::bitEnumContainsAll(kernelFlags,
5450+
omp::TargetRegionFlags::spmd |
5451+
omp::TargetRegionFlags::no_loop) &&
5452+
!omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic))
5453+
attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
5454+
54285455
attrs.MinTeams = minTeamsVal;
54295456
attrs.MaxTeams.front() = maxTeamsVal;
54305457
attrs.MinThreads = 1;
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
! REQUIRES: flang
2+
3+
! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription
4+
! RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
5+
function check_errors(array) result (errors)
6+
integer, intent(in) :: array(1024)
7+
integer :: errors
8+
integer :: i
9+
errors = 0
10+
do i = 1, 1024
11+
if ( array( i) .ne. (i) ) then
12+
errors = errors + 1
13+
end if
14+
end do
15+
end function
16+
17+
program main
18+
use omp_lib
19+
implicit none
20+
integer :: i,j,red
21+
integer :: array(1024), errors = 0
22+
array = 1
23+
24+
! No-loop kernel
25+
!$omp target teams distribute parallel do
26+
do i = 1, 1024
27+
array(i) = i
28+
end do
29+
errors = errors + check_errors(array)
30+
31+
! SPMD kernel (num_teams clause blocks promotion to no-loop)
32+
array = 1
33+
!$omp target teams distribute parallel do num_teams(3)
34+
do i = 1, 1024
35+
array(i) = i
36+
end do
37+
38+
errors = errors + check_errors(array)
39+
40+
! No-loop kernel
41+
array = 1
42+
!$omp target teams distribute parallel do num_threads(64)
43+
do i = 1, 1024
44+
array(i) = i
45+
end do
46+
47+
errors = errors + check_errors(array)
48+
49+
! SPMD kernel
50+
array = 1
51+
!$omp target parallel do
52+
do i = 1, 1024
53+
array(i) = i
54+
end do
55+
56+
errors = errors + check_errors(array)
57+
58+
! Generic kernel
59+
array = 1
60+
!$omp target teams distribute
61+
do i = 1, 1024
62+
array(i) = i
63+
end do
64+
65+
errors = errors + check_errors(array)
66+
67+
! SPMD kernel (reduction clause blocks promotion to no-loop)
68+
array = 1
69+
red =0
70+
!$omp target teams distribute parallel do reduction(+:red)
71+
do i = 1, 1024
72+
red = red + array(i)
73+
end do
74+
75+
if (red .ne. 1024) then
76+
errors = errors + 1
77+
end if
78+
79+
print *,"number of errors: ", errors
80+
81+
end program main
82+
83+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode
84+
! CHECK: info: #Args: 3 Teams x Thrds: 64x 16
85+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
86+
! CHECK: info: #Args: 3 Teams x Thrds: 3x 16 {{.*}}
87+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode
88+
! CHECK: info: #Args: 3 Teams x Thrds: 64x 16 {{.*}}
89+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
90+
! CHECK: info: #Args: 3 Teams x Thrds: 1x 16
91+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} Generic mode
92+
! CHECK: info: #Args: 3 Teams x Thrds: 16x 16 {{.*}}
93+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
94+
! CHECK: info: #Args: 4 Teams x Thrds: 16x 16 {{.*}}
95+
! CHECK: number of errors: 0
96+

0 commit comments

Comments
 (0)