Skip to content

Commit 04b5e07

Browse files
committed
[clang][OpenMP] Reinterpret the usage of flags to enable Xteam Scan Kernel CodeGen
This patch assigns the following meanings to the Xteam Scan flags:
1. -fopenmp-target-xteam-scan: enables Xteam Segmented Scan kernel CodeGen (the default scan variant).
2. -fopenmp-target-xteam-no-loop-scan: enables Xteam No-Loop Scan kernel CodeGen.
It also updates the clang and offload tests as needed.
Change-Id: I44773d607591fd956b52746fa79b463248ee2b82
1 parent 12848da commit 04b5e07

File tree

12 files changed

+81
-59
lines changed

12 files changed

+81
-59
lines changed

clang/include/clang/Basic/LangOptions.def

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,10 @@ LANGOPT(OpenMPTargetNoLoop , 1, 1, "Use no-loop code generation technique.")
282282
LANGOPT(OpenMPTargetXteamReduction , 1, 1, "Use cross-team code generation technique.")
283283
LANGOPT(OpenMPTargetFastReduction , 1, 0, "Use fast reduction code generation technique.")
284284
LANGOPT(OpenMPTargetMultiDevice , 1, 0, "Offload the iteration space of a single target region across multiple GPU devices.")
285-
LANGOPT(OpenMPTargetXteamScan , 1, 0, "Use the cross-team scan code generation technique.")
286-
LANGOPT(OpenMPTargetXteamScanSegmented , 1, 0, "Use the segmented cross-team scan code generation technique.")
285+
286+
// The flag '-fopenmp-target-xteam-scan' triggers the 'Segmented Cross Team Scan' variant by default. To use the no-loop variant, please use the flag '-fopenmp-target-xteam-no-loop-scan' instead.
287+
LANGOPT(OpenMPTargetXteamScan , 1, 0, "Use the cross-team specialized kernel code generation for 'scan' directive.")
288+
LANGOPT(OpenMPTargetXteamNoLoopScan , 1, 0, "Use the no-loop variant of the cross-team specialized kernel code generation for 'scan' directive.")
287289
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
288290
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
289291
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")

clang/include/clang/Driver/Options.td

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3748,20 +3748,20 @@ def fno_openmp_target_fast_reduction : Flag<["-"], "fno-openmp-target-fast-reduc
37483748
MarshallingInfoFlag<LangOpts<"OpenMPTargetFastReduction">>;
37493749
def fopenmp_target_xteam_scan : Flag<["-"], "fopenmp-target-xteam-scan">, Group<f_Group>,
37503750
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
3751-
HelpText<"Use the cross-team scan code generation technique.">,
3751+
HelpText<"Use the cross-team specialized kernel code generation for 'scan' directive.">,
37523752
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScan">>;
37533753
def fno_openmp_target_xteam_scan : Flag<["-"], "fno-openmp-target-xteam-scan">, Group<f_Group>,
37543754
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
3755-
HelpText<"Do not use the cross-team scan code generation technique.">,
3755+
HelpText<"Do not use the cross-team specialized kernel code generation for 'scan' directive.">,
37563756
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScan">>;
3757-
def fopenmp_target_xteam_scan_segmented : Flag<["-"], "fopenmp-target-xteam-scan-segmented">, Group<f_Group>,
3757+
def fopenmp_target_xteam_no_loop_scan : Flag<["-"], "fopenmp-target-xteam-no-loop-scan">, Group<f_Group>,
37583758
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
3759-
HelpText<"Use the cross-team segmented scan code generation technique.">,
3760-
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScanSegmented">>;
3761-
def fno_openmp_target_xteam_scan_segmented : Flag<["-"], "fno-openmp-target-xteam-scan-segmented">, Group<f_Group>,
3759+
HelpText<"Use the no-loop variant of the cross-team specialized kernel code generation for 'scan' directive.">,
3760+
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamNoLoopScan">>;
3761+
def fno_openmp_target_xteam_no_loop_scan : Flag<["-"], "fno-openmp-target-xteam-no-loop-scan">, Group<f_Group>,
37623762
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
3763-
HelpText<"Do not use the cross-team segmented scan code generation technique.">,
3764-
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScanSegmented">>;
3763+
HelpText<"Do not use the no-loop variant of the cross-team specialized kernel code generation for 'scan' directive.">,
3764+
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamNoLoopScan">>;
37653765
def fopenmp_target_multi_device : Flag<["-"], "fopenmp-target-multi-device">, Group<f_Group>,
37663766
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option, FlangOption]>,
37673767
HelpText<"Enable code generation to emit support for multi device target region execution">,

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9467,11 +9467,11 @@ static void emitTargetCallKernelLaunch(
94679467
// array and `teams_done_ptr`.
94689468
// 2. The Xteam Scan Reduction kernels require a third helper variable -
94699469
// `scan_storage` array.
9470-
// a. The segmented scan variant requires a fourth helper variable -
9471-
// `segmented_vals`
9470+
// a. The segmented scan variant (the default) requires a fourth helper
9471+
// variable - `segmented_vals`
94729472
size_t ExpectedNumArgs =
94739473
CGF.CGM.isXteamScanKernel()
9474-
? (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented ? 4 : 3)
9474+
? (CGF.CGM.isXteamSegmentedScanKernel() ? 4 : 3)
94759475
: 2;
94769476
assert((CapturedVars.size() ==
94779477
CapturedCount + ExpectedNumArgs * XteamRVM.size()) &&
@@ -9549,7 +9549,7 @@ static void emitTargetCallKernelLaunch(
95499549
CGF, CombinedInfo, CGF.CGM.ReductionVars[1]); // teams_done_ptr
95509550
addXTeamReductionComponentHelper(
95519551
CGF, CombinedInfo, CGF.CGM.ReductionVars[2]); // scan_storage
9552-
if (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented)
9552+
if (CGF.CGM.isXteamSegmentedScanKernel())
95539553
addXTeamReductionComponentHelper(
95549554
CGF, CombinedInfo, CGF.CGM.ReductionVars[3]); // segment_vals
95559555
} else {
@@ -9619,7 +9619,7 @@ static void emitTargetCallKernelLaunch(
96199619
OMPBuilder.getOrCreateRuntimeFunction(CGF.CGM.getModule(),
96209620
OMPRTL_omp_target_alloc),
96219621
TgtAllocArgsScan, "d_scan_storage");
9622-
if (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
9622+
if (CGF.CGM.isXteamSegmentedScanKernel()) {
96239623
// Emit the lower and upper bounds
96249624
const auto *LBDecl = cast<VarDecl>(
96259625
cast<DeclRefExpr>(
@@ -9711,7 +9711,7 @@ static void emitTargetCallKernelLaunch(
97119711
++ArgPos;
97129712
CGF.CGM.ReductionVars.push_back(DScanStorageInst);
97139713
addXTeamReductionComponentHelper(CGF, CombinedInfo, DScanStorageInst);
9714-
if (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
9714+
if (CGF.CGM.isXteamSegmentedScanKernel()) {
97159715
++ArgPos;
97169716
CGF.CGM.ReductionVars.push_back(DSegmentValsInst);
97179717
addXTeamReductionComponentHelper(CGF, CombinedInfo,

clang/lib/CodeGen/CGStmt.cpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -757,14 +757,29 @@ void CodeGenFunction::EmitXteamRedCode(const OMPExecutableDirective &D,
757757
EmitXteamLocalAggregator(CapturedForStmt);
758758

759759
if (CGM.isXteamScanKernel()) {
760-
if (CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
760+
// Note about the two Xteam Scan Kernel variants:
761+
//
762+
// 1. Segmented Scan Kernel: This is the default Xteam Scan kernel that will
763+
// be generated.
764+
//
765+
// 2. NoLoop Scan Kernel: This is a special case when the number of
766+
// iterations in the captured 'For' Stmt(i.e. total number of elements in
767+
// the input array that has to be scanned) is smaller than or equal to
768+
// the total number of parallel work-items available during the kernel
769+
// execution. This will generate a more time and space efficient kernel
770+
// for this case.
771+
//
772+
if (CGM.isXteamSegmentedScanKernel()) {
773+
// Follow the Xteam Segmented Scan Kernel Codegen
761774
EmitForStmtWithArgs(cast<ForStmt>(*CapturedForStmt), Args);
762775
// Toggle the Phase number(1 or 2) after emitting any of the phases
763776
CGM.isXteamScanPhaseOne = !CGM.isXteamScanPhaseOne;
764777
} else if (CGM.isXteamScanPhaseOne) {
778+
// Follow the Xteam NoLoop Scan Kernel Codegen - Phase 1
765779
EmitNoLoopXteamScanPhaseOneCode(D, CapturedForStmt, Loc, Args);
766780
CGM.isXteamScanPhaseOne = false;
767781
} else {
782+
// Follow the Xteam NoLoop Scan Kernel Codegen - Phase 2
768783
EmitNoLoopXteamScanPhaseTwoCode(D, CapturedForStmt, Loc, Args);
769784
CGM.isXteamScanPhaseOne = true;
770785
}
@@ -2291,8 +2306,7 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S,
22912306
llvm::BasicBlock *ExecBB = nullptr;
22922307
llvm::BasicBlock *DoneBB = nullptr;
22932308
clang::QualType RedVarType;
2294-
if (getLangOpts().OpenMPIsTargetDevice &&
2295-
getLangOpts().OpenMPTargetXteamScanSegmented) {
2309+
if (getLangOpts().OpenMPIsTargetDevice && CGM.isXteamSegmentedScanKernel()) {
22962310
// Compute Loop trip-count (N) = GlobalUB - GlobalLB + 1
22972311
const auto UBLValue = EmitLValue(
22982312
cast<DeclRefExpr>(BigJumpLoopLD->getUpperBoundVariable())); // GlobalUB
@@ -2430,7 +2444,7 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S,
24302444
llvm::BasicBlock *ForBody = createBasicBlock("for.body");
24312445

24322446
if (getLangOpts().OpenMPIsTargetDevice &&
2433-
getLangOpts().OpenMPTargetXteamScanSegmented) {
2447+
CGM.isXteamSegmentedScanKernel()) {
24342448
// Emit the Segment loop breaking condition
24352449

24362450
llvm::Value *loopIterationVar = Builder.CreateLoad(BigJumpLoopIvAddr);
@@ -2495,7 +2509,7 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S,
24952509
getProfileCount(BigJumpLoopLD->getBody()));
24962510
EmitBlock(NextBB);
24972511
}
2498-
if (CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
2512+
if (CGM.isXteamSegmentedScanKernel()) {
24992513
if (!CGM.isXteamScanPhaseOne) {
25002514
// SegmentVals contains the final scanned results computed for every
25012515
// element in a segment.
@@ -2525,7 +2539,7 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S,
25252539

25262540
if (CGM.getLangOpts().OpenMPIsTargetDevice &&
25272541
(CGM.isXteamRedKernel(&S) || CGM.isBigJumpLoopKernel(&S))) {
2528-
if (CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
2542+
if (CGM.isXteamSegmentedScanKernel()) {
25292543
EmitBlock(Continue.getBlock());
25302544
Address SegmentValsGEP = Address(
25312545
Builder.CreateGEP(Int32Ty, DSegmentVals,
@@ -2570,7 +2584,7 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S,
25702584
EmitBlock(LoopExit.getBlock(), true);
25712585

25722586
if (CGM.getLangOpts().OpenMPIsTargetDevice &&
2573-
CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
2587+
CGM.isXteamSegmentedScanKernel()) {
25742588
if (CGM.isXteamScanPhaseOne)
25752589
EmitXteamScanSum(&S, *Args, CGM.getXteamRedBlockSize(*BigJumpLoopLD));
25762590
EmitBranch(DoneBB);

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ void CodeGenFunction::InitializeXteamRedCapturedVars(
423423

424424
assert(DScanStorageInst && "Device scan storage pointer cannot be null");
425425
CapturedVars.push_back(DScanStorageInst);
426-
if (CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
426+
if (CGM.isXteamSegmentedScanKernel()) {
427427
// Placeholder for d_segment_vals initialized to nullptr
428428
llvm::Value *DSegmentValsInst =
429429
Builder.CreateAlloca(RedVarType, nullptr, "d_segment_vals");
@@ -776,7 +776,7 @@ static llvm::Function *emitOutlinedFunctionPrologue(
776776
Ctx, Ctx.VoidPtrTy, ImplicitParamKind::CapturedContext);
777777
Args.emplace_back(DScanStorageVD);
778778
TargetArgs.emplace_back(DScanStorageVD);
779-
if (CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
779+
if (CGM.isXteamSegmentedScanKernel()) {
780780
VarDecl *DSegmentValsVD = ImplicitParamDecl::Create(
781781
Ctx, Ctx.VoidPtrTy, ImplicitParamKind::CapturedContext);
782782
Args.emplace_back(DSegmentValsVD);

clang/lib/CodeGen/CodeGenModule.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1899,7 +1899,13 @@ class CodeGenModule : public CodeGenTypeCache {
18991899
}
19001900

19011901
bool isXteamScanKernel() {
1902-
return getLangOpts().OpenMPTargetXteamScan && isXteamScanCandidate;
1902+
return (getLangOpts().OpenMPTargetXteamScan ||
1903+
getLangOpts().OpenMPTargetXteamNoLoopScan) &&
1904+
isXteamScanCandidate;
1905+
}
1906+
1907+
bool isXteamSegmentedScanKernel() {
1908+
return isXteamScanKernel() && !getLangOpts().OpenMPTargetXteamNoLoopScan;
19031909
}
19041910

19051911
/// If we are able to generate a NoLoop kernel for this directive, return

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6988,12 +6988,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
69886988
else
69896989
CmdArgs.push_back("-fno-openmp-target-xteam-scan");
69906990

6991-
if (Args.hasFlag(options::OPT_fopenmp_target_xteam_scan_segmented,
6992-
options::OPT_fno_openmp_target_xteam_scan_segmented,
6991+
if (Args.hasFlag(options::OPT_fopenmp_target_xteam_no_loop_scan,
6992+
options::OPT_fno_openmp_target_xteam_no_loop_scan,
69936993
false))
6994-
CmdArgs.push_back("-fopenmp-target-xteam-scan-segmented");
6994+
CmdArgs.push_back("-fopenmp-target-xteam-no-loop-scan");
69956995
else
6996-
CmdArgs.push_back("-fno-openmp-target-xteam-scan-segmented");
6996+
CmdArgs.push_back("-fno-openmp-target-xteam-no-loop-scan");
69976997
// When in OpenMP offloading mode with NVPTX target, forward
69986998
// cuda-mode flag
69996999
if (Args.hasFlag(options::OPT_fopenmp_cuda_mode,

clang/lib/Frontend/CompilerInvocation.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3836,10 +3836,10 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
38363836
else
38373837
GenerateArg(Consumer, OPT_fno_openmp_target_xteam_scan);
38383838

3839-
if (Opts.OpenMPTargetXteamScanSegmented)
3840-
GenerateArg(Consumer, OPT_fopenmp_target_xteam_scan_segmented);
3839+
if (Opts.OpenMPTargetXteamNoLoopScan)
3840+
GenerateArg(Consumer, OPT_fopenmp_target_xteam_no_loop_scan);
38413841
else
3842-
GenerateArg(Consumer, OPT_fno_openmp_target_xteam_scan_segmented);
3842+
GenerateArg(Consumer, OPT_fno_openmp_target_xteam_no_loop_scan);
38433843

38443844
if (Opts.OpenMPThreadSubscription)
38453845
GenerateArg(Consumer, OPT_fopenmp_assume_threads_oversubscription);
@@ -4377,9 +4377,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
43774377
Args.hasFlag(options::OPT_fopenmp_target_xteam_scan,
43784378
options::OPT_fno_openmp_target_xteam_scan, false);
43794379

4380-
Opts.OpenMPTargetXteamScanSegmented =
4381-
Args.hasFlag(options::OPT_fopenmp_target_xteam_scan_segmented,
4382-
options::OPT_fno_openmp_target_xteam_scan_segmented, false);
4380+
Opts.OpenMPTargetXteamNoLoopScan =
4381+
Args.hasFlag(options::OPT_fopenmp_target_xteam_no_loop_scan,
4382+
options::OPT_fno_openmp_target_xteam_no_loop_scan, false);
43834383

43844384
Opts.OpenMPKernelIO =
43854385
Args.hasFlag(options::OPT_fopenmp_allow_kernel_io,

0 commit comments

Comments
 (0)