Skip to content

Commit a646234

Browse files
animeshk-amdronlieb
authored and committed
[OpenMP][Clang] Codegen support for the segmented Xteam Scan
When the total number of threads is less than the loop trip count, the option `-fopenmp-target-xteam-scan-segmented` can be used to trigger segmented cross-team scan kernel codegen. The kernel bodies for both phases of the Xteam scan will contain `for` loops that iterate over the segment of the iteration space for which a single thread computes the scan. For an original loop trip count N (which is the number of elements in the input array) and a total of T threads, each thread is assigned a segment of size N/T on which it computes a sequential scan. After that, these sequentially computed results are fed into the existing cross-team scan machinery. More details: https://confluence.amd.com/x/mu6HJg Change-Id: I60354508ddd41e2fd2404547a404feae6f3bbb66
1 parent a62b47b commit a646234

File tree

16 files changed

+4611
-74
lines changed

16 files changed

+4611
-74
lines changed

clang/include/clang/Basic/LangOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ LANGOPT(OpenMPTargetXteamReduction , 1, 1, "Use cross-team code generation techn
283283
LANGOPT(OpenMPTargetFastReduction , 1, 0, "Use fast reduction code generation technique.")
284284
LANGOPT(OpenMPTargetMultiDevice , 1, 0, "Offload the iteration space of a single target region across multiple GPU devices.")
285285
LANGOPT(OpenMPTargetXteamScan , 1, 0, "Use the cross-team scan code generation technique.")
286+
LANGOPT(OpenMPTargetXteamScanSegmented , 1, 0, "Use the segmented cross-team scan code generation technique.")
286287
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
287288
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
288289
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")

clang/include/clang/Driver/Options.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3754,6 +3754,14 @@ def fno_openmp_target_xteam_scan : Flag<["-"], "fno-openmp-target-xteam-scan">,
37543754
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
37553755
HelpText<"Do not use the cross-team scan code generation technique.">,
37563756
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScan">>;
3757+
def fopenmp_target_xteam_scan_segmented : Flag<["-"], "fopenmp-target-xteam-scan-segmented">, Group<f_Group>,
3758+
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
3759+
HelpText<"Use the cross-team segmented scan code generation technique.">,
3760+
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScanSegmented">>;
3761+
def fno_openmp_target_xteam_scan_segmented : Flag<["-"], "fno-openmp-target-xteam-scan-segmented">, Group<f_Group>,
3762+
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
3763+
HelpText<"Do not use the cross-team segmented scan code generation technique.">,
3764+
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamScanSegmented">>;
37573765
def fopenmp_target_multi_device : Flag<["-"], "fopenmp-target-multi-device">, Group<f_Group>,
37583766
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option, FlangOption]>,
37593767
HelpText<"Enable code generation to emit support for multi device target region execution">,

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9462,11 +9462,17 @@ static void emitTargetCallKernelLaunch(
94629462
CodeGenModule::XteamRedVarMap &XteamRVM = CGF.CGM.getXteamRedVarMap(FStmt);
94639463
auto &XteamOrdVars = CGF.CGM.getXteamOrderedRedVar(FStmt);
94649464

9465-
// The Xteam Reduction kernels require two helper variables - `team_vals`
9465+
// Note Regarding the ExpectedNumArgs:
9466+
// 1. The Xteam Reduction kernels require two helper variables - `team_vals`
94669467
// array and `teams_done_ptr`.
9467-
// The Xteam Scan Reduction kernels require a third helper variable -
9468+
// 2. The Xteam Scan Reduction kernels require a third helper variable -
94689469
// `scan_storage` array.
9469-
int ExpectedNumArgs = CGF.CGM.isXteamScanKernel() ? 3 : 2;
9470+
// a. The segmented scan variant requires a fourth helper variable -
9471+
// `segment_vals`
9472+
size_t ExpectedNumArgs =
9473+
CGF.CGM.isXteamScanKernel()
9474+
? (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented ? 4 : 3)
9475+
: 2;
94709476
assert((CapturedVars.size() ==
94719477
CapturedCount + ExpectedNumArgs * XteamRVM.size()) &&
94729478
"Unexpected number of captured vars");
@@ -9534,19 +9540,24 @@ static void emitTargetCallKernelLaunch(
95349540
// For the Phase 2 of the Xteam Scan codegen, fresh memory allocation for
95359541
// reduction helper data structures is not needed. The helpers generated
95369542
// during the Phase 1 will be re-used here.
9537-
assert(CGF.CGM.ReductionVars.size() == 3 &&
9538-
"Xteam Scan reduction code-generates three helper variables");
9543+
assert(CGF.CGM.ReductionVars.size() == ExpectedNumArgs &&
9544+
"Insufficient number of helper variables for Xteam Scan reduction "
9545+
"code-generation");
95399546
addXTeamReductionComponentHelper(
95409547
CGF, CombinedInfo, CGF.CGM.ReductionVars[0]); // team_vals
95419548
addXTeamReductionComponentHelper(
95429549
CGF, CombinedInfo, CGF.CGM.ReductionVars[1]); // teams_done_ptr
95439550
addXTeamReductionComponentHelper(
95449551
CGF, CombinedInfo, CGF.CGM.ReductionVars[2]); // scan_storage
9552+
if (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented)
9553+
addXTeamReductionComponentHelper(
9554+
CGF, CombinedInfo, CGF.CGM.ReductionVars[3]); // segment_vals
95459555
} else {
95469556
for (; CapturedCount + ArgPos < CapturedVars.size();) {
95479557
// Process the pair of captured variables:
95489558
llvm::Value *DTeamValsInst = nullptr;
95499559
llvm::Value *DScanStorageInst = nullptr;
9560+
llvm::Value *DSegmentValsInst = nullptr;
95509561

95519562
assert(CapturedCount + ArgPos < CapturedVars.size() &&
95529563
"Xteam reduction argument position out of bounds");
@@ -9594,7 +9605,7 @@ static void emitTargetCallKernelLaunch(
95949605
XteamRedNumTeamsFromClauseVal ? XteamRedNumTeamsFromClauseVal
95959606
: XteamRedNumTeamsFromOccupancy,
95969607
CGF.Builder.CreateIntCast(
9597-
OMPRuntime->emitNumThreadsForTargetDirective(CGF, D),
9608+
CGF.Builder.getInt32(CGF.CGM.getXteamRedBlockSize(D)),
95989609
CGF.Int64Ty, false),
95999610
"total_num_threads");
96009611
llvm::Value *StorageSize = CGF.Builder.CreateAdd(
@@ -9608,6 +9619,41 @@ static void emitTargetCallKernelLaunch(
96089619
OMPBuilder.getOrCreateRuntimeFunction(CGF.CGM.getModule(),
96099620
OMPRTL_omp_target_alloc),
96109621
TgtAllocArgsScan, "d_scan_storage");
9622+
if (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
9623+
// Emit the lower and upper bounds
9624+
const auto *LBDecl = cast<VarDecl>(
9625+
cast<DeclRefExpr>(
9626+
cast<OMPLoopDirective>(D).getLowerBoundVariable())
9627+
->getDecl());
9628+
CGF.EmitVarDecl(*LBDecl);
9629+
9630+
const auto *UBDecl = cast<VarDecl>(
9631+
cast<DeclRefExpr>(
9632+
cast<OMPLoopDirective>(D).getUpperBoundVariable())
9633+
->getDecl());
9634+
CGF.EmitVarDecl(*UBDecl);
9635+
const auto UBLValue = CGF.EmitLValue(cast<DeclRefExpr>(
9636+
cast<OMPLoopDirective>(D).getUpperBoundVariable()));
9637+
const auto LBLValue = CGF.EmitLValue(cast<DeclRefExpr>(
9638+
cast<OMPLoopDirective>(D).getLowerBoundVariable()));
9639+
// Emit SegmentValsSize = UBLValue - LBLValue + 1
9640+
llvm::Value *SegmentValsSize = CGF.Builder.CreateAdd(
9641+
CGF.Builder.CreateSub(
9642+
CGF.Builder.CreateLoad(UBLValue.getAddress()),
9643+
CGF.Builder.CreateLoad(LBLValue.getAddress())),
9644+
llvm::ConstantInt::get(CGF.Int32Ty, 1), "segment_vals_size");
9645+
9646+
llvm::Value *DSegmentValsSz = CGF.Builder.CreateMul(
9647+
RedVarTySz,
9648+
CGF.Builder.CreateIntCast(SegmentValsSize, CGF.Int64Ty,
9649+
/*isSigned*/ false),
9650+
"d_segment_vals_sz");
9651+
llvm::Value *TgtAllocArgsScan[] = {DSegmentValsSz, DevIdVal};
9652+
DSegmentValsInst = CGF.EmitRuntimeCall(
9653+
OMPBuilder.getOrCreateRuntimeFunction(
9654+
CGF.CGM.getModule(), OMPRTL_omp_target_alloc),
9655+
TgtAllocArgsScan, "d_segment_vals");
9656+
}
96119657
}
96129658
}
96139659
CGF.CGM.ReductionVars.push_back(DTeamValsInst);
@@ -9665,6 +9711,12 @@ static void emitTargetCallKernelLaunch(
96659711
++ArgPos;
96669712
CGF.CGM.ReductionVars.push_back(DScanStorageInst);
96679713
addXTeamReductionComponentHelper(CGF, CombinedInfo, DScanStorageInst);
9714+
if (CGF.CGM.getLangOpts().OpenMPTargetXteamScanSegmented) {
9715+
++ArgPos;
9716+
CGF.CGM.ReductionVars.push_back(DSegmentValsInst);
9717+
addXTeamReductionComponentHelper(CGF, CombinedInfo,
9718+
DSegmentValsInst);
9719+
}
96689720
}
96699721
// Advance to the next reduction variable in the pair:
96709722
++ArgPos;

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2906,7 +2906,12 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanSum(
29062906

29072907
if (SumType->isIntegerTy()) {
29082908
if (WarpSize == 64) {
2909-
if (BlockSize == 512)
2909+
if (BlockSize == 1024)
2910+
return CGF.EmitRuntimeCall(
2911+
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
2912+
OMPRTL___kmpc_xteams_i_16x64),
2913+
Args);
2914+
else if (BlockSize == 512)
29102915
return CGF.EmitRuntimeCall(
29112916
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
29122917
OMPRTL___kmpc_xteams_i_8x64),
@@ -2917,7 +2922,7 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanSum(
29172922
OMPRTL___kmpc_xteams_i_4x64),
29182923
Args);
29192924
else
2920-
llvm_unreachable("Block size should be 256 or 512.");
2925+
llvm_unreachable("Block size should be 256, 512 or 1024.");
29212926
} else if (WarpSize == 32) {
29222927
if (BlockSize == 512)
29232928
return CGF.EmitRuntimeCall(
@@ -2937,6 +2942,79 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanSum(
29372942
llvm_unreachable("No support for other types currently.");
29382943
}
29392944

2945+
// Emits a call to one of the `__kmpc_xteams_phase2_i_*` runtime entry points
// and returns the value produced by that call.
//
// The entry point is chosen from the target's warp size and `BlockSize`:
//   warp size 64: 1024 -> 16x64, 512 -> 8x64, 256 -> 4x64
//   warp size 32:  512 -> 16x32, 256 -> 8x32
// Any other combination hits llvm_unreachable.
//
// `Val` is consulted only for its LLVM type, which must be a 32- or 64-bit
// integer. `SegmentSize`, `DTeamVals`, `DScanStorage`, `DSegmentVals` and
// `ThreadStartIndex` are forwarded to the runtime call unchanged.
// `IsInclusiveScan` is forwarded as an i32 constant.
llvm::Value *CGOpenMPRuntimeGPU::getXteamScanPhaseTwo(
2946+
CodeGenFunction &CGF, llvm::Value *Val, llvm::Value *SegmentSize,
2947+
llvm::Value *DTeamVals, llvm::Value *DScanStorage,
2948+
llvm::Value *DSegmentVals, llvm::Value *ThreadStartIndex, int BlockSize,
2949+
bool IsInclusiveScan) {
2950+
// TODO handle more types
2951+
// Only the type of Val is used below; Val itself is never passed on.
llvm::Type *SumType = Val->getType();
2952+
assert(
2953+
(SumType->isIntegerTy() && (SumType->getPrimitiveSizeInBits() == 32 ||
2954+
SumType->getPrimitiveSizeInBits() == 64)) &&
2955+
"Unhandled type");
2956+
2957+
llvm::Type *Int32Ty = llvm::Type::getInt32Ty(CGM.getLLVMContext());
2958+
llvm::Type *Int64Ty = llvm::Type::getInt64Ty(CGM.getLLVMContext());
2959+
2960+
// Only the first element of the returned pair is passed to the runtime call;
// the second element is not used in this function.
std::pair<llvm::Value *, llvm::Value *> RfunPair =
2961+
getXteamRedFunctionPtrs(CGF, SumType);
2962+
// Zero constant whose width matches SumType.
// NOTE(review): presumably the initial/identity value for the scan - confirm
// against the runtime entry point's signature.
llvm::Value *ZeroVal = SumType->getPrimitiveSizeInBits() == 32
2963+
? llvm::ConstantInt::get(Int32Ty, 0)
2964+
: llvm::ConstantInt::get(Int64Ty, 0);
2965+
2966+
// The bool flag is materialized as an i32 constant (0 or 1).
llvm::Value *IsInclusiveScanVal =
2967+
llvm::ConstantInt::get(Int32Ty, IsInclusiveScan);
2968+
// Argument order as passed to every phase-2 entry point below.
llvm::Value *Args[] = {DScanStorage, SegmentSize, DTeamVals,
2969+
DSegmentVals, RfunPair.first, ZeroVal,
2970+
ThreadStartIndex, IsInclusiveScanVal};
2971+
2972+
unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
2973+
assert(WarpSize == 32 || WarpSize == 64);
2974+
2975+
// BlockSize must be a power of two in (0, MaxBlockSize]; the dispatch below
// further restricts it to the sizes that have a matching entry point.
assert(BlockSize > 0 && BlockSize <= llvm::omp::xteam_red::MaxBlockSize &&
2976+
"XTeam Reduction blocksize outside expected range");
2977+
assert(((BlockSize & (BlockSize - 1)) == 0) &&
2978+
"XTeam Reduction blocksize must be a power of two");
2979+
2980+
// Select the entry point by (WarpSize, BlockSize). Every path inside this
// `if` either returns or is unreachable.
if (SumType->isIntegerTy()) {
2981+
if (WarpSize == 64) {
2982+
if (BlockSize == 1024)
2983+
return CGF.EmitRuntimeCall(
2984+
OMPBuilder.getOrCreateRuntimeFunction(
2985+
CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_16x64),
2986+
Args);
2987+
else if (BlockSize == 512)
2988+
return CGF.EmitRuntimeCall(
2989+
OMPBuilder.getOrCreateRuntimeFunction(
2990+
CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_8x64),
2991+
Args);
2992+
else if (BlockSize == 256)
2993+
return CGF.EmitRuntimeCall(
2994+
OMPBuilder.getOrCreateRuntimeFunction(
2995+
CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_4x64),
2996+
Args);
2997+
else
2998+
llvm_unreachable("Block size should be 256, 512 or 1024.");
2999+
} else if (WarpSize == 32) {
3000+
// For warp size 32 only block sizes 512 and 256 are dispatched; 1024 is
// not handled here.
if (BlockSize == 512)
3001+
return CGF.EmitRuntimeCall(
3002+
OMPBuilder.getOrCreateRuntimeFunction(
3003+
CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_16x32),
3004+
Args);
3005+
else if (BlockSize == 256)
3006+
return CGF.EmitRuntimeCall(
3007+
OMPBuilder.getOrCreateRuntimeFunction(
3008+
CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_8x32),
3009+
Args);
3010+
else
3011+
llvm_unreachable("Block size should be 256 or 512.");
3012+
} else
3013+
llvm_unreachable("Warp size should be 32 or 64.");
3014+
}
3015+
// Reached only when SumType is not an integer type.
llvm_unreachable("No support for other types currently.");
3016+
}
3017+
29403018
bool CGOpenMPRuntimeGPU::needsHintsForFastFPAtomics() {
29413019
return getOffloadArch(CGM) == OffloadArch::GFX90a;
29423020
}

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,15 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
186186
llvm::Value *DScanStorage, llvm::Value *ThreadStartIndex,
187187
llvm::Value *NumTeams, int BlockSize, bool IsFast);
188188

189+
/// Emit calls to Cross-team scan Phase 2 entry points
190+
llvm::Value *getXteamScanPhaseTwo(CodeGenFunction &CGF, llvm::Value *Val,
191+
llvm::Value *SegmentSize,
192+
llvm::Value *DTeamVals,
193+
llvm::Value *DScanStorage,
194+
llvm::Value *DSegmentVals,
195+
llvm::Value *ThreadStartIndex,
196+
int BlockSize, bool IsInclusiveScan);
197+
189198
// Returns whether the hint expressions for an architecture should be
190199
// evaluated to decide which kind of atomic ops should be generated.
191200
bool needsHintsForFastFPAtomics() override final;

0 commit comments

Comments
 (0)