@@ -4907,11 +4907,255 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
49074907 }
49084908}
49094909
4910+ static std::string generateUniqueName(CodeGenModule &CGM,
4911+ llvm::StringRef Prefix, const Expr *Ref);
4912+
4913+ void CGOpenMPRuntime::emitPrivateReduction(
4914+ CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates,
4915+ const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) {
4916+
4917+ // Create a shared global variable (__shared_reduction_var) to accumulate the
4918+ // final result.
4919+ //
4920+ // Call __kmpc_barrier to synchronize threads before initialization.
4921+ //
4922+ // The master thread (thread_id == 0) initializes __shared_reduction_var
4923+ // with the identity value or initializer.
4924+ //
4925+ // Call __kmpc_barrier to synchronize before combining.
4926+ // For each i:
4927+ // - Thread enters critical section.
4928+ // - Reads its private value from LHSExprs[i].
4929+ // - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i],
4930+ // Privates[i]).
4931+ // - Exits critical section.
4932+ //
4933+ // Call __kmpc_barrier after combining.
4934+ //
4935+ // Each thread copies __shared_reduction_var[i] back to RHSExprs[i].
4936+ //
4937+ // Final __kmpc_barrier to synchronize after broadcasting
4938+ QualType PrivateType = Privates->getType();
4939+ llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
4940+
4941+ const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps);
4942+ std::string ReductionVarNameStr;
4943+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates->IgnoreParenCasts()))
4944+ ReductionVarNameStr =
4945+ generateUniqueName(CGM, DRE->getDecl()->getNameAsString(), Privates);
4946+ else
4947+ ReductionVarNameStr = "unnamed_priv_var";
4948+
4949+ // Create an internal shared variable
4950+ std::string SharedName =
4951+ CGM.getOpenMPRuntime().getName({"internal_pivate_", ReductionVarNameStr});
4952+ llvm::GlobalVariable *SharedVar = OMPBuilder.getOrCreateInternalVariable(
4953+ LLVMType, ".omp.reduction." + SharedName);
4954+
4955+ SharedVar->setAlignment(
4956+ llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
4957+
4958+ Address SharedResult =
4959+ CGF.MakeNaturalAlignRawAddrLValue(SharedVar, PrivateType).getAddress();
4960+
4961+ llvm::Value *ThreadId = getThreadID(CGF, Loc);
4962+ llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
4963+ llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
4964+
4965+ llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
4966+ llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
4967+
4968+ llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ(
4969+ ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
4970+ CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
4971+
4972+ CGF.EmitBlock(InitBB);
4973+
4974+ auto EmitSharedInit = [&]() {
4975+ if (UDR) { // Check if it's a User-Defined Reduction
4976+ if (const Expr *UDRInitExpr = UDR->getInitializer()) {
4977+ std::pair<llvm::Function *, llvm::Function *> FnPair =
4978+ getUserDefinedReduction(UDR);
4979+ llvm::Function *InitializerFn = FnPair.second;
4980+ if (InitializerFn) {
4981+ if (const auto *CE =
4982+ dyn_cast<CallExpr>(UDRInitExpr->IgnoreParenImpCasts())) {
4983+ const auto *OutDRE = cast<DeclRefExpr>(
4984+ cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
4985+ ->getSubExpr());
4986+ const VarDecl *OutVD = cast<VarDecl>(OutDRE->getDecl());
4987+
4988+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
4989+ LocalScope.addPrivate(OutVD, SharedResult);
4990+
4991+ (void)LocalScope.Privatize();
4992+ if (const auto *OVE = dyn_cast<OpaqueValueExpr>(
4993+ CE->getCallee()->IgnoreParenImpCasts())) {
4994+ CodeGenFunction::OpaqueValueMapping OpaqueMap(
4995+ CGF, OVE, RValue::get(InitializerFn));
4996+ CGF.EmitIgnoredExpr(CE);
4997+ } else {
4998+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
4999+ PrivateType.getQualifiers(),
5000+ /*IsInitializer=*/true);
5001+ }
5002+ } else {
5003+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
5004+ PrivateType.getQualifiers(),
5005+ /*IsInitializer=*/true);
5006+ }
5007+ } else {
5008+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
5009+ PrivateType.getQualifiers(),
5010+ /*IsInitializer=*/true);
5011+ }
5012+ } else {
5013+ // EmitNullInitialization handles default construction for C++ classes
5014+ // and zeroing for scalars, which is a reasonable default.
5015+ CGF.EmitNullInitialization(SharedResult, PrivateType);
5016+ }
5017+ return; // UDR initialization handled
5018+ }
5019+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates)) {
5020+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
5021+ if (const Expr *InitExpr = VD->getInit()) {
5022+ CGF.EmitAnyExprToMem(InitExpr, SharedResult,
5023+ PrivateType.getQualifiers(), true);
5024+ return;
5025+ }
5026+ }
5027+ }
5028+ CGF.EmitNullInitialization(SharedResult, PrivateType);
5029+ };
5030+ EmitSharedInit();
5031+ CGF.Builder.CreateBr(InitEndBB);
5032+ CGF.EmitBlock(InitEndBB);
5033+
5034+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
5035+ CGM.getModule(), OMPRTL___kmpc_barrier),
5036+ BarrierArgs);
5037+
5038+ const Expr *ReductionOp = ReductionOps;
5039+ const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp);
5040+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
5041+ LValue LHSLV = CGF.EmitLValue(Privates);
5042+
5043+ auto EmitCriticalReduction = [&](auto ReductionGen) {
5044+ std::string CriticalName = getName({"reduction_critical"});
5045+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
5046+ };
5047+
5048+ if (CurrentUDR) {
5049+ // Handle user-defined reduction.
5050+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
5051+ Action.Enter(CGF);
5052+ std::pair<llvm::Function *, llvm::Function *> FnPair =
5053+ getUserDefinedReduction(CurrentUDR);
5054+ if (FnPair.first) {
5055+ if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
5056+ const auto *OutDRE = cast<DeclRefExpr>(
5057+ cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
5058+ ->getSubExpr());
5059+ const auto *InDRE = cast<DeclRefExpr>(
5060+ cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
5061+ ->getSubExpr());
5062+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
5063+ LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
5064+ SharedLV.getAddress());
5065+ LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
5066+ LHSLV.getAddress());
5067+ (void)LocalScope.Privatize();
5068+ emitReductionCombiner(CGF, ReductionOp);
5069+ }
5070+ }
5071+ };
5072+ EmitCriticalReduction(ReductionGen);
5073+ } else {
5074+ // Handle built-in reduction operations.
5075+ #ifndef NDEBUG
5076+ const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
5077+ if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
5078+ ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
5079+
5080+ const Expr *AssignRHS = nullptr;
5081+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
5082+ if (BinOp->getOpcode() == BO_Assign)
5083+ AssignRHS = BinOp->getRHS();
5084+ } else if (const auto *OpCall =
5085+ dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
5086+ if (OpCall->getOperator() == OO_Equal)
5087+ AssignRHS = OpCall->getArg(1);
5088+ }
5089+
5090+ assert(AssignRHS &&
5091+ "Private Variable Reduction : Invalid ReductionOp expression");
5092+ #endif
5093+
5094+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
5095+ Action.Enter(CGF);
5096+ const auto *OmpOutDRE =
5097+ dyn_cast<DeclRefExpr>(LHSExprs->IgnoreParenImpCasts());
5098+ const auto *OmpInDRE =
5099+ dyn_cast<DeclRefExpr>(RHSExprs->IgnoreParenImpCasts());
5100+ assert(
5101+ OmpOutDRE && OmpInDRE &&
5102+ "Private Variable Reduction : LHSExpr/RHSExpr must be DeclRefExprs");
5103+ const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
5104+ const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
5105+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
5106+ LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
5107+ LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
5108+ (void)LocalScope.Privatize();
5109+ // Emit the actual reduction operation
5110+ CGF.EmitIgnoredExpr(ReductionOp);
5111+ };
5112+ EmitCriticalReduction(ReductionGen);
5113+ }
5114+
5115+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
5116+ CGM.getModule(), OMPRTL___kmpc_barrier),
5117+ BarrierArgs);
5118+
5119+ // Broadcast final result
5120+ bool IsAggregate = PrivateType->isAggregateType();
5121+ LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType);
5122+ llvm::Value *FinalResultVal = nullptr;
5123+ Address FinalResultAddr = Address::invalid();
5124+
5125+ if (IsAggregate)
5126+ FinalResultAddr = SharedResult;
5127+ else
5128+ FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc);
5129+
5130+ LValue TargetLHSLV = CGF.EmitLValue(RHSExprs);
5131+ if (IsAggregate) {
5132+ CGF.EmitAggregateCopy(TargetLHSLV,
5133+ CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
5134+ PrivateType, AggValueSlot::DoesNotOverlap, false);
5135+ } else {
5136+ CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
5137+ }
5138+ // Final synchronization barrier
5139+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
5140+ CGM.getModule(), OMPRTL___kmpc_barrier),
5141+ BarrierArgs);
5142+
5143+ // Combiner with original list item
5144+ auto OriginalListCombiner = [&](CodeGenFunction &CGF,
5145+ PrePostActionTy &Action) {
5146+ Action.Enter(CGF);
5147+ emitSingleReductionCombiner(CGF, ReductionOps, Privates,
5148+ cast<DeclRefExpr>(LHSExprs),
5149+ cast<DeclRefExpr>(RHSExprs));
5150+ };
5151+ EmitCriticalReduction(OriginalListCombiner);
5152+ }
5153+
49105154void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
4911- ArrayRef<const Expr *> Privates ,
4912- ArrayRef<const Expr *> LHSExprs ,
4913- ArrayRef<const Expr *> RHSExprs ,
4914- ArrayRef<const Expr *> ReductionOps ,
5155+ ArrayRef<const Expr *> OrgPrivates ,
5156+ ArrayRef<const Expr *> OrgLHSExprs ,
5157+ ArrayRef<const Expr *> OrgRHSExprs ,
5158+ ArrayRef<const Expr *> OrgReductionOps ,
49155159 ReductionOptionsTy Options) {
49165160 if (!CGF.HaveInsertPoint())
49175161 return;
@@ -4958,10 +5202,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
49585202
49595203 if (SimpleReduction) {
49605204 CodeGenFunction::RunCleanupsScope Scope(CGF);
4961- const auto *IPriv = Privates .begin();
4962- const auto *ILHS = LHSExprs .begin();
4963- const auto *IRHS = RHSExprs .begin();
4964- for (const Expr *E : ReductionOps ) {
5205+ const auto *IPriv = OrgPrivates .begin();
5206+ const auto *ILHS = OrgLHSExprs .begin();
5207+ const auto *IRHS = OrgRHSExprs .begin();
5208+ for (const Expr *E : OrgReductionOps ) {
49655209 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
49665210 cast<DeclRefExpr>(*IRHS));
49675211 ++IPriv;
@@ -4971,6 +5215,26 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
49715215 return;
49725216 }
49735217
5218+ // Filter out shared reduction variables based on IsPrivateVarReduction flag.
5219+ // Only keep entries where the corresponding variable is not private.
5220+ SmallVector<const Expr *> FilteredPrivates, FilteredLHSExprs,
5221+ FilteredRHSExprs, FilteredReductionOps;
5222+ for (unsigned I : llvm::seq<unsigned>(
5223+ std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) {
5224+ if (!Options.IsPrivateVarReduction[I]) {
5225+ FilteredPrivates.emplace_back(OrgPrivates[I]);
5226+ FilteredLHSExprs.emplace_back(OrgLHSExprs[I]);
5227+ FilteredRHSExprs.emplace_back(OrgRHSExprs[I]);
5228+ FilteredReductionOps.emplace_back(OrgReductionOps[I]);
5229+ }
5230+ }
5231+ // Wrap filtered vectors in ArrayRef for downstream shared reduction
5232+ // processing.
5233+ ArrayRef<const Expr *> Privates = FilteredPrivates;
5234+ ArrayRef<const Expr *> LHSExprs = FilteredLHSExprs;
5235+ ArrayRef<const Expr *> RHSExprs = FilteredRHSExprs;
5236+ ArrayRef<const Expr *> ReductionOps = FilteredReductionOps;
5237+
49745238 // 1. Build a list of reduction variables.
49755239 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
49765240 auto Size = RHSExprs.size();
@@ -5162,7 +5426,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
51625426 } else {
51635427 // Emit as a critical region.
51645428 auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
5165- const Expr *, const Expr *) {
5429+ const Expr *, const Expr *) {
51665430 CGOpenMPRuntime &RT = CGF.CGM.getOpenMPRuntime();
51675431 std::string Name = RT.getName({"atomic_reduction"});
51685432 RT.emitCriticalRegion(
@@ -5209,6 +5473,16 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
52095473
52105474 CGF.EmitBranch(DefaultBB);
52115475 CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
5476+ assert(OrgLHSExprs.size() == OrgPrivates.size() &&
5477+ "PrivateVarReduction: Privates size mismatch");
5478+ assert(OrgLHSExprs.size() == OrgReductionOps.size() &&
5479+ "PrivateVarReduction: ReductionOps size mismatch");
5480+ for (unsigned I : llvm::seq<unsigned>(
5481+ std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) {
5482+ if (Options.IsPrivateVarReduction[I])
5483+ emitPrivateReduction(CGF, Loc, OrgPrivates[I], OrgLHSExprs[I],
5484+ OrgRHSExprs[I], OrgReductionOps[I]);
5485+ }
52125486}
52135487
52145488/// Generates unique name for artificial threadprivate variables.
0 commit comments