@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
 class AMDGPUAtomicOptimizerImpl
     : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
 private:
+  Function &F;
   SmallVector<ReplacementInfo, 8> ToReplace;
-  const UniformityInfo *UA;
-  const DataLayout *DL;
+  const UniformityInfo &UA;
+  const DataLayout &DL;
   DomTreeUpdater &DTU;
-  const GCNSubtarget *ST;
+  const GCNSubtarget &ST;
   bool IsPixelShader;
   ScanOptions ScanImpl;
 
@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
 public:
   AMDGPUAtomicOptimizerImpl() = delete;
 
-  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
-                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
-                            bool IsPixelShader, ScanOptions ScanImpl)
-      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
+                            ScanOptions ScanImpl)
+      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
         ScanImpl(ScanImpl) {}
 
-  bool run(Function &F);
+  bool run();
 
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
     return false;
   }
 
-  const UniformityInfo *UA =
-      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  const DataLayout *DL = &F.getDataLayout();
+  const UniformityInfo &UA =
+      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
-  DominatorTreeWrapperPass *const DTW =
+  DominatorTreeWrapperPass *DTW =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                      DomTreeUpdater::UpdateStrategy::Lazy);
 
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
-  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
 
-  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
-  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
-      .run(F);
+  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
 }
 
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
-
-  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
-  const DataLayout *DL = &F.getDataLayout();
+  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
 
   DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                      DomTreeUpdater::UpdateStrategy::Lazy);
-  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
-
-  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
 
-  bool IsChanged =
-      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
-          .run(F);
+  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
 
   if (!IsChanged) {
     return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
   return PA;
 }
 
-bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+bool AMDGPUAtomicOptimizerImpl::run() {
 
   // Scan option None disables the Pass
   if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
 
   // If the pointer operand is divergent, then each lane is doing an atomic
   // operation on a different address, and we cannot optimize that.
-  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
     return;
   }
 
-  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
 
   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget (for DPP strategy), and the atomic
   // operation is 32 or 64 bits.
   if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
       return;
 
     if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
 
   const unsigned ValIdx = 0;
 
-  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
 
   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget (for DPP strategy), and the atomic
   // operation is 32 or 64 bits.
   if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
       return;
 
     if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
   // If any of the other arguments to the intrinsic are divergent, we can't
   // optimize the operation.
   for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
-    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+    if (UA.isDivergentUse(I.getOperandUse(Idx))) {
       return;
     }
   }
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   }
 
   // Reduce within each pair of rows (i.e. 32 lanes).
-  assert(ST->hasPermLaneX16());
+  assert(ST.hasPermLaneX16());
   Value *Permlanex16Call =
       B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                         {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                          B.getInt32(0), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
-  if (ST->isWave32()) {
+  if (ST.isWave32()) {
     return V;
   }
 
-  if (ST->hasPermLane64()) {
+  if (ST.hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
     Value *Permlane64Call =
         B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                      {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                       B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
   }
-  if (ST->hasDPPBroadcasts()) {
+  if (ST.hasDPPBroadcasts()) {
     // GFX9 has DPP row broadcast operations.
     V = buildNonAtomicBinOp(
         B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
-    assert(ST->hasPermLaneX16());
+    assert(ST.hasPermLaneX16());
     Value *PermX =
         B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                           {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                     B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
     V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
-    if (!ST->isWave32()) {
+    if (!ST.isWave32()) {
       // Combine lane 31 into lanes 32..63.
       Value *const Lane31 = B.CreateIntrinsic(
           AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
       M, Intrinsic::amdgcn_update_dpp, AtomicTy);
-  if (ST->hasDPPWavefrontShifts()) {
+  if (ST.hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
     V = B.CreateCall(UpdateDPP,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
     V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                  B.getInt32(16), V});
 
-    if (!ST->isWave32()) {
+    if (!ST.isWave32()) {
       // Copy the old lane 31 to the new lane 32.
       V = B.CreateCall(
           WriteLane,
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
     IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
     Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
   auto *Ty = I.getType();
-  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
   auto *EntryBB = I.getParent();
   auto NeedResult = !I.use_empty();
 
@@ -698,15 +690,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
-  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
   Value *V = I.getOperand(ValIdx);
 
   // We need to know how many lanes are active within the wavefront, and we do
   // this by doing a ballot of active lanes.
-  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
   CallInst *const Ballot =
       B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
 
@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
   Value *Mbcnt;
-  if (ST->isWave32()) {
+  if (ST.isWave32()) {
     Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                               {Ballot, B.getInt32(0)});
   } else {
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // that they can correctly contribute to the final result.
       NewV =
           B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-      if (!NeedResult && ST->hasPermLaneX16()) {
+      if (!NeedResult && ST.hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         // Read the value from the last lane, which has accumulated the values
         // of each active lane in the wavefront. This will be our new value
         // which we will provide to the atomic operation.
-        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
       }
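
The pattern this diff applies to AMDGPUAtomicOptimizerImpl can be sketched in isolation: pointer members become reference members bound once in the constructor, derived state (the data layout, the pixel-shader flag) is computed from the Function in the initializer list, and run() drops its Function parameter because the function is now a member. The stand-alone sketch below is a minimal illustration under that reading, using hypothetical stand-in types (Subtarget, Analysis, Optimizer), not the LLVM classes.

// Minimal stand-alone sketch of the pointer-to-reference member refactor.
// Subtarget, Analysis, Function, and Optimizer are hypothetical stand-ins,
// not the LLVM types touched by the diff above.
#include <iostream>
#include <string>

struct Subtarget {
  bool hasDPP() const { return true; }
};

struct Analysis {
  bool divergent(int Operand) const { return Operand % 2 != 0; }
};

struct Function {
  std::string CallingConv;
  int DataLayoutBits = 64;
};

class Optimizer {
  // References bound once at construction: there is no null state to check,
  // and facts derived from F are computed here instead of being passed in.
  Function &F;
  const Analysis &UA;
  const Subtarget &ST;
  const int DLBits;
  const bool IsPixelShader;

public:
  Optimizer() = delete;
  Optimizer(Function &F, const Analysis &UA, const Subtarget &ST)
      : F(F), UA(UA), ST(ST), DLBits(F.DataLayoutBits),
        IsPixelShader(F.CallingConv == "amdgpu_ps") {}

  // run() no longer takes the Function: it is already a member.
  bool run() const {
    if (!ST.hasDPP() || UA.divergent(/*Operand=*/2))
      return false;
    return DLBits == 64 && !IsPixelShader && !F.CallingConv.empty();
  }
};

int main() {
  Function F{"amdgpu_cs"};
  Analysis UA;
  Subtarget ST;
  // Mirrors the call sites in the diff: construct with references, then run().
  std::cout << std::boolalpha << Optimizer(F, UA, ST).run() << '\n';
  return 0;
}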