Commit 83cbb17

[AMDGPU] Refine AMDGPUAtomicOptimizerImpl class. NFC. (#118302)
Use references instead of pointers for most state and common up some of the initialization between the legacy and new pass manager paths.
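
The diff below shows the change concretely. As a standalone illustration, here is a minimal C++ sketch of the same refactoring idea, using hypothetical types rather than the real LLVM classes: analysis results become reference members bound once in the constructor, and per-function state such as the data layout and the pixel-shader flag is derived from the Function inside the constructor instead of being computed separately by each caller.

```cpp
// Minimal sketch of the refactoring pattern (hypothetical types, not the
// actual LLVM code). Reference members cannot be null and need no "->",
// and state derived from the Function is computed once in the constructor
// so both pass-manager entry points can share it.
#include <string>

struct Function {
  std::string CallingConv = "amdgpu_ps";
  unsigned PointerSizeInBits = 64; // stand-in for the real DataLayout
};

struct UniformityInfo {
  bool isDivergentUse(unsigned OperandIdx) const { return OperandIdx != 0; }
};

class OptimizerImpl {
  Function &F;              // the function being optimized, bound once
  const UniformityInfo &UA; // was a pointer; now a reference, never null
  unsigned PointerSize;     // derived from F instead of passed in
  bool IsPixelShader;       // likewise derived from F

public:
  OptimizerImpl() = delete;
  OptimizerImpl(Function &F, const UniformityInfo &UA)
      : F(F), UA(UA), PointerSize(F.PointerSizeInBits),
        IsPixelShader(F.CallingConv == "amdgpu_ps") {}

  // run() no longer takes the Function; it is already a member.
  bool run() const {
    return !UA.isDivergentUse(0) && (PointerSize == 64 || IsPixelShader);
  }
};

int main() {
  Function F;
  UniformityInfo UA;
  return OptimizerImpl(F, UA).run() ? 0 : 1;
}
```

Both entry points (legacy and new pass manager) can then construct the object the same way and simply call run(), which is what the patch does for AMDGPUAtomicOptimizerImpl.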
Parent: e776484

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 39 additions & 47 deletions
@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
 class AMDGPUAtomicOptimizerImpl
     : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
 private:
+  Function &F;
   SmallVector<ReplacementInfo, 8> ToReplace;
-  const UniformityInfo *UA;
-  const DataLayout *DL;
+  const UniformityInfo &UA;
+  const DataLayout &DL;
   DomTreeUpdater &DTU;
-  const GCNSubtarget *ST;
+  const GCNSubtarget &ST;
   bool IsPixelShader;
   ScanOptions ScanImpl;

@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
 public:
   AMDGPUAtomicOptimizerImpl() = delete;

-  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
-                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
-                            bool IsPixelShader, ScanOptions ScanImpl)
-      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
+                            ScanOptions ScanImpl)
+      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
         ScanImpl(ScanImpl) {}

-  bool run(Function &F);
+  bool run();

   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
     return false;
   }

-  const UniformityInfo *UA =
-      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  const DataLayout *DL = &F.getDataLayout();
+  const UniformityInfo &UA =
+      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

-  DominatorTreeWrapperPass *const DTW =
+  DominatorTreeWrapperPass *DTW =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                      DomTreeUpdater::UpdateStrategy::Lazy);

   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
-  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

-  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
-  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
-      .run(F);
+  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
 }

 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
-
-  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
-  const DataLayout *DL = &F.getDataLayout();
+  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);

   DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                      DomTreeUpdater::UpdateStrategy::Lazy);
-  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
-
-  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

-  bool IsChanged =
-      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
-          .run(F);
+  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();

   if (!IsChanged) {
     return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
   return PA;
 }

-bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+bool AMDGPUAtomicOptimizerImpl::run() {

   // Scan option None disables the Pass
   if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {

   // If the pointer operand is divergent, then each lane is doing an atomic
   // operation on a different address, and we cannot optimize that.
-  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
     return;
   }

-  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget (for DPP strategy), and the atomic
   // operation is 32 or 64 bits.
   if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
       return;

     if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {

   const unsigned ValIdx = 0;

-  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget (for DPP strategy), and the atomic
   // operation is 32 or 64 bits.
   if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
       return;

     if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
   // If any of the other arguments to the intrinsic are divergent, we can't
   // optimize the operation.
   for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
-    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+    if (UA.isDivergentUse(I.getOperandUse(Idx))) {
       return;
     }
   }
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   }

   // Reduce within each pair of rows (i.e. 32 lanes).
-  assert(ST->hasPermLaneX16());
+  assert(ST.hasPermLaneX16());
   Value *Permlanex16Call =
       B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                         {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                          B.getInt32(0), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
-  if (ST->isWave32()) {
+  if (ST.isWave32()) {
     return V;
   }

-  if (ST->hasPermLane64()) {
+  if (ST.hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
     Value *Permlane64Call =
         B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                      {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                       B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
   }
-  if (ST->hasDPPBroadcasts()) {
+  if (ST.hasDPPBroadcasts()) {
     // GFX9 has DPP row broadcast operations.
     V = buildNonAtomicBinOp(
         B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,

   // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
   // 48..63).
-  assert(ST->hasPermLaneX16());
+  assert(ST.hasPermLaneX16());
   Value *PermX =
       B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                         {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                           B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
   V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

-  if (!ST->isWave32()) {
+  if (!ST.isWave32()) {
     // Combine lane 31 into lanes 32..63.
     Value *const Lane31 = B.CreateIntrinsic(
         AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
       M, Intrinsic::amdgcn_update_dpp, AtomicTy);
-  if (ST->hasDPPWavefrontShifts()) {
+  if (ST.hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
     V = B.CreateCall(UpdateDPP,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
   V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                B.getInt32(16), V});

-  if (!ST->isWave32()) {
+  if (!ST.isWave32()) {
     // Copy the old lane 31 to the new lane 32.
     V = B.CreateCall(
         WriteLane,
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
     IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
     Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
   auto *Ty = I.getType();
-  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
   auto *EntryBB = I.getParent();
   auto NeedResult = !I.use_empty();

@@ -698,15 +690,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
-  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);

   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
   Value *V = I.getOperand(ValIdx);

   // We need to know how many lanes are active within the wavefront, and we do
   // this by doing a ballot of active lanes.
-  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
   CallInst *const Ballot =
       B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
   Value *Mbcnt;
-  if (ST->isWave32()) {
+  if (ST.isWave32()) {
     Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                               {Ballot, B.getInt32(0)});
   } else {
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // that they can correctly contribute to the final result.
       NewV =
           B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-      if (!NeedResult && ST->hasPermLaneX16()) {
+      if (!NeedResult && ST.hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
-       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+       Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
       NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
     }
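
A brief side note on why reference members suit this class, sketched in generic C++ (assumed names, not code from the patch): a type with reference members cannot be default-constructed or reassigned, which lines up with the explicit `AMDGPUAtomicOptimizerImpl() = delete;` kept in the diff above. The impl object only ever exists fully initialized for a single function.

```cpp
// Generic illustration (hypothetical types): reference members force the
// object to be constructed with all of its dependencies in place, so there
// is no null state to check for inside the member functions.
struct Analysis {
  int Budget = 1;
};

class Impl {
  Analysis &A; // must be bound at construction

public:
  Impl() = delete;                     // mirrors the deleted default constructor
  explicit Impl(Analysis &A) : A(A) {}
  // Copy assignment is implicitly deleted because of the reference member.
  bool run() const { return A.Budget > 0; } // uses the bound analysis directly
};

int main() {
  Analysis A;
  Impl I(A);   // OK: fully initialized
  // Impl J;   // would not compile: default constructor is deleted
  return I.run() ? 0 : 1;
}
```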
