[AMDGPU] Refine AMDGPUAtomicOptimizerImpl class. NFC. #118302
Merged
Conversation
Use references instead of pointers for most state and common up some of the initialization between the legacy and new pass manager paths.
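For context, a minimal sketch of the pattern follows, using a hypothetical ExampleOptimizerImpl class rather than the real one: pass state is held by reference (so it cannot be null and needs no null checks at use sites), and members that can be derived from the Function, such as the DataLayout and the pixel-shader flag, are computed once in the Impl constructor instead of separately in the legacy and new pass manager entry points.

// Minimal sketch of the refactoring pattern; ExampleOptimizerImpl is a
// hypothetical stand-in for the real AMDGPUAtomicOptimizerImpl.
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"

using namespace llvm;

class ExampleOptimizerImpl {
  Function &F;              // captured once; run() no longer takes a Function
  const UniformityInfo &UA; // reference instead of pointer: cannot be null
  const DataLayout &DL;     // derived from F, not passed in by each caller
  bool IsPixelShader;       // likewise derived from F in the constructor

public:
  ExampleOptimizerImpl(Function &F, const UniformityInfo &UA)
      : F(F), UA(UA), DL(F.getDataLayout()),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS) {}

  // Both pass manager paths now just construct the Impl and call run(),
  // with no per-path computation of DL or IsPixelShader.
  bool run();
};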
@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

Use references instead of pointers for most state and common up some of the initialization between the legacy and new pass manager paths.

Full diff: https://github.com/llvm/llvm-project/pull/118302.diff

1 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4ca1ae0499b93..c09c71c83fead7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
class AMDGPUAtomicOptimizerImpl
: public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
+ Function &F;
SmallVector<ReplacementInfo, 8> ToReplace;
- const UniformityInfo *UA;
- const DataLayout *DL;
+ const UniformityInfo &UA;
+ const DataLayout &DL;
DomTreeUpdater &DTU;
- const GCNSubtarget *ST;
+ const GCNSubtarget &ST;
bool IsPixelShader;
ScanOptions ScanImpl;
@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
public:
AMDGPUAtomicOptimizerImpl() = delete;
- AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
- DomTreeUpdater &DTU, const GCNSubtarget *ST,
- bool IsPixelShader, ScanOptions ScanImpl)
- : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+ AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+ DomTreeUpdater &DTU, const GCNSubtarget &ST,
+ ScanOptions ScanImpl)
+ : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+ IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
ScanImpl(ScanImpl) {}
- bool run(Function &F);
+ bool run();
void visitAtomicRMWInst(AtomicRMWInst &I);
void visitIntrinsicInst(IntrinsicInst &I);
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return false;
}
- const UniformityInfo *UA =
- &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- const DataLayout *DL = &F.getDataLayout();
+ const UniformityInfo &UA =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- DominatorTreeWrapperPass *const DTW =
+ DominatorTreeWrapperPass *DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
DomTreeUpdater::UpdateStrategy::Lazy);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
- return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}
PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
FunctionAnalysisManager &AM) {
-
- const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
- const DataLayout *DL = &F.getDataLayout();
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
DomTreeUpdater::UpdateStrategy::Lazy);
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
-
- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- bool IsChanged =
- AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
if (!IsChanged) {
return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
return PA;
}
-bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+bool AMDGPUAtomicOptimizerImpl::run() {
// Scan option None disables the Pass
if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+ if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
return;
}
- bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;
if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
const unsigned ValIdx = 0;
- const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;
if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+ if (UA.isDivergentUse(I.getOperandUse(Idx))) {
return;
}
}
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
}
// Reduce within each pair of rows (i.e. 32 lanes).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *Permlanex16Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(0),
B.getInt32(0), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
return V;
}
- if (ST->hasPermLane64()) {
+ if (ST.hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
Value *Permlane64Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
{Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
}
- if (ST->hasDPPBroadcasts()) {
+ if (ST.hasDPPBroadcasts()) {
// GFX9 has DPP row broadcast operations.
V = buildNonAtomicBinOp(
B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *PermX =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Combine lane 31 into lanes 32..63.
Value *const Lane31 = B.CreateIntrinsic(
AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
M, Intrinsic::amdgcn_update_dpp, AtomicTy);
- if (ST->hasDPPWavefrontShifts()) {
+ if (ST.hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
V = B.CreateCall(UpdateDPP,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
B.getInt32(16), V});
- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Copy the old lane 31 to the new lane 32.
V = B.CreateCall(
WriteLane,
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
auto *Ty = I.getType();
- auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
auto *EntryBB = I.getParent();
auto NeedResult = !I.use_empty();
@@ -698,7 +690,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
- [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+ [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -706,7 +698,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
- Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
CallInst *const Ballot =
B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
Value *Mbcnt;
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
{Ballot, B.getInt32(0)});
} else {
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// that they can correctly contribute to the final result.
NewV =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- if (!NeedResult && ST->hasPermLaneX16()) {
+ if (!NeedResult && ST.hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
// for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// Read the value from the last lane, which has accumulated the values
// of each active lane in the wavefront. This will be our new value
// which we will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
}
shiltian approved these changes on Dec 2, 2024
arsenm approved these changes on Dec 2, 2024