[AMDGPU] Refine AMDGPUAtomicOptimizerImpl class. NFC. #118302
Merged
Conversation
Use references instead of pointers for most state and common up some of the initialization between the legacy and new pass manager paths.
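For context, a minimal sketch of the pattern follows, using a hypothetical ExampleOptimizerImpl class rather than the real one: pass state is held by reference (so it cannot be null and needs no null checks at use sites), and members that can be derived from the Function, such as the DataLayout and the pixel-shader flag, are computed once in the Impl constructor instead of separately in the legacy and new pass manager entry points.

// Minimal sketch of the refactoring pattern; ExampleOptimizerImpl is a
// hypothetical stand-in for the real AMDGPUAtomicOptimizerImpl.
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"

using namespace llvm;

class ExampleOptimizerImpl {
  Function &F;              // captured once; run() no longer takes a Function
  const UniformityInfo &UA; // reference instead of pointer: cannot be null
  const DataLayout &DL;     // derived from F, not passed in by each caller
  bool IsPixelShader;       // likewise derived from F in the constructor

public:
  ExampleOptimizerImpl(Function &F, const UniformityInfo &UA)
      : F(F), UA(UA), DL(F.getDataLayout()),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS) {}

  // Both pass manager paths now just construct the Impl and call run(),
  // with no per-path computation of DL or IsPixelShader.
  bool run();
};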
@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

Use references instead of pointers for most state and common up some of the initialization between the legacy and new pass manager paths.

Full diff: https://github.com/llvm/llvm-project/pull/118302.diff

1 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4ca1ae0499b93..c09c71c83fead7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
class AMDGPUAtomicOptimizerImpl
: public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
+ Function &F;
SmallVector<ReplacementInfo, 8> ToReplace;
- const UniformityInfo *UA;
- const DataLayout *DL;
+ const UniformityInfo &UA;
+ const DataLayout &DL;
DomTreeUpdater &DTU;
- const GCNSubtarget *ST;
+ const GCNSubtarget &ST;
bool IsPixelShader;
ScanOptions ScanImpl;
@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
public:
AMDGPUAtomicOptimizerImpl() = delete;
- AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
- DomTreeUpdater &DTU, const GCNSubtarget *ST,
- bool IsPixelShader, ScanOptions ScanImpl)
- : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+ AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+ DomTreeUpdater &DTU, const GCNSubtarget &ST,
+ ScanOptions ScanImpl)
+ : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+ IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
ScanImpl(ScanImpl) {}
- bool run(Function &F);
+ bool run();
void visitAtomicRMWInst(AtomicRMWInst &I);
void visitIntrinsicInst(IntrinsicInst &I);
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return false;
}
- const UniformityInfo *UA =
- &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- const DataLayout *DL = &F.getDataLayout();
+ const UniformityInfo &UA =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- DominatorTreeWrapperPass *const DTW =
+ DominatorTreeWrapperPass *DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
DomTreeUpdater::UpdateStrategy::Lazy);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
- return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}
PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
FunctionAnalysisManager &AM) {
-
- const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
- const DataLayout *DL = &F.getDataLayout();
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
DomTreeUpdater::UpdateStrategy::Lazy);
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
-
- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- bool IsChanged =
- AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
if (!IsChanged) {
return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
return PA;
}
-bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+bool AMDGPUAtomicOptimizerImpl::run() {
// Scan option None disables the Pass
if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+ if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
return;
}
- bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;
if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
const unsigned ValIdx = 0;
- const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;
if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+ if (UA.isDivergentUse(I.getOperandUse(Idx))) {
return;
}
}
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
}
// Reduce within each pair of rows (i.e. 32 lanes).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *Permlanex16Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(0),
B.getInt32(0), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
return V;
}
- if (ST->hasPermLane64()) {
+ if (ST.hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
Value *Permlane64Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
{Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
}
- if (ST->hasDPPBroadcasts()) {
+ if (ST.hasDPPBroadcasts()) {
// GFX9 has DPP row broadcast operations.
V = buildNonAtomicBinOp(
B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *PermX =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Combine lane 31 into lanes 32..63.
Value *const Lane31 = B.CreateIntrinsic(
AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
M, Intrinsic::amdgcn_update_dpp, AtomicTy);
- if (ST->hasDPPWavefrontShifts()) {
+ if (ST.hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
V = B.CreateCall(UpdateDPP,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
B.getInt32(16), V});
- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Copy the old lane 31 to the new lane 32.
V = B.CreateCall(
WriteLane,
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
auto *Ty = I.getType();
- auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
auto *EntryBB = I.getParent();
auto NeedResult = !I.use_empty();
@@ -698,7 +690,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
- [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+ [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -706,7 +698,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
- Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
CallInst *const Ballot =
B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
Value *Mbcnt;
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
{Ballot, B.getInt32(0)});
} else {
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// that they can correctly contribute to the final result.
NewV =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- if (!NeedResult && ST->hasPermLaneX16()) {
+ if (!NeedResult && ST.hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
// for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// Read the value from the last lane, which has accumulated the values
// of each active lane in the wavefront. This will be our new value
// which we will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
}
shiltian approved these changes on Dec 2, 2024
arsenm approved these changes on Dec 2, 2024