
Conversation

@jayfoad (Contributor) commented Dec 2, 2024

Use references instead of pointers for most state and common up some of
the initialization between the legacy and new pass manager paths.

@llvmbot (Member) commented Dec 2, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

Use references instead of pointers for most state and common up some of
the initialization between the legacy and new pass manager paths.
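
To illustrate the shape of the change, here is a minimal sketch with abbreviated names (not the actual patch; includes and the DomTreeUpdater member are omitted for brevity). The Impl class now stores references, which can never be null, and derives the Function-dependent state (data layout, pixel-shader calling convention) once in its constructor instead of at each pass-manager call site:

```cpp
// Sketch only: the real class is AMDGPUAtomicOptimizerImpl in the diff below.
class OptimizerImplSketch {
  Function &F;
  const UniformityInfo &UA; // reference instead of pointer: no null checks
  const DataLayout &DL;     // derived from F, no longer passed in
  const GCNSubtarget &ST;
  bool IsPixelShader;       // derived from F's calling convention

public:
  OptimizerImplSketch(Function &F, const UniformityInfo &UA,
                      const GCNSubtarget &ST)
      : F(F), UA(UA), DL(F.getDataLayout()), ST(ST),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS) {}

  bool run(); // operates on the stored F rather than taking it as a parameter
};
```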


Full diff: https://github.com/llvm/llvm-project/pull/118302.diff

1 File Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (+39-47)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4ca1ae0499b93..c09c71c83fead7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
 class AMDGPUAtomicOptimizerImpl
     : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
 private:
+  Function &F;
   SmallVector<ReplacementInfo, 8> ToReplace;
-  const UniformityInfo *UA;
-  const DataLayout *DL;
+  const UniformityInfo &UA;
+  const DataLayout &DL;
   DomTreeUpdater &DTU;
-  const GCNSubtarget *ST;
+  const GCNSubtarget &ST;
   bool IsPixelShader;
   ScanOptions ScanImpl;
 
@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
 public:
   AMDGPUAtomicOptimizerImpl() = delete;
 
-  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
-                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
-                            bool IsPixelShader, ScanOptions ScanImpl)
-      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
+                            ScanOptions ScanImpl)
+      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
         ScanImpl(ScanImpl) {}
 
-  bool run(Function &F);
+  bool run();
 
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
     return false;
   }
 
-  const UniformityInfo *UA =
-      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  const DataLayout *DL = &F.getDataLayout();
+  const UniformityInfo &UA =
+      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
-  DominatorTreeWrapperPass *const DTW =
+  DominatorTreeWrapperPass *DTW =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                      DomTreeUpdater::UpdateStrategy::Lazy);
 
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
-  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
 
-  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
-  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
-      .run(F);
+  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
 }
 
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
-
-  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
-  const DataLayout *DL = &F.getDataLayout();
+  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
 
   DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                      DomTreeUpdater::UpdateStrategy::Lazy);
-  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
-
-  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
 
-  bool IsChanged =
-      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
-          .run(F);
+  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
 
   if (!IsChanged) {
     return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
   return PA;
 }
 
-bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+bool AMDGPUAtomicOptimizerImpl::run() {
 
   // Scan option None disables the Pass
   if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
 
   // If the pointer operand is divergent, then each lane is doing an atomic
   // operation on a different address, and we cannot optimize that.
-  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
     return;
   }
 
-  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
 
   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget (for DPP strategy), and the atomic
   // operation is 32 or 64 bits.
   if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
       return;
 
     if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
 
   const unsigned ValIdx = 0;
 
-  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
 
   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget (for DPP strategy), and the atomic
   // operation is 32 or 64 bits.
   if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
       return;
 
     if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
   // If any of the other arguments to the intrinsic are divergent, we can't
   // optimize the operation.
   for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
-    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+    if (UA.isDivergentUse(I.getOperandUse(Idx))) {
       return;
     }
   }
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   }
 
   // Reduce within each pair of rows (i.e. 32 lanes).
-  assert(ST->hasPermLaneX16());
+  assert(ST.hasPermLaneX16());
   Value *Permlanex16Call =
       B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                         {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                          B.getInt32(0), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
-  if (ST->isWave32()) {
+  if (ST.isWave32()) {
     return V;
   }
 
-  if (ST->hasPermLane64()) {
+  if (ST.hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
     Value *Permlane64Call =
         B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                      {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                       B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
   }
-  if (ST->hasDPPBroadcasts()) {
+  if (ST.hasDPPBroadcasts()) {
     // GFX9 has DPP row broadcast operations.
     V = buildNonAtomicBinOp(
         B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
-    assert(ST->hasPermLaneX16());
+    assert(ST.hasPermLaneX16());
     Value *PermX =
         B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                           {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                     B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
     V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
-    if (!ST->isWave32()) {
+    if (!ST.isWave32()) {
       // Combine lane 31 into lanes 32..63.
       Value *const Lane31 = B.CreateIntrinsic(
           AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
       M, Intrinsic::amdgcn_update_dpp, AtomicTy);
-  if (ST->hasDPPWavefrontShifts()) {
+  if (ST.hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
     V = B.CreateCall(UpdateDPP,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
     V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                  B.getInt32(16), V});
 
-    if (!ST->isWave32()) {
+    if (!ST.isWave32()) {
       // Copy the old lane 31 to the new lane 32.
       V = B.CreateCall(
           WriteLane,
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
     IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
     Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
   auto *Ty = I.getType();
-  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
   auto *EntryBB = I.getParent();
   auto NeedResult = !I.use_empty();
 
@@ -698,7 +690,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
-  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -706,7 +698,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   // We need to know how many lanes are active within the wavefront, and we do
   // this by doing a ballot of active lanes.
-  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
   CallInst *const Ballot =
       B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
 
@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
   Value *Mbcnt;
-  if (ST->isWave32()) {
+  if (ST.isWave32()) {
     Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                               {Ballot, B.getInt32(0)});
   } else {
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // that they can correctly contribute to the final result.
       NewV =
           B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-      if (!NeedResult && ST->hasPermLaneX16()) {
+      if (!NeedResult && ST.hasPermLaneX16()) {
         // On GFX10 the permlanex16 instruction helps us build a reduction
         // without too many readlanes and writelanes, which are generally bad
         // for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         // Read the value from the last lane, which has accumulated the values
         // of each active lane in the wavefront. This will be our new value
         // which we will provide to the atomic operation.
-        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
         NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                  {NewV, LastLaneIdx});
       }

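As a usage note on the design choice: with the Function-derived setup moved into the Impl constructor, both pass manager entry points reduce to the same one-line construction. A condensed sketch follows (the function and class names come from the diff above; retrieval of the analyses is elided, and the preserved-analyses handling is an assumption based on the surrounding code):

```cpp
// Legacy pass manager path (sketch): analyses come from the legacy wrapper passes.
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  // ... obtain UA, DTU and ST as shown in the diff above ...
  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}

// New pass manager path (sketch): analyses come from the FunctionAnalysisManager.
PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  // ... obtain UA, DTU and ST as shown in the diff above ...
  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
  if (!IsChanged)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>(); // assumption: DT is kept valid via DTU
  return PA;
}
```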
@jayfoad merged commit 83cbb17 into llvm:main on Dec 3, 2024 (10 checks passed).
@jayfoad deleted the refine-amdgpuatomicoptimizer branch on December 3, 2024 at 07:40.