From 9822d9b7f86085344daeaea7764d1f6c8ff932a9 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Fri, 8 Aug 2025 18:37:07 -0500 Subject: [PATCH 1/4] Limit Alloca->LDS promotion based on speculations as to eventual register pressure --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index f226c7f381aa2..0dd327207fed6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -27,7 +27,9 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstSimplifyFolder.h" @@ -36,6 +38,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -45,6 +48,9 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <unordered_map> +#include <unordered_set> + #define DEBUG_TYPE "amdgpu-promote-alloca" using namespace llvm; @@ -100,6 +106,14 @@ class AMDGPUPromoteAllocaImpl { unsigned VGPRBudgetRatio; unsigned MaxVectorRegs; + std::unordered_map<BasicBlock *, std::unordered_set<Instruction *>> + SGPRLiveIns; + size_t getSGPRPressureEstimate(AllocaInst &I); + + std::unordered_map<BasicBlock *, std::unordered_set<Instruction *>> + VGPRLiveIns; + size_t getVGPRPressureEstimate(AllocaInst &I); + bool IsAMDGCN = false; bool IsAMDHSA = false; @@ -1471,9 +1485,83 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { return true; } +size_t AMDGPUPromoteAllocaImpl::getSGPRPressureEstimate(AllocaInst &I) { + Function &F = *I.getParent()->getParent(); + size_t MaxLive = 0; + for (BasicBlock *BB : post_order(&F)) { + if 
(SGPRLiveIns.count(BB)) + continue; + + std::unordered_set<Instruction *> CurrentlyLive; + for (BasicBlock *SuccBB : successors(BB)) + if (SGPRLiveIns.count(SuccBB)) + for (const auto &R : SGPRLiveIns[SuccBB]) + CurrentlyLive.insert(R); + + for (auto RIt = BB->rbegin(); RIt != BB->rend(); RIt++) { + if (&*RIt == &I) + return (MaxLive + CurrentlyLive.size()) / 2; + + MaxLive = std::max(MaxLive, CurrentlyLive.size()); + + for (auto &Op : RIt->operands()) + if (!Op.get()->getType()->isVectorTy()) + if (Instruction *U = dyn_cast<Instruction>(Op)) + CurrentlyLive.insert(U); + + if (!RIt->getType()->isVectorTy()) + CurrentlyLive.erase(&*RIt); + } + } + + llvm_unreachable("Woops, we fell off the edge of the world. Bye bye."); +} + +size_t AMDGPUPromoteAllocaImpl::getVGPRPressureEstimate(AllocaInst &I) { + Function &F = *I.getParent()->getParent(); + size_t MaxLive = 0; + for (BasicBlock *BB : post_order(&F)) { + if (VGPRLiveIns.count(BB)) + continue; + + std::unordered_set<Instruction *> CurrentlyLive; + for (BasicBlock *SuccBB : successors(BB)) + if (VGPRLiveIns.count(SuccBB)) + for (const auto &R : VGPRLiveIns[SuccBB]) + CurrentlyLive.insert(R); + + for (auto RIt = BB->rbegin(); RIt != BB->rend(); RIt++) { + if (&*RIt == &I) + return (MaxLive + CurrentlyLive.size() / 2); + + MaxLive = std::max(MaxLive, CurrentlyLive.size()); + + for (auto &Op : RIt->operands()) + if (Op.get()->getType()->isVectorTy()) + if (Instruction *U = dyn_cast<Instruction>(Op)) + CurrentlyLive.insert(U); + + if (RIt->getType()->isVectorTy()) + CurrentlyLive.erase(&*RIt); + } + } + + llvm_unreachable("Woops, we fell off the edge of the world. Bye bye."); +} + // FIXME: Should try to pick the most likely to be profitable allocas first. 
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS) { + const unsigned SGPRPressureLimit = AMDGPU::SGPR_32RegClass.getNumRegs(); + const unsigned VGPRPressureLimit = AMDGPU::VGPR_32RegClass.getNumRegs(); + + if (getSGPRPressureEstimate(I) < SGPRPressureLimit && + getVGPRPressureEstimate(I) < VGPRPressureLimit) { + LLVM_DEBUG(dbgs() << "Declining to promote " << I + << " to LDS since pressure is relatively low.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); if (DisablePromoteAllocaToLDS) { From 7b86ccc794e1aed371da1d75c430d91d71c09d08 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Fri, 8 Aug 2025 19:00:23 -0500 Subject: [PATCH 2/4] Limit based on high not low register pressure --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 0dd327207fed6..9108c86750fc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1500,7 +1500,7 @@ size_t AMDGPUPromoteAllocaImpl::getSGPRPressureEstimate(AllocaInst &I) { for (auto RIt = BB->rbegin(); RIt != BB->rend(); RIt++) { if (&*RIt == &I) - return (MaxLive + CurrentlyLive.size()) / 2; + return MaxLive; MaxLive = std::max(MaxLive, CurrentlyLive.size()); @@ -1532,7 +1532,7 @@ size_t AMDGPUPromoteAllocaImpl::getVGPRPressureEstimate(AllocaInst &I) { for (auto RIt = BB->rbegin(); RIt != BB->rend(); RIt++) { if (&*RIt == &I) - return (MaxLive + CurrentlyLive.size() / 2); + return MaxLive; MaxLive = std::max(MaxLive, CurrentlyLive.size()); @@ -1555,10 +1555,10 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, const unsigned SGPRPressureLimit = AMDGPU::SGPR_32RegClass.getNumRegs(); const unsigned VGPRPressureLimit = AMDGPU::VGPR_32RegClass.getNumRegs(); - if (getSGPRPressureEstimate(I) < 
SGPRPressureLimit && - getVGPRPressureEstimate(I) < VGPRPressureLimit) { + if (getSGPRPressureEstimate(I) > SGPRPressureLimit || + getVGPRPressureEstimate(I) > VGPRPressureLimit) { LLVM_DEBUG(dbgs() << "Declining to promote " << I - << " to LDS since pressure is relatively low.\n"); + << " to LDS since pressure is relatively high.\n"); return false; } From 1d3e1c156cf3d7322135ec6473dbbcb79747b635 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 11 Aug 2025 14:56:23 -0500 Subject: [PATCH 3/4] Add important qualifications. --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 9108c86750fc7..f93eed44b06cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1512,6 +1512,8 @@ size_t AMDGPUPromoteAllocaImpl::getSGPRPressureEstimate(AllocaInst &I) { if (!RIt->getType()->isVectorTy()) CurrentlyLive.erase(&*RIt); } + + SGPRLiveIns[BB] = CurrentlyLive; } llvm_unreachable("Woops, we fell off the edge of the world. Bye bye."); @@ -1544,6 +1546,8 @@ size_t AMDGPUPromoteAllocaImpl::getVGPRPressureEstimate(AllocaInst &I) { if (RIt->getType()->isVectorTy()) CurrentlyLive.erase(&*RIt); } + + VGPRLiveIns[BB] = CurrentlyLive; } llvm_unreachable("Woops, we fell off the edge of the world. 
Bye bye."); From be47b63fbd5847cee544edb4f15002c9ee95fb20 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 11 Aug 2025 14:57:02 -0500 Subject: [PATCH 4/4] Update llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp Co-authored-by: Tim Gymnich --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index f93eed44b06cb..fe41705beeb2a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1486,7 +1486,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } size_t AMDGPUPromoteAllocaImpl::getSGPRPressureEstimate(AllocaInst &I) { - Function &F = *I.getParent()->getParent(); + Function &F = *I.getFunction(); size_t MaxLive = 0; for (BasicBlock *BB : post_order(&F)) { if (SGPRLiveIns.count(BB))