llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp: 86 changes (39 additions, 47 deletions)
@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
class AMDGPUAtomicOptimizerImpl
: public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
+ Function &F;
SmallVector<ReplacementInfo, 8> ToReplace;
- const UniformityInfo *UA;
- const DataLayout *DL;
+ const UniformityInfo &UA;
+ const DataLayout &DL;
DomTreeUpdater &DTU;
- const GCNSubtarget *ST;
+ const GCNSubtarget &ST;
bool IsPixelShader;
ScanOptions ScanImpl;

@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
public:
AMDGPUAtomicOptimizerImpl() = delete;

- AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
- DomTreeUpdater &DTU, const GCNSubtarget *ST,
- bool IsPixelShader, ScanOptions ScanImpl)
- : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+ AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+ DomTreeUpdater &DTU, const GCNSubtarget &ST,
+ ScanOptions ScanImpl)
+ : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+ IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
ScanImpl(ScanImpl) {}

- bool run(Function &F);
+ bool run();

void visitAtomicRMWInst(AtomicRMWInst &I);
void visitIntrinsicInst(IntrinsicInst &I);
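Note: the constructor rewrite above is the heart of the cleanup. The Function and its analyses are bound once as references, and per-function state (the DataLayout and the pixel-shader flag) is derived inside the constructor instead of being threaded through by every caller. A minimal standalone sketch of the pattern, using hypothetical stand-in types rather than the real LLVM classes:

```cpp
// Sketch only: stand-ins for the LLVM types, not the actual API.
struct DataLayout {};
struct UniformityInfo {};
struct Function {
  const DataLayout &getDataLayout() const;
};

class OptimizerImpl {
  Function &F;              // bound once at construction
  const UniformityInfo &UA; // a reference cannot be null, unlike a pointer
  const DataLayout &DL;     // derived from F rather than passed in separately

public:
  OptimizerImpl(Function &F, const UniformityInfo &UA)
      : F(F), UA(UA), DL(F.getDataLayout()) {}

  bool run(); // operates on the stored F, so run() needs no parameters
};
```

Callers then shrink to a single expression, as the runOnFunction change below shows.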
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return false;
}

- const UniformityInfo *UA =
- &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- const DataLayout *DL = &F.getDataLayout();
+ const UniformityInfo &UA =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

- DominatorTreeWrapperPass *const DTW =
+ DominatorTreeWrapperPass *DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
DomTreeUpdater::UpdateStrategy::Lazy);

const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
- return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
FunctionAnalysisManager &AM) {

- const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
- const DataLayout *DL = &F.getDataLayout();
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);

DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
DomTreeUpdater::UpdateStrategy::Lazy);
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

- bool IsChanged =
- AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();

if (!IsChanged) {
return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
return PA;
}

- bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+ bool AMDGPUAtomicOptimizerImpl::run() {

// Scan option None disables the Pass
if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {

// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+ if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
return;
}

- bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;

if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {

const unsigned ValIdx = 0;

- const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;

if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+ if (UA.isDivergentUse(I.getOperandUse(Idx))) {
return;
}
}
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
}

// Reduce within each pair of rows (i.e. 32 lanes).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *Permlanex16Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(0),
B.getInt32(0), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
return V;
}

- if (ST->hasPermLane64()) {
+ if (ST.hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
Value *Permlane64Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
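Note: as a rough mental model, the sequence above performs a wavefront-wide reduction in log2(width) combining steps; the DPP row operations cover the small strides, and permlanex16/permlane64 act as the exchanges at the 16- and 32-lane strides. A scalar sketch of that butterfly shape, assuming integer add stands in for the atomic's binop (an illustration, not the emitted code):

```cpp
#include <array>
#include <cstdint>

// Model a 64-lane butterfly reduction on the host (sketch only).
uint32_t reduceModel(std::array<uint32_t, 64> Lanes) {
  for (unsigned Stride = 1; Stride < 64; Stride *= 2) {
    std::array<uint32_t, 64> Prev = Lanes; // all lanes exchange in lockstep
    for (unsigned L = 0; L < 64; ++L)
      Lanes[L] = Prev[L] + Prev[L ^ Stride]; // combine with stride partner
  }
  return Lanes[63]; // every lane now holds the sum of all 64 inputs
}
```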
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
{Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
}
- if (ST->hasDPPBroadcasts()) {
+ if (ST.hasDPPBroadcasts()) {
// GFX9 has DPP row broadcast operations.
V = buildNonAtomicBinOp(
B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,

// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *PermX =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Combine lane 31 into lanes 32..63.
Value *const Lane31 = B.CreateIntrinsic(
AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
M, Intrinsic::amdgcn_update_dpp, AtomicTy);
- if (ST->hasDPPWavefrontShifts()) {
+ if (ST.hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
V = B.CreateCall(UpdateDPP,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
B.getInt32(16), V});

- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Copy the old lane 31 to the new lane 32.
V = B.CreateCall(
WriteLane,
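Note: buildShiftRight models a whole-wave shift of each lane's value into the next-higher lane. Where the subtarget lacks wavefront-wide DPP shifts, the row-local shift above only moves values within each 16-lane row, and the readlane/writelane pairs patch the row boundaries, e.g. copying the old lane 15 into the new lane 16. A scalar sketch of the intended end result (assumed semantics, not the emitted code):

```cpp
#include <array>
#include <cstdint>

// Model of "shift right by one lane" across a 64-lane wave (sketch only):
// lane L receives lane L-1's value; lane 0 receives the identity.
std::array<uint32_t, 64> shiftRightModel(const std::array<uint32_t, 64> &In,
                                         uint32_t Identity) {
  std::array<uint32_t, 64> Out;
  Out[0] = Identity;
  for (unsigned L = 1; L < 64; ++L)
    Out[L] = In[L - 1];
  return Out;
}
```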
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
auto *Ty = I.getType();
- auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
auto *EntryBB = I.getParent();
auto NeedResult = !I.use_empty();

@@ -698,15 +690,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
- [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+ [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);

// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
Value *V = I.getOperand(ValIdx);

// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
- Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
CallInst *const Ballot =
B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
Value *Mbcnt;
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
{Ballot, B.getInt32(0)});
} else {
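Note: the ballot/mbcnt pairing here has a simple scalar reading: ballot yields a bitmask of the active lanes, and mbcnt counts the set bits strictly below the current lane id, giving each active lane a dense index among the active lanes. A model of the assumed mbcnt semantics for a wave64 mask (not the intrinsic's implementation):

```cpp
#include <bit>
#include <cstdint>

// Count active lanes strictly below LaneId in a wave64 ballot mask (sketch).
uint32_t mbcntModel(uint64_t Ballot, unsigned LaneId) {
  uint64_t Below = (LaneId == 0) ? 0 : (~uint64_t{0} >> (64 - LaneId));
  return static_cast<uint32_t>(std::popcount(Ballot & Below));
}
```

On wave32 targets a single mbcnt_lo suffices, which is what the isWave32 branch above selects.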
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// that they can correctly contribute to the final result.
NewV =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- if (!NeedResult && ST->hasPermLaneX16()) {
+ if (!NeedResult && ST.hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
// for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// Read the value from the last lane, which has accumulated the values
// of each active lane in the wavefront. This will be our new value
// which we will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
}