Skip to content

Commit de0e91f

Browse files
committed
AMDGPU: Refactor lowering of s_barrier to split barriers
Lower non-split barriers into split barriers, and downgrade barriers based on the workgroup size, in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel, which simplifies some upcoming extensions to the code.

v2:
- turn the pass into a Module pass
- also handle the downgrading of barriers for single-wave workgroups in the IR pass
- add tests for the new pass

(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)
1 parent c984132 commit de0e91f

12 files changed

+353
-77
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
6262
ModulePass *
6363
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
6464
ModulePass *createAMDGPULowerBufferFatPointersPass();
65+
ModulePass *createAMDGPULowerIntrinsicsLegacyPass();
6566
FunctionPass *createSIModeRegisterPass();
6667
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
6768
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass
153154
const TargetMachine &TM;
154155
};
155156

157+
void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
158+
159+
struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
160+
AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {}
161+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
162+
163+
private:
164+
const AMDGPUTargetMachine &TM;
165+
};
166+
156167
void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
157168
extern char &AMDGPUPrepareAGPRAllocLegacyID;
158169

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
19891989
return selectImpl(MI, *CoverageInfo);
19901990
}
19911991

1992-
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1993-
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1994-
if (TM.getOptLevel() > CodeGenOptLevel::None) {
1995-
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1996-
if (WGSize <= STI.getWavefrontSize()) {
1997-
// If the workgroup fits in a wave, remove s_barrier_signal and lower
1998-
// s_barrier/s_barrier_wait to wave_barrier.
1999-
if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
2000-
IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
2001-
MachineBasicBlock *MBB = MI.getParent();
2002-
const DebugLoc &DL = MI.getDebugLoc();
2003-
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
2004-
}
2005-
MI.eraseFromParent();
2006-
return true;
2007-
}
2008-
}
2009-
2010-
if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
2011-
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
2012-
MachineBasicBlock *MBB = MI.getParent();
2013-
const DebugLoc &DL = MI.getDebugLoc();
2014-
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
2015-
.addImm(AMDGPU::Barrier::WORKGROUP);
2016-
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
2017-
.addImm(AMDGPU::Barrier::WORKGROUP);
2018-
MI.eraseFromParent();
2019-
return true;
2020-
}
2021-
2022-
return selectImpl(MI, *CoverageInfo);
2023-
}
2024-
20251992
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
20261993
bool &IsTexFail) {
20271994
if (TexFailCtrl)
@@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23382305
return selectDSAppendConsume(I, false);
23392306
case Intrinsic::amdgcn_init_whole_wave:
23402307
return selectInitWholeWave(I);
2341-
case Intrinsic::amdgcn_s_barrier:
2342-
case Intrinsic::amdgcn_s_barrier_signal:
2343-
case Intrinsic::amdgcn_s_barrier_wait:
2344-
return selectSBarrier(I);
23452308
case Intrinsic::amdgcn_raw_buffer_load_lds:
23462309
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
23472310
case Intrinsic::amdgcn_struct_buffer_load_lds:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
124124
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
125125
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
126126
bool selectInitWholeWave(MachineInstr &MI) const;
127-
bool selectSBarrier(MachineInstr &MI) const;
128127
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
129128

130129
bool selectImageIntrinsic(MachineInstr &MI,
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Lower intrinsics that would otherwise require separate handling in both
10+
// SelectionDAG and GlobalISel.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "AMDGPU.h"
15+
#include "AMDGPUTargetMachine.h"
16+
#include "GCNSubtarget.h"
17+
#include "llvm/IR/IRBuilder.h"
18+
#include "llvm/IR/IntrinsicInst.h"
19+
#include "llvm/IR/IntrinsicsAMDGPU.h"
20+
#include "llvm/InitializePasses.h"
21+
22+
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
23+
24+
using namespace llvm;
25+
26+
namespace {
27+
28+
class AMDGPULowerIntrinsicsImpl {
29+
public:
30+
Module &M;
31+
const AMDGPUTargetMachine &TM;
32+
33+
AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
34+
: M(M), TM(TM) {}
35+
36+
bool run();
37+
38+
private:
39+
bool visitBarrier(IntrinsicInst &I);
40+
};
41+
42+
class AMDGPULowerIntrinsicsLegacy : public ModulePass {
43+
public:
44+
static char ID;
45+
46+
AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
47+
48+
bool runOnModule(Module &M) override;
49+
50+
void getAnalysisUsage(AnalysisUsage &AU) const override {
51+
AU.addRequired<TargetPassConfig>();
52+
AU.setPreservesCFG();
53+
}
54+
};
55+
56+
template <class T> static void forEachCall(Function &Intrin, T Callback) {
57+
for (User *U : make_early_inc_range(Intrin.users())) {
58+
if (auto *CI = dyn_cast<IntrinsicInst>(U))
59+
Callback(CI);
60+
}
61+
}
62+
63+
} // anonymous namespace
64+
65+
bool AMDGPULowerIntrinsicsImpl::run() {
66+
bool Changed = false;
67+
68+
for (Function &F : M) {
69+
switch (F.getIntrinsicID()) {
70+
default:
71+
continue;
72+
case Intrinsic::amdgcn_s_barrier:
73+
case Intrinsic::amdgcn_s_barrier_signal:
74+
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
75+
case Intrinsic::amdgcn_s_barrier_wait:
76+
forEachCall(F, [&](IntrinsicInst *II) {
77+
if (visitBarrier(*II))
78+
Changed = true;
79+
});
80+
break;
81+
}
82+
}
83+
84+
return Changed;
85+
}
86+
87+
// Optimize barriers and lower s_barrier to a sequence of split barrier
88+
// intrinsics.
89+
bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
90+
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
91+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
92+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
93+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
94+
95+
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
96+
bool IsSingleWaveWG = false;
97+
98+
if (TM.getOptLevel() > CodeGenOptLevel::None) {
99+
unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
100+
IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
101+
}
102+
103+
IRBuilder<> B(&I);
104+
105+
if (IsSingleWaveWG) {
106+
// Down-grade waits, remove split signals.
107+
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
108+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
109+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
110+
} else if (I.getIntrinsicID() ==
111+
Intrinsic::amdgcn_s_barrier_signal_isfirst) {
112+
// If we're the only wave of the workgroup, we're always first.
113+
I.replaceAllUsesWith(B.getInt1(true));
114+
}
115+
I.eraseFromParent();
116+
return true;
117+
}
118+
119+
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
120+
ST.hasSplitBarriers()) {
121+
// Lower to split barriers.
122+
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
123+
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
124+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
125+
{BarrierID_32});
126+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
127+
{BarrierID_16});
128+
I.eraseFromParent();
129+
return true;
130+
}
131+
132+
return false;
133+
}
134+
135+
PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
                                                 ModuleAnalysisManager &MAM) {
  const bool Changed = AMDGPULowerIntrinsicsImpl(M, TM).run();
  if (!Changed)
    return PreservedAnalyses::all();
  // Call sites were rewritten in place; no control flow was modified.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
144+
145+
bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
  // TargetPassConfig is declared required in getAnalysisUsage, so the
  // analysis is guaranteed to be available here.
  const AMDGPUTargetMachine &TM =
      getAnalysis<TargetPassConfig>().getTM<AMDGPUTargetMachine>();
  return AMDGPULowerIntrinsicsImpl(M, TM).run();
}
152+
153+
#define PASS_DESC "AMDGPU lower intrinsics"
154+
INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
155+
false)
156+
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
157+
INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
158+
false)
159+
160+
char AMDGPULowerIntrinsicsLegacy::ID = 0;
161+
162+
ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
163+
return new AMDGPULowerIntrinsicsLegacy;
164+
}

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
2020
MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
2121
MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
2222
AMDGPULowerBufferFatPointersPass(*this))
23+
MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
2324
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
2425
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
2526
MODULE_PASS("amdgpu-perf-hint",

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
577577
initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
578578
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
579579
initializeAMDGPULowerBufferFatPointersPass(*PR);
580+
initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
580581
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
581582
initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
582583
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1418,6 +1419,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
14181419
// nodes out of the graph, which leads to function-level passes not
14191420
// being run on them, which causes crashes in the resource usage analysis).
14201421
addPass(createAMDGPULowerBufferFatPointersPass());
1422+
addPass(createAMDGPULowerIntrinsicsLegacyPass());
14211423
// In accordance with the above FIXME, manually force all the
14221424
// function-level passes into a CGSCCPassManager.
14231425
addPass(new DummyCGSCCPass());
@@ -2155,9 +2157,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
21552157
// nodes out of the graph, which leads to function-level passes not
21562158
// being run on them, which causes crashes in the resource usage analysis).
21572159
addPass(AMDGPULowerBufferFatPointersPass(TM));
2158-
21592160
addPass.requireCGSCCOrder();
21602161

2162+
addPass(AMDGPULowerIntrinsicsPass(TM));
2163+
21612164
Base::addCodeGenPrepare(addPass);
21622165

21632166
if (isPassEnabled(EnableLoadStoreVectorizer))

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen
7171
AMDGPUImageIntrinsicOptimizer.cpp
7272
AMDGPULibFunc.cpp
7373
AMDGPULowerBufferFatPointers.cpp
74+
AMDGPULowerIntrinsics.cpp
7475
AMDGPULowerKernelArguments.cpp
7576
AMDGPULowerKernelAttributes.cpp
7677
AMDGPULowerModuleLDSPass.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10421,41 +10421,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1042110421
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
1042210422
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
1042310423
}
10424-
case Intrinsic::amdgcn_s_barrier:
10425-
case Intrinsic::amdgcn_s_barrier_signal:
10426-
case Intrinsic::amdgcn_s_barrier_wait: {
10427-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10428-
if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
10429-
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
10430-
if (WGSize <= ST.getWavefrontSize()) {
10431-
// If the workgroup fits in a wave, remove s_barrier_signal and lower
10432-
// s_barrier/s_barrier_wait to wave_barrier.
10433-
if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
10434-
return Op.getOperand(0);
10435-
else
10436-
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
10437-
MVT::Other, Op.getOperand(0)),
10438-
0);
10439-
}
10440-
}
10441-
10442-
if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
10443-
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
10444-
SDValue K =
10445-
DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
10446-
SDValue BarSignal =
10447-
SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
10448-
MVT::Other, K, Op.getOperand(0)),
10449-
0);
10450-
SDValue BarWait =
10451-
SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
10452-
BarSignal.getValue(0)),
10453-
0);
10454-
return BarWait;
10455-
}
10456-
10457-
return SDValue();
10458-
};
1045910424

1046010425
case Intrinsic::amdgcn_struct_tbuffer_store:
1046110426
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {

0 commit comments

Comments
 (0)