Commit 8aa0f83

perlfumemfrob authored and committed
[AMDGPU] Move kill lowering to WQM pass and add live mask tracking
Move implementation of kill intrinsics to WQM pass. Add live lane tracking by updating a stored exec mask when lanes are killed. Use live lane tracking to enable early termination of shader at any point in control flow.

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D94746
1 parent 8fe5774 commit 8aa0f83

17 files changed: +973 -867 lines changed
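
The mechanics behind this summary can be pictured with a short sketch. This is illustrative only, not code from the commit: the helper name lowerKillSketch, the LiveMaskReg register, and the insertion point are assumptions, and the real lowering in the WQM pass also has to handle mode transitions this sketch omits.

// Sketch: a kill permanently clears the killed lanes from a stored live
// mask, then rederives exec from it. Keeping exec in sync with the live
// mask is what makes an "exec == 0 => terminate" test valid at any point
// in control flow. LiveMaskReg is an assumed SGPR (pair) initialized to
// the entry exec mask.
static void lowerKillSketch(MachineBasicBlock &MBB, MachineInstr &MI,
                            const SIInstrInfo *TII, Register LiveMaskReg,
                            Register ExecReg, bool IsWave32) {
  const DebugLoc DL = MI.getDebugLoc();
  unsigned AndN2Opc = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;

  // LiveMask &= ~KillCond: record the killed lanes for the rest of the shader.
  BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
      .addReg(LiveMaskReg)
      .add(MI.getOperand(0)); // lane mask selected by the kill condition

  // exec &= LiveMask: stop executing the killed lanes immediately.
  BuildMI(MBB, MI, DL, TII->get(AndOpc), ExecReg)
      .addReg(ExecReg)
      .addReg(LiveMaskReg);
}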

llvm/lib/Target/AMDGPU/SIInsertSkips.cpp

Lines changed: 7 additions & 228 deletions
@@ -44,19 +44,18 @@ class SIInsertSkips : public MachineFunctionPass {
   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;

-  bool dominatesAllReachable(MachineBasicBlock &MBB);
   void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
-  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                  DebugLoc DL);

-  bool kill(MachineInstr &MI);
   void earlyTerm(MachineInstr &MI);

   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

 public:
   static char ID;

+  unsigned MovOpc;
+  Register ExecReg;
+
   SIInsertSkips() : MachineFunctionPass(ID) {}

   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -138,15 +137,6 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
   return false;
 }

-/// Check whether \p MBB dominates all blocks that are reachable from it.
-bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
-  for (MachineBasicBlock *Other : depth_first(&MBB)) {
-    if (!MDT->dominates(&MBB, Other))
-      return false;
-  }
-  return true;
-}
-
 static void generateEndPgm(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
                            const SIInstrInfo *TII, bool IsPS) {
@@ -181,11 +171,8 @@ void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
   }

   if (ClearExec && !EarlyExitClearsExec) {
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     auto ExitI = EarlyExitBlock->getFirstNonPHI();
-    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
+    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0);
     EarlyExitClearsExec = true;
   }
 }
@@ -205,175 +192,6 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
   MDT->getBase().applyUpdates(DTUpdates);
 }

-/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
-/// iterator. Only applies to pixel shaders.
-void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I, DebugLoc DL) {
-  MachineFunction *MF = MBB.getParent();
-  (void)MF;
-  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
-
-  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
-  // basic block that has no further successors (e.g., there was an
-  // `unreachable` there in IR). This can happen with original source of the
-  // form:
-  //
-  //   if (uniform_condition) {
-  //     write_to_memory();
-  //     discard;
-  //   }
-  //
-  // In this case, we write the "null_export; s_endpgm" skip code in the
-  // already-existing basic block.
-  auto NextBBI = std::next(MBB.getIterator());
-  bool NoSuccessor =
-      I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
-
-  if (NoSuccessor) {
-    generateEndPgm(MBB, I, DL, TII, true);
-  } else {
-    ensureEarlyExitBlock(MBB, false);
-
-    MachineInstr *BranchMI =
-        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-            .addMBB(EarlyExitBlock);
-
-    // Split the block if the branch will not come at the end.
-    auto Next = std::next(BranchMI->getIterator());
-    if (Next != MBB.end() && !Next->isTerminator())
-      splitBlock(MBB, *BranchMI, MDT);
-
-    MBB.addSuccessor(EarlyExitBlock);
-    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
-  }
-}
-
-/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
-/// Return true unless the terminator is a no-op.
-bool SIInsertSkips::kill(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  switch (MI.getOpcode()) {
-  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
-    unsigned Opcode = 0;
-
-    // The opcodes are inverted because the inline immediate has to be
-    // the first operand, e.g. from "x < imm" to "imm > x"
-    switch (MI.getOperand(2).getImm()) {
-    case ISD::SETOEQ:
-    case ISD::SETEQ:
-      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
-      break;
-    case ISD::SETOGT:
-    case ISD::SETGT:
-      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
-      break;
-    case ISD::SETOGE:
-    case ISD::SETGE:
-      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
-      break;
-    case ISD::SETOLT:
-    case ISD::SETLT:
-      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
-      break;
-    case ISD::SETOLE:
-    case ISD::SETLE:
-      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
-      break;
-    case ISD::SETONE:
-    case ISD::SETNE:
-      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
-      break;
-    case ISD::SETO:
-      Opcode = AMDGPU::V_CMPX_O_F32_e64;
-      break;
-    case ISD::SETUO:
-      Opcode = AMDGPU::V_CMPX_U_F32_e64;
-      break;
-    case ISD::SETUEQ:
-      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
-      break;
-    case ISD::SETUGT:
-      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
-      break;
-    case ISD::SETUGE:
-      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
-      break;
-    case ISD::SETULT:
-      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
-      break;
-    case ISD::SETULE:
-      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
-      break;
-    case ISD::SETUNE:
-      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
-      break;
-    default:
-      llvm_unreachable("invalid ISD:SET cond code");
-    }
-
-    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
-    if (ST.hasNoSdstCMPX())
-      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
-
-    assert(MI.getOperand(0).isReg());
-
-    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
-                    MI.getOperand(0).getReg())) {
-      Opcode = AMDGPU::getVOPe32(Opcode);
-      BuildMI(MBB, &MI, DL, TII->get(Opcode))
-          .add(MI.getOperand(1))
-          .add(MI.getOperand(0));
-    } else {
-      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
-      if (!ST.hasNoSdstCMPX())
-        I.addReg(AMDGPU::VCC, RegState::Define);
-
-      I.addImm(0)  // src0 modifiers
-          .add(MI.getOperand(1))
-          .addImm(0)  // src1 modifiers
-          .add(MI.getOperand(0));
-
-      I.addImm(0);  // omod
-    }
-    return true;
-  }
-  case AMDGPU::SI_KILL_I1_TERMINATOR: {
-    const MachineFunction *MF = MI.getParent()->getParent();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    const MachineOperand &Op = MI.getOperand(0);
-    int64_t KillVal = MI.getOperand(1).getImm();
-    assert(KillVal == 0 || KillVal == -1);
-
-    // Kill all threads if Op0 is an immediate and equal to the Kill value.
-    if (Op.isImm()) {
-      int64_t Imm = Op.getImm();
-      assert(Imm == 0 || Imm == -1);
-
-      if (Imm == KillVal) {
-        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
-                                                     : AMDGPU::S_MOV_B64), Exec)
-            .addImm(0);
-        return true;
-      }
-      return false;
-    }
-
-    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
-    if (ST.isWave32())
-      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
-    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
-        .addReg(Exec)
-        .add(Op);
-    return true;
-  }
-  default:
-    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
-  }
-}
-
 void SIInsertSkips::earlyTerm(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc DL = MI.getDebugLoc();
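
The removed skipIfDead could only run where the killing block dominated everything reachable from it; with the live mask kept in sync with exec, the same "if exec=0 then end the shader" check becomes legal at any point in control flow. A minimal sketch of that check, assuming an early-exit block ending in the null-export/s_endpgm sequence already exists (the helper name and the omitted dominator-tree bookkeeping are illustrative, not from this commit):

// Branch to the early-exit block whenever every lane has been killed.
// Since exec is kept equal to the live mask, "exec == 0" exactly means
// "no live lanes remain".
static void insertEarlyExitTest(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, const SIInstrInfo *TII,
                                MachineBasicBlock *EarlyExitBlock) {
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
      .addMBB(EarlyExitBlock);
  MBB.addSuccessor(EarlyExitBlock); // keep the CFG consistent with the branch
}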
@@ -415,7 +233,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   MDT = &getAnalysis<MachineDominatorTree>();
   SkipThreshold = SkipThresholdFlag;

-  SmallVector<MachineInstr *, 4> KillInstrs;
+  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
   SmallVector<MachineInstr *, 4> EarlyTermInstrs;
   bool MadeChange = false;

@@ -440,41 +260,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
       }
       break;

-    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
-    case AMDGPU::SI_KILL_I1_TERMINATOR: {
-      MadeChange = true;
-      bool CanKill = kill(MI);
-
-      // Check if we can add an early "if exec=0 { end shader }".
-      //
-      // Note that we _always_ do this if it is correct, even if the kill
-      // happens fairly late in the shader, because the null export should
-      // generally still be cheaper than normal export(s).
-      //
-      // TODO: The dominatesAllReachable check is conservative: if the
-      // dominance is only missing due to _uniform_ branches, we could
-      // in fact insert the early-exit as well.
-      if (CanKill &&
-          MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-          dominatesAllReachable(MBB)) {
-        // Mark the instruction for kill-if-dead insertion. We delay this
-        // change because it modifies the CFG.
-        KillInstrs.push_back(&MI);
-      } else {
-        MI.eraseFromParent();
-      }
-      break;
-    }
-
-    case AMDGPU::SI_KILL_CLEANUP:
-      if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-          dominatesAllReachable(MBB)) {
-        KillInstrs.push_back(&MI);
-      } else {
-        MI.eraseFromParent();
-      }
-      break;
-
     case AMDGPU::SI_EARLY_TERMINATE_SCC0:
       EarlyTermInstrs.push_back(&MI);
       break;
@@ -491,12 +276,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
     earlyTerm(*Instr);
     Instr->eraseFromParent();
   }
-  for (MachineInstr *Kill : KillInstrs) {
-    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
-               Kill->getDebugLoc());
-    Kill->eraseFromParent();
-  }
-  KillInstrs.clear();
   EarlyTermInstrs.clear();
   EarlyExitBlock = nullptr;


llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 14 additions & 0 deletions
@@ -1641,6 +1641,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
     break;

+  case AMDGPU::S_AND_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B64));
+    break;
+
+  case AMDGPU::S_AND_B32_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B32));
+    break;
+
   case AMDGPU::V_MOV_B64_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2272,10 +2284,12 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   case AMDGPU::S_XOR_B64_term:
   case AMDGPU::S_OR_B64_term:
   case AMDGPU::S_ANDN2_B64_term:
+  case AMDGPU::S_AND_B64_term:
   case AMDGPU::S_MOV_B32_term:
   case AMDGPU::S_XOR_B32_term:
   case AMDGPU::S_OR_B32_term:
   case AMDGPU::S_ANDN2_B32_term:
+  case AMDGPU::S_AND_B32_term:
     break;
   case AMDGPU::SI_IF:
   case AMDGPU::SI_ELSE:
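
The new S_AND_B64_term / S_AND_B32_term opcodes follow the established *_term pattern: an exec-mask update at the end of a block is modeled as a terminator so the register allocator places spill code before it, and expandPostRAPseudo (above) later rewrites it to the plain opcode. A hypothetical emission site, sketched under the assumption of a wave64 target and a LiveMaskReg maintained by the WQM pass:

// Emit "exec &= LiveMask" as a block terminator. Until it is expanded after
// register allocation, analyzeBranch treats it like the other *_term opcodes.
static MachineInstr *emitExecAndTerm(MachineBasicBlock &MBB,
                                     const DebugLoc &DL,
                                     const SIInstrInfo *TII,
                                     Register LiveMaskReg) {
  return BuildMI(MBB, MBB.getFirstTerminator(), DL,
                 TII->get(AMDGPU::S_AND_B64_term), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(LiveMaskReg);
}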

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 5 additions & 5 deletions
@@ -232,13 +232,15 @@ def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
 }

 let WaveSizePredicate = isWave32 in {
 def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
 def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
 }

@@ -339,24 +341,22 @@ multiclass PseudoInstKill <dag ins> {
   // required in degenerate cases (when V_CMPX cannot be used due to constant
   // bus limitations) and because it allows us to avoid having to track SCC
   // liveness across basic blocks.
-  let Defs = [EXEC,VCC,SCC] in
+  let Defs = [EXEC,SCC] in
   def _PSEUDO : PseudoInstSI <(outs), ins> {
     let isConvergent = 1;
     let usesCustomInserter = 1;
   }

-  let Defs = [EXEC,VCC,SCC] in
+  let Defs = [EXEC,SCC] in
   def _TERMINATOR : SPseudoInstSI <(outs), ins> {
     let isTerminator = 1;
   }
 }

 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
+let Defs = [VCC] in
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
