Skip to content

Commit d458859

Browse files
authored
Fold operands across basic blocks (llvm#2529)
2 parents 34d8f4e + 79b90f4 commit d458859

File tree

4 files changed

+172
-16
lines changed

4 files changed

+172
-16
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
#define DEBUG_TYPE "si-fold-operands"
2424
using namespace llvm;
2525

26+
static cl::opt<int> SIFoldOperandsPreheaderThreshold(
27+
"amdgpu-si-fold-operands-preheader-threshold", cl::init(1000),
28+
cl::desc("Threshold for operand folding hazard check. "
29+
"Defaults to 1000 MIs, upper limit 10000."));
30+
2631
namespace {
2732

2833
struct FoldCandidate {
@@ -1253,10 +1258,9 @@ void SIFoldOperandsImpl::foldOperand(
12531258
}
12541259

12551260
if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1256-
if (execMayBeModifiedBeforeUse(*MRI,
1257-
UseMI->getOperand(UseOpIdx).getReg(),
1258-
*OpToFold.getParent(),
1259-
*UseMI))
1261+
if (checkIfExecMayBeModifiedBeforeUseAcrossBB(
1262+
*MRI, UseMI->getOperand(UseOpIdx).getReg(),
1263+
*OpToFold.getParent(), *UseMI, SIFoldOperandsPreheaderThreshold))
12601264
return;
12611265

12621266
// %vgpr = COPY %sgpr0

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9812,6 +9812,82 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
98129812
return nullptr;
98139813
}
98149814

9815+
// Helper for checkIfExecMayBeModifiedBeforeUseAcrossBB and
9816+
// execMayBeModifiedBeforeUse. Returns true if EXEC may be modified in the
9817+
// straight-line range [BeginIterator, EndIterator) (EndIterator excluded), or
9818+
// if more than MaxInstScan non-meta instructions would have to be scanned.
9819+
bool execMayBeModifiedBeforeUseUtil(
9820+
const TargetRegisterInfo *TRI,
9821+
const MachineInstrBundleIterator<const MachineInstr> BeginIterator,
9822+
const MachineInstrBundleIterator<const MachineInstr> EndIterator,
9823+
const int MaxInstScan) {
9824+
9825+
int NumInst = 0;
9826+
for (auto I = BeginIterator; I != EndIterator; ++I) {
9827+
if (I->isMetaInstruction())
9828+
continue;
9829+
9830+
if (++NumInst > MaxInstScan)
9831+
return true;
9832+
9833+
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9834+
return true;
9835+
}
9836+
return false;
9837+
}
9838+
9839+
// Variant of execMayBeModifiedBeforeUse() that also handles DefMI and UseMI in
9840+
// different basic blocks. Only one cross-block shape is supported: DefBB's sole
9841+
// successor is UseBB, and UseBB is a single-block loop. Otherwise returns true.
9842+
bool llvm::checkIfExecMayBeModifiedBeforeUseAcrossBB(
9843+
const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
9844+
const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold) {
9845+
9846+
assert(MRI.isSSA() && "Must be run on SSA");
9847+
auto *TRI = MRI.getTargetRegisterInfo();
9848+
auto *DefBB = DefMI.getParent();
9849+
const int MaxInstScan = (SIFoldOperandsPreheaderThreshold > 10000)
9850+
? 10000
9851+
: SIFoldOperandsPreheaderThreshold;
9852+
9853+
// Check whether EXEC may be modified on any control-flow path between DefMI
9854+
// and UseMI, which may include the loop backedge. For the cross-block case:
9855+
// 1. UseBB must be the only successor of DefBB.
9856+
// 2. UseBB must be a single-basic-block loop (exactly two predecessors:
9857+
// DefBB and UseBB itself).
9858+
// 3. EXEC must not be modified anywhere in UseBB or after DefMI in DefBB.
9859+
auto *UseBB = UseMI.getParent();
9860+
if (UseBB != DefBB) {
9861+
if (!(DefBB->isSuccessor(UseBB) && (DefBB->succ_size() == 1)))
9862+
return true;
9863+
9864+
if (!((UseBB->pred_size() == 2) && UseBB->isPredecessor(UseBB) &&
9865+
UseBB->isPredecessor(DefBB)))
9866+
return true;
9867+
9868+
bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
9869+
TRI, UseBB->begin(), UseBB->end(), MaxInstScan);
9870+
if (canExecBeModifiedBeforeUse)
9871+
return true;
9872+
9873+
// Also scan the tail of the def block: from the instruction after DefMI to
9874+
// the end of DefBB. EXEC must be unmodified there too, since that code runs
9875+
// between DefMI and the first entry into UseBB.
9876+
canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
9877+
TRI, std::next(DefMI.getIterator()), DefBB->end(), MaxInstScan);
9878+
if (canExecBeModifiedBeforeUse)
9879+
return true;
9880+
9881+
} else {
9882+
// Same block: scan the instructions strictly between DefMI and UseMI.
9883+
bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
9884+
TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
9885+
if (canExecBeModifiedBeforeUse)
9886+
return true;
9887+
}
9888+
return false;
9889+
}
9890+
98159891
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
98169892
Register VReg,
98179893
const MachineInstr &DefMI,
@@ -9830,17 +9906,10 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
98309906
int NumInst = 0;
98319907

98329908
// Stop scan at the use.
9833-
auto E = UseMI.getIterator();
9834-
for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9835-
if (I->isDebugInstr())
9836-
continue;
9837-
9838-
if (++NumInst > MaxInstScan)
9839-
return true;
9840-
9841-
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9842-
return true;
9843-
}
9909+
bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
9910+
TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
9911+
if (canExecBeModifiedBeforeUse)
9912+
return true;
98449913

98459914
return false;
98469915
}
@@ -9877,7 +9946,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
98779946
for (auto I = std::next(DefMI.getIterator()); ; ++I) {
98789947
assert(I != DefBB->end());
98799948

9880-
if (I->isDebugInstr())
9949+
if (I->isMetaInstruction())
98819950
continue;
98829951

98839952
if (++NumInst > MaxInstScan)

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,6 +1563,10 @@ bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
15631563
const MachineInstr &DefMI,
15641564
const MachineInstr &UseMI);
15651565

1566+
bool checkIfExecMayBeModifiedBeforeUseAcrossBB(
1567+
const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
1568+
const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold);
1569+
15661570
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
15671571
/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
15681572
/// track between blocks.
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK1 %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-si-fold-operands-preheader-threshold=10 < %s | FileCheck --check-prefix=CHECK2 %s
3+
4+
define protected amdgpu_kernel void @main(ptr addrspace(1) noundef %args.coerce, ptr addrspace(1) noundef %args.coerce2, ptr addrspace(1) noundef %args.coerce4, i32 noundef %args12) {
5+
; CHECK1-LABEL: main:
6+
; check that non-redundant readfirstlanes are not removed
7+
; CHECK1: v_readfirstlane_b32
8+
; check that all redundant readfirstlanes are removed
9+
; CHECK1-NOT: v_readfirstlane_b32
10+
; CHECK1: s_endpgm
11+
12+
; CHECK2-LABEL: main:
13+
; CHECK2: v_readfirstlane_b32
14+
; check that all redundant readfirstlanes across basic blocks persist
15+
; CHECK2: v_readfirstlane_b32
16+
; CHECK2: v_readfirstlane_b32
17+
; CHECK2: s_endpgm
18+
entry:
19+
%wid = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
20+
%div1 = lshr i32 %wid, 6
21+
%rfl1 = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %div1)
22+
%sub1 = add nsw i32 %args12, 1023
23+
%div2 = sdiv i32 %sub1, 1024
24+
%rfl2 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %div2)
25+
%cmp24.i = icmp sgt i32 %rfl2, 0
26+
br i1 %cmp24.i, label %for.body.lr.ph.i, label %add.exit
27+
28+
for.body.lr.ph.i: ; preds = %entry
29+
%pti1 = ptrtoint ptr addrspace(1) %args.coerce4 to i64
30+
%pti2 = ptrtoint ptr addrspace(1) %args.coerce2 to i64
31+
%pti3 = ptrtoint ptr addrspace(1) %args.coerce to i64
32+
%lshr1 = lshr i32 %rfl1, 2
33+
%mbl = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
34+
%mbh = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbl)
35+
%lshr2 = lshr i32 %mbh, 6
36+
%add8 = add i32 %lshr1, %lshr2
37+
%sub3 = shl i32 %rfl1, 8
38+
%mul2 = and i32 %sub3, 768
39+
%add1 = or disjoint i32 %mbh, %mul2
40+
%add3 = add nsw i32 %add1, %add8
41+
%sext1 = add i64 4294967296, 4611686014132420608
42+
%conv1 = lshr exact i64 64, 32
43+
%add4 = add nuw nsw i64 %conv1, 1
44+
%zext2 = zext i32 1 to i64
45+
%tmp.sroa = add nuw nsw i64 %zext2, 4294967295
46+
%sub5 = add i64 %tmp.sroa, 4294967296
47+
%sext2 = mul i64 %sub5, 4294967296
48+
%conv2 = lshr exact i64 %sext2, 32
49+
%add5 = add nuw nsw i64 %add4, %conv2
50+
%conv3 = trunc i64 %add5 to i32
51+
%mul4 = shl i32 %conv3, 2
52+
%bc1 = bitcast i64 %pti3 to <2 x i32>
53+
%ee1 = extractelement <2 x i32> %bc1, i64 0
54+
%ee2 = extractelement <2 x i32> %bc1, i64 1
55+
br label %for.body.i
56+
57+
for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i
58+
%loopi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc.i, %for.body.i ]
59+
%tmp1 = phi i32 [ %add3, %for.body.lr.ph.i ], [ %cnt, %for.body.i ]
60+
%rfl3 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee1)
61+
%rfl4 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee2)
62+
%rfl5 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %mul4)
63+
%ie1 = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 131072>, i32 %rfl3, i64 0
64+
%ie2 = insertelement <4 x i32> %ie1, i32 %rfl4, i64 1
65+
%ie3 = insertelement <4 x i32> %ie2, i32 %rfl5, i64 2
66+
%mul5 = shl i32 %tmp1, 2
67+
%buffload1 = tail call contract noundef <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> noundef %ie2, i32 noundef %mul5, i32 noundef 0, i32 noundef 0) #6
68+
%add6 = add nsw i32 %tmp1, 1
69+
%buffload3 = tail call contract noundef <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> noundef %ie3, i32 noundef %mul5, i32 noundef 0, i32 noundef 0) #6
70+
%vec_add1 = fadd contract <4 x float> %buffload1, %buffload3
71+
tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> noundef %vec_add1, <4 x i32> noundef %ie3, i32 noundef %mul5, i32 noundef 0, i32 noundef 0) #6
72+
%cnt = add nsw i32 %tmp1, 1024
73+
%inc.i = add nuw nsw i32 %loopi, 1
74+
%exitcond.not.i = icmp eq i32 %inc.i, %rfl2
75+
br i1 %exitcond.not.i, label %add.exit, label %for.body.i
76+
77+
add.exit: ; preds = %for.body.i, %entry
78+
ret void
79+
}

0 commit comments

Comments
 (0)