Skip to content

Commit 2895d7f

Browse files
authored
[AMDGPU] Revert Old optimization to fold READFIRSTLANEs across BBs (#446)
2 parents 9045866 + 46dc274 commit 2895d7f

File tree

4 files changed

+21
-105
lines changed

4 files changed

+21
-105
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,6 @@
2323
#define DEBUG_TYPE "si-fold-operands"
2424
using namespace llvm;
2525

26-
static cl::opt<int> SIFoldOperandsPreheaderThreshold(
27-
"amdgpu-si-fold-operands-preheader-threshold", cl::init(1000),
28-
cl::desc("Threshold for operand folding hazard check. "
29-
"Defaults to 1000 MIs, upper limit 10000."));
30-
3126
namespace {
3227

3328
/// Track a value we may want to fold into downstream users, applying
@@ -1458,9 +1453,9 @@ void SIFoldOperandsImpl::foldOperand(
14581453
}
14591454

14601455
if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1461-
if (checkIfExecMayBeModifiedBeforeUseAcrossBB(
1456+
if (execMayBeModifiedBeforeUse(
14621457
*MRI, UseMI->getOperand(UseOpIdx).getReg(),
1463-
*OpToFold.DefMI, *UseMI, SIFoldOperandsPreheaderThreshold))
1458+
*OpToFold.DefMI, *UseMI))
14641459
return;
14651460

14661461
// %vgpr = COPY %sgpr0

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 13 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -10246,82 +10246,6 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
1024610246
return nullptr;
1024710247
}
1024810248

10249-
// helper function to checkIfExecMayBeModifiedBeforeUseAcrossBB and
10250-
// execMayBeModifiedBeforeUse. This checks possible EXEC register modifications
10251-
// for a straight-line sequence of instructions between BeginIterator and
10252-
// EndIterator (both inclusive) up to a pre-defined limit MaxInstScan
10253-
bool execMayBeModifiedBeforeUseUtil(
10254-
const TargetRegisterInfo *TRI,
10255-
const MachineInstrBundleIterator<const MachineInstr> BeginIterator,
10256-
const MachineInstrBundleIterator<const MachineInstr> EndIterator,
10257-
const int MaxInstScan) {
10258-
10259-
int NumInst = 0;
10260-
for (auto I = BeginIterator; I != EndIterator; ++I) {
10261-
if (I->isMetaInstruction())
10262-
continue;
10263-
10264-
if (++NumInst > MaxInstScan)
10265-
return true;
10266-
10267-
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10268-
return true;
10269-
}
10270-
return false;
10271-
}
10272-
10273-
// Variant of execMayBeModifiedBeforeUse(), where DefMI and UseMI belong to
10274-
// different basic blocks. Current code is limited to a very simple case: DefMI
10275-
// in the predecessor BB of the single BB loop where UseMI resides.
10276-
bool llvm::checkIfExecMayBeModifiedBeforeUseAcrossBB(
10277-
const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
10278-
const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold) {
10279-
10280-
assert(MRI.isSSA() && "Must be run on SSA");
10281-
auto *TRI = MRI.getTargetRegisterInfo();
10282-
auto *DefBB = DefMI.getParent();
10283-
const int MaxInstScan = (SIFoldOperandsPreheaderThreshold > 10000)
10284-
? 10000
10285-
: SIFoldOperandsPreheaderThreshold;
10286-
10287-
// Check whether EXEC is modified along all possible control flow between
10288-
// DefMI and UseMI, which may include loop backedge
10289-
// 1. UseBB is the only successor of DefBB
10290-
// 2. UseBB is a single basic block loop (only two predecessor blocks: DefBB
10291-
// and UseBB)
10292-
// 3. check if EXEC is modified
10293-
auto *UseBB = UseMI.getParent();
10294-
if (UseBB != DefBB) {
10295-
if (!(DefBB->isSuccessor(UseBB) && (DefBB->succ_size() == 1)))
10296-
return true;
10297-
10298-
if (!((UseBB->pred_size() == 2) && UseBB->isPredecessor(UseBB) &&
10299-
UseBB->isPredecessor(DefBB)))
10300-
return true;
10301-
10302-
bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
10303-
TRI, UseBB->begin(), UseBB->end(), MaxInstScan);
10304-
if (canExecBeModifiedBeforeUse)
10305-
return true;
10306-
10307-
// Stop scan at the end of the DEF basic block.
10308-
// If we are here, we know for sure that the instructions in focus are in
10309-
// the same basic block. Scan them to be safe.
10310-
canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
10311-
TRI, std::next(DefMI.getIterator()), DefBB->end(), MaxInstScan);
10312-
if (canExecBeModifiedBeforeUse)
10313-
return true;
10314-
10315-
} else {
10316-
// Stop scan at the use.
10317-
bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
10318-
TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
10319-
if (canExecBeModifiedBeforeUse)
10320-
return true;
10321-
}
10322-
return false;
10323-
}
10324-
1032510249
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
1032610250
Register VReg,
1032710251
const MachineInstr &DefMI,
@@ -10337,12 +10261,20 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
1033710261
return true;
1033810262

1033910263
const int MaxInstScan = 20;
10264+
int NumInst = 0;
1034010265

1034110266
// Stop scan at the use.
10342-
bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
10343-
TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
10344-
if (canExecBeModifiedBeforeUse)
10345-
return true;
10267+
auto E = UseMI.getIterator();
10268+
for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10269+
if (I->isDebugInstr())
10270+
continue;
10271+
10272+
if (++NumInst > MaxInstScan)
10273+
return true;
10274+
10275+
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10276+
return true;
10277+
}
1034610278

1034710279
return false;
1034810280
}
@@ -10379,7 +10311,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
1037910311
for (auto I = std::next(DefMI.getIterator()); ; ++I) {
1038010312
assert(I != DefBB->end());
1038110313

10382-
if (I->isMetaInstruction())
10314+
if (I->isDebugInstr())
1038310315
continue;
1038410316

1038510317
if (++NumInst > MaxInstScan)

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1714,10 +1714,6 @@ bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
17141714
const MachineInstr &DefMI,
17151715
const MachineInstr &UseMI);
17161716

1717-
bool checkIfExecMayBeModifiedBeforeUseAcrossBB(
1718-
const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
1719-
const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold);
1720-
17211717
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
17221718
/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
17231719
/// track between blocks.

llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,13 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK1 %s
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-si-fold-operands-preheader-threshold=10 < %s | FileCheck --check-prefix=CHECK2 %s
3-
; XFAIL: *
1+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK %s
2+
43
define protected amdgpu_kernel void @main(ptr addrspace(1) noundef %args.coerce, ptr addrspace(1) noundef %args.coerce2, ptr addrspace(1) noundef %args.coerce4, i32 noundef %args12) {
5-
; CHECK1-LABEL: main:
4+
; CHECK-LABEL: main:
65
; check that non-redundant readfirstlanes are not removed
7-
; CHECK1: v_readfirstlane_b32
6+
; CHECK: v_readfirstlane_b32
87
; check that all redundant readfirstlanes are removed
9-
; CHECK1-NOT: v_readfirstlane_b32
10-
; CHECK1: s_endpgm
8+
; CHECK-NOT: v_readfirstlane_b32
9+
; CHECK: s_endpgm
1110

12-
; CHECK2-LABEL: main:
13-
; CHECK2: v_readfirstlane_b32
14-
; check that all redundant readfirstlanes across basic blocks persist
15-
; CHECK2: v_readfirstlane_b32
16-
; CHECK2: v_readfirstlane_b32
17-
; CHECK2: s_endpgm
1811
entry:
1912
%wid = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
2013
%div1 = lshr i32 %wid, 6

0 commit comments

Comments
 (0)