Skip to content

Commit 44466c2

Browse files
committed
[AMDGPU][SIInsertWaitCnts] use simplifyWaitcnt code path
1 parent cca7a3e commit 44466c2

File tree

1 file changed

+21
-30
lines changed

1 file changed

+21
-30
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -633,8 +633,11 @@ class WaitcntBrackets {
633633
const MachineOperand &Op) const;
634634

635635
bool counterOutOfOrder(InstCounterType T) const;
636-
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
636+
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
637637
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
638+
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
639+
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
640+
void simplifyXcnt(AMDGPU::Waitcnt &Wait);
638641

639642
void determineWait(InstCounterType T, RegInterval Interval,
640643
AMDGPU::Waitcnt &Wait) const;
@@ -646,9 +649,6 @@ class WaitcntBrackets {
646649

647650
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
648651
void applyWaitcnt(InstCounterType T, unsigned Count);
649-
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
650-
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
651-
void applyXcnt(const AMDGPU::Waitcnt &Wait);
652652
void updateByEvent(WaitEventType E, MachineInstr &MI);
653653

654654
unsigned hasPendingEvent() const { return PendingEvents; }
@@ -1194,15 +1194,15 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11941194

11951195
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
11961196
/// the simplified counts through \p Wait (the function itself returns void).
1197-
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1197+
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
11981198
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
11991199
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
12001200
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
12011201
simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
12021202
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
12031203
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
12041204
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1205-
simplifyWaitcnt(X_CNT, Wait.XCnt);
1205+
simplifyXcnt(Wait);
12061206
}
12071207

12081208
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1272,7 +1272,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
12721272
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
12731273
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
12741274
applyWaitcnt(KM_CNT, Wait.KmCnt);
1275-
applyXcnt(Wait);
1275+
applyWaitcnt(X_CNT, Wait.XCnt);
12761276
}
12771277

12781278
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1304,7 +1304,11 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
13041304
!hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
13051305
}
13061306

1307-
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1307+
void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &Wait) {
1308+
// Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1309+
// optimizations. On entry to a block with multiple predecessors, there may
1310+
// be pending SMEM and VMEM events active at the same time.
1311+
// In such cases, only clear one active event at a time.
13081312
if (hasRedundantXCntWithKmCnt(Wait)) {
13091313
if (hasPendingEvent(VMEM_GROUP)) {
13101314
// Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
@@ -1313,15 +1317,10 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
13131317
} else {
13141318
applyWaitcnt(X_CNT, 0);
13151319
}
1316-
return;
1317-
}
1318-
if (canOptimizeXCntWithLoadCnt(Wait)) {
1319-
// On entry to a block with multiple predecessors, there may
1320-
// be pending SMEM and VMEM events active at the same time.
1321-
// In such cases, only clear one active event at a time.
1322-
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1320+
} else if (canOptimizeXCntWithLoadCnt(Wait)) {
1321+
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
13231322
}
1324-
applyWaitcnt(X_CNT, Wait.XCnt);
1323+
simplifyWaitcnt(X_CNT, Wait.XCnt);
13251324
}
13261325

13271326
// Where there are multiple types of event in the bracket of a counter,
@@ -1753,7 +1752,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
17531752
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
17541753
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
17551754
// due to taking the backedge of a block.
1756-
ScoreBrackets.applyXcnt(PreCombine);
1755+
ScoreBrackets.simplifyXcnt(PreCombine);
17571756
}
17581757
if (!WaitInstrs[CT])
17591758
continue;
@@ -2169,19 +2168,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21692168
<< "Update Instr: " << *It);
21702169
}
21712170

2172-
// XCnt may be already consumed by a load wait.
2173-
if (Wait.XCnt != ~0u) {
2174-
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
2175-
Wait.XCnt = ~0u;
2176-
2177-
if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2178-
Wait.XCnt = ~0u;
2179-
2180-
// Since the translation for VMEM addresses occurs in-order, we can skip the
2181-
// XCnt if the current instruction is of VMEM type and has a memory
2182-
// dependency with another VMEM instruction in flight.
2183-
if (isVmemAccess(*It))
2184-
Wait.XCnt = ~0u;
2171+
// Since the translation for VMEM addresses occurs in-order, we can skip the
2172+
// XCnt if the current instruction is of VMEM type and has a memory
2173+
// dependency with another VMEM instruction in flight.
2174+
if (Wait.XCnt != ~0u && isVmemAccess(*It)) {
2175+
Wait.XCnt = ~0u;
21852176
}
21862177

21872178
if (WCG->createNewWaitcnt(Block, It, Wait))

0 commit comments

Comments
 (0)