Skip to content

Commit 4a7ca8c

Browse files
kerbowamemfrob
authored and committed
[AMDGPU] Revise handling of preexisting waitcnt
Preexisting waitcnt may not update the scoreboard if the instruction being examined needed to wait on fewer counters than what was encoded in the old waitcnt instruction. Fixing this results in the elimination of some redundant waitcnt. These changes also enable combining consecutive waitcnt into a single S_WAITCNT or S_WAITCNT_VSCNT instruction. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D100281
1 parent 599d2a3 commit 4a7ca8c

14 files changed

+469
-253
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 134 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -245,8 +245,8 @@ class WaitcntBrackets {
245245
const SIRegisterInfo *TRI, unsigned OpNo) const;
246246

247247
bool counterOutOfOrder(InstCounterType T) const;
248-
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
249-
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
248+
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
249+
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
250250
void determineWait(InstCounterType T, unsigned ScoreToWait,
251251
AMDGPU::Waitcnt &Wait) const;
252252
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
@@ -418,7 +418,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
418418
}
419419

420420
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
421-
DebugCounter::shouldExecute(ForceLgkmCounter)) {
421+
DebugCounter::shouldExecute(ForceLgkmCounter)) {
422422
ForceEmitWaitcnt[LGKM_CNT] = true;
423423
} else {
424424
ForceEmitWaitcnt[LGKM_CNT] = false;
@@ -442,6 +442,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
442442
WaitcntBrackets *ScoreBrackets);
443443
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
444444
WaitcntBrackets &ScoreBrackets);
445+
bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
446+
MachineInstr &OldWaitcntInstr,
447+
AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
445448
};
446449

447450
} // end anonymous namespace
@@ -708,22 +711,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
708711

709712
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
710713
/// whether a waitcnt instruction is needed at all.
711-
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
712-
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
713-
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
714-
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
715-
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
714+
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
715+
simplifyWaitcnt(VM_CNT, Wait.VmCnt);
716+
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
717+
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
718+
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
716719
}
717720

718-
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
721+
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
719722
unsigned &Count) const {
720723
const unsigned LB = getScoreLB(T);
721724
const unsigned UB = getScoreUB(T);
722-
if (Count < UB && UB - Count > LB)
723-
return true;
724725

725-
Count = ~0u;
726-
return false;
726+
// The number of outstanding events for this type, T, can be calculated
727+
// as (UB - LB). If the current Count is greater than or equal to the number
728+
// of outstanding events, then the wait for this counter is redundant.
729+
if (Count >= UB - LB)
730+
Count = ~0u;
727731
}
728732

729733
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
@@ -798,6 +802,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
798802
return new SIInsertWaitcnts();
799803
}
800804

805+
/// Combine consecutive waitcnt instructions that precede \p MI and follow
806+
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
807+
/// by previous passes. Currently this pass conservatively assumes that these
808+
/// preexisting waitcnt are required for correctness.
809+
bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
810+
MachineInstr &OldWaitcntInstr,
811+
AMDGPU::Waitcnt &Wait,
812+
const MachineInstr *MI) {
813+
bool Modified = false;
814+
MachineInstr *WaitcntInstr = nullptr;
815+
MachineInstr *WaitcntVsCntInstr = nullptr;
816+
for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
817+
&*II != MI; II = NextI, ++NextI) {
818+
if (II->isMetaInstruction())
819+
continue;
820+
821+
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
822+
// Conservatively update required wait if this waitcnt was added in an
823+
// earlier pass. In this case it will not exist in the tracked waitcnt
824+
// set.
825+
if (!TrackedWaitcntSet.count(&*II)) {
826+
unsigned IEnc = II->getOperand(0).getImm();
827+
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
828+
Wait = Wait.combined(OldWait);
829+
}
830+
831+
// Merge consecutive waitcnt of the same type by erasing multiples.
832+
if (!WaitcntInstr) {
833+
WaitcntInstr = &*II;
834+
} else {
835+
II->eraseFromParent();
836+
Modified = true;
837+
}
838+
839+
} else {
840+
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
841+
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
842+
if (!TrackedWaitcntSet.count(&*II)) {
843+
unsigned OldVSCnt =
844+
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
845+
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
846+
}
847+
848+
if (!WaitcntVsCntInstr) {
849+
WaitcntVsCntInstr = &*II;
850+
} else {
851+
II->eraseFromParent();
852+
Modified = true;
853+
}
854+
}
855+
}
856+
857+
// Updated encoding of merged waitcnt with the required wait.
858+
if (WaitcntInstr) {
859+
if (Wait.hasWaitExceptVsCnt()) {
860+
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
861+
unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
862+
if (OldEnc != NewEnc) {
863+
WaitcntInstr->getOperand(0).setImm(NewEnc);
864+
Modified = true;
865+
}
866+
ScoreBrackets.applyWaitcnt(Wait);
867+
Wait.VmCnt = ~0u;
868+
Wait.LgkmCnt = ~0u;
869+
Wait.ExpCnt = ~0u;
870+
871+
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
872+
<< "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
873+
<< '\n');
874+
} else {
875+
WaitcntInstr->eraseFromParent();
876+
Modified = true;
877+
}
878+
}
879+
880+
if (WaitcntVsCntInstr) {
881+
if (Wait.hasWaitVsCnt()) {
882+
assert(ST->hasVscnt());
883+
unsigned OldVSCnt =
884+
TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
885+
->getImm();
886+
if (Wait.VsCnt != OldVSCnt) {
887+
TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
888+
->setImm(Wait.VsCnt);
889+
Modified = true;
890+
}
891+
ScoreBrackets.applyWaitcnt(Wait);
892+
Wait.VsCnt = ~0u;
893+
894+
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
895+
<< "Old Instr: " << MI
896+
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
897+
} else {
898+
WaitcntVsCntInstr->eraseFromParent();
899+
Modified = true;
900+
}
901+
}
902+
903+
return Modified;
904+
}
905+
801906
static bool readsVCCZ(const MachineInstr &MI) {
802907
unsigned Opc = MI.getOpcode();
803908
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -833,12 +938,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
833938
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
834939
MachineInstr *OldWaitcntInstr) {
835940
setForceEmitWaitcnt();
836-
bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
837941

838942
if (MI.isMetaInstruction())
839943
return false;
840944

841945
AMDGPU::Waitcnt Wait;
946+
bool Modified = false;
842947

843948
// See if this instruction has a forced S_WAITCNT VM.
844949
// TODO: Handle other cases of NeedsWaitcntVmBefore()
@@ -1053,32 +1158,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
10531158
}
10541159
}
10551160

1056-
// Early-out if no wait is indicated.
1057-
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1058-
bool Modified = false;
1059-
if (OldWaitcntInstr) {
1060-
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1061-
&*II != &MI; II = NextI, ++NextI) {
1062-
if (II->isDebugInstr())
1063-
continue;
1064-
1065-
if (TrackedWaitcntSet.count(&*II)) {
1066-
TrackedWaitcntSet.erase(&*II);
1067-
II->eraseFromParent();
1068-
Modified = true;
1069-
} else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1070-
int64_t Imm = II->getOperand(0).getImm();
1071-
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1072-
} else {
1073-
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1074-
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1075-
auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
1076-
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
1077-
}
1078-
}
1079-
}
1080-
return Modified;
1081-
}
1161+
// Verify that the wait is actually needed.
1162+
ScoreBrackets.simplifyWaitcnt(Wait);
10821163

10831164
if (ForceEmitZeroWaitcnts)
10841165
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
@@ -1092,57 +1173,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
10921173
if (ForceEmitWaitcnt[VS_CNT])
10931174
Wait.VsCnt = 0;
10941175

1095-
ScoreBrackets.applyWaitcnt(Wait);
1096-
1097-
AMDGPU::Waitcnt OldWait;
1098-
bool Modified = false;
1099-
11001176
if (OldWaitcntInstr) {
1101-
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1102-
&*II != &MI; II = NextI, NextI++) {
1103-
if (II->isDebugInstr())
1104-
continue;
1105-
1106-
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1107-
unsigned IEnc = II->getOperand(0).getImm();
1108-
AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1109-
OldWait = OldWait.combined(IWait);
1110-
if (!TrackedWaitcntSet.count(&*II))
1111-
Wait = Wait.combined(IWait);
1112-
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
1113-
if (IEnc != NewEnc) {
1114-
II->getOperand(0).setImm(NewEnc);
1115-
Modified = true;
1116-
}
1117-
Wait.VmCnt = ~0u;
1118-
Wait.LgkmCnt = ~0u;
1119-
Wait.ExpCnt = ~0u;
1120-
} else {
1121-
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1122-
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1123-
1124-
unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
1125-
->getImm();
1126-
OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
1127-
if (!TrackedWaitcntSet.count(&*II))
1128-
Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
1129-
if (Wait.VsCnt != ICnt) {
1130-
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
1131-
Modified = true;
1132-
}
1133-
Wait.VsCnt = ~0u;
1134-
}
1135-
1136-
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1137-
<< "Old Instr: " << MI
1138-
<< "New Instr: " << *II << '\n');
1139-
1140-
if (!Wait.hasWait())
1141-
return Modified;
1142-
}
1177+
// Try to merge the required wait with preexisting waitcnt instructions.
1178+
// Also erase redundant waitcnt.
1179+
Modified =
1180+
applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
1181+
} else {
1182+
// Update waitcnt brackets after determining the required wait.
1183+
ScoreBrackets.applyWaitcnt(Wait);
11431184
}
11441185

1145-
if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
1186+
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
1187+
// instruction was modified to handle the required wait.
1188+
if (Wait.hasWaitExceptVsCnt()) {
11461189
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
11471190
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
11481191
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
@@ -1155,7 +1198,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
11551198
<< "New Instr: " << *SWaitInst << '\n');
11561199
}
11571200

1158-
if (Wait.VsCnt != ~0u) {
1201+
if (Wait.hasWaitVsCnt()) {
11591202
assert(ST->hasVscnt());
11601203

11611204
auto SWaitInst =
@@ -1430,7 +1473,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
14301473
Iter != E;) {
14311474
MachineInstr &Inst = *Iter;
14321475

1433-
// Track pre-existing waitcnts from earlier iterations.
1476+
// Track pre-existing waitcnts that were added in earlier iterations or by
1477+
// the memory legalizer.
14341478
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
14351479
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
14361480
Inst.getOperand(0).isReg() &&

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,14 @@ struct Waitcnt {
477477
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
478478
}
479479

480+
bool hasWaitExceptVsCnt() const {
481+
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
482+
}
483+
484+
bool hasWaitVsCnt() const {
485+
return VsCnt != ~0u;
486+
}
487+
480488
bool dominates(const Waitcnt &Other) const {
481489
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
482490
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
184184
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
185185
; GFX9-NEXT: s_and_b32 s0, s0, 15
186186
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
187-
; GFX9-NEXT: s_waitcnt vmcnt(0)
188187
; GFX9-NEXT: v_mov_b32_e32 v0, 15
189188
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
190189
; GFX9-NEXT: scratch_store_dword off, v0, s1
@@ -357,7 +356,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
357356
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
358357
; GFX9-NEXT: s_and_b32 s0, s0, 15
359358
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
360-
; GFX9-NEXT: s_waitcnt vmcnt(0)
361359
; GFX9-NEXT: v_mov_b32_e32 v0, 15
362360
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
363361
; GFX9-NEXT: scratch_store_dword off, v0, s1

llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
7272
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7373
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7474
; GCN-NEXT: buffer_wbinvl1_vol
75-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
7675
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7776
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7877
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]

llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -788,7 +788,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
788788
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
789789
; GFX9-NEXT: s_and_b32 s0, s0, 15
790790
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
791-
; GFX9-NEXT: s_waitcnt vmcnt(0)
792791
; GFX9-NEXT: v_mov_b32_e32 v0, 15
793792
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
794793
; GFX9-NEXT: scratch_store_dword off, v0, s1
@@ -1419,7 +1418,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
14191418
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
14201419
; GFX9-NEXT: s_and_b32 s0, s0, 15
14211420
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
1422-
; GFX9-NEXT: s_waitcnt vmcnt(0)
14231421
; GFX9-NEXT: v_mov_b32_e32 v0, 15
14241422
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
14251423
; GFX9-NEXT: scratch_store_dword off, v0, s1

llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
630630
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
631631
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
632632
; GFX90A-NEXT: buffer_wbinvl1_vol
633-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
634633
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
635634
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
636635
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
@@ -706,7 +705,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
706705
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
707706
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
708707
; GFX90A-NEXT: buffer_wbinvl1_vol
709-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
710708
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
711709
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
712710
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
@@ -731,7 +729,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
731729
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
732730
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
733731
; GFX90A-NEXT: buffer_wbinvl1_vol
734-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
735732
; GFX90A-NEXT: s_setpc_b64 s[30:31]
736733
main_body:
737734
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst

0 commit comments

Comments
 (0)