@@ -245,8 +245,8 @@ class WaitcntBrackets {
                              const SIRegisterInfo *TRI, unsigned OpNo) const;
 
   bool counterOutOfOrder(InstCounterType T) const;
-  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
-  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
   void determineWait(InstCounterType T, unsigned ScoreToWait,
                      AMDGPU::Waitcnt &Wait) const;
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
@@ -418,7 +418,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
     }
 
     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
-       DebugCounter::shouldExecute(ForceLgkmCounter)) {
+        DebugCounter::shouldExecute(ForceLgkmCounter)) {
       ForceEmitWaitcnt[LGKM_CNT] = true;
     } else {
       ForceEmitWaitcnt[LGKM_CNT] = false;
@@ -442,6 +442,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
                                WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
+  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                               MachineInstr &OldWaitcntInstr,
+                               AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
 };
 
 } // end anonymous namespace
@@ -708,22 +711,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
 
 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
 /// whether a waitcnt instruction is needed at all.
-bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
-  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
-         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
-         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
-         simplifyWaitcnt(VS_CNT, Wait.VsCnt);
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
 }
 
-bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                       unsigned &Count) const {
   const unsigned LB = getScoreLB(T);
   const unsigned UB = getScoreUB(T);
-  if (Count < UB && UB - Count > LB)
-    return true;
 
-  Count = ~0u;
-  return false;
+  // The number of outstanding events for this type, T, can be calculated
+  // as (UB - LB). If the current Count is greater than or equal to the number
+  // of outstanding events, then the wait for this counter is redundant.
+  if (Count >= UB - LB)
+    Count = ~0u;
 }
 
 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
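The rewritten check reads directly off the score brackets: with LB = 3 and UB = 7, four events of that type are still outstanding, so a requested wait of 4 or more is already satisfied and is cleared to the ~0u sentinel, while a wait of 2 survives. A minimal standalone sketch of just this check (the free function and driver are illustrative, not the pass's API):

#include <cassert>

// Illustrative stand-in for WaitcntBrackets::simplifyWaitcnt(T, Count).
// LB/UB are the counter's score bracket; Count is the requested wait.
static void simplifyWaitcnt(unsigned LB, unsigned UB, unsigned &Count) {
  // UB - LB events are outstanding; waiting for that many or more is a no-op.
  if (Count >= UB - LB)
    Count = ~0u; // ~0u is the pass's "no wait needed" sentinel.
}

int main() {
  unsigned Redundant = 4, Needed = 2;
  simplifyWaitcnt(3, 7, Redundant); // 4 >= 7 - 3: redundant, cleared
  simplifyWaitcnt(3, 7, Needed);    // 2 <  7 - 3: a real wait, kept
  assert(Redundant == ~0u && Needed == 2);
  return 0;
}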
@@ -798,6 +802,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
   return new SIInsertWaitcnts();
 }
 
+/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
+/// by previous passes. Currently this pass conservatively assumes that these
+/// preexisting waitcnt are required for correctness.
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr &OldWaitcntInstr,
+                                               AMDGPU::Waitcnt &Wait,
+                                               const MachineInstr *MI) {
+  bool Modified = false;
+  MachineInstr *WaitcntInstr = nullptr;
+  MachineInstr *WaitcntVsCntInstr = nullptr;
+  for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
+       &*II != MI; II = NextI, ++NextI) {
+    if (II->isMetaInstruction())
+      continue;
+
+    if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+      // Conservatively update required wait if this waitcnt was added in an
+      // earlier pass. In this case it will not exist in the tracked waitcnt
+      // set.
+      if (!TrackedWaitcntSet.count(&*II)) {
+        unsigned IEnc = II->getOperand(0).getImm();
+        AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+        Wait = Wait.combined(OldWait);
+      }
+
+      // Merge consecutive waitcnt of the same type by erasing multiples.
+      if (!WaitcntInstr) {
+        WaitcntInstr = &*II;
+      } else {
+        II->eraseFromParent();
+        Modified = true;
+      }
+
+    } else {
+      assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+      assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+      if (!TrackedWaitcntSet.count(&*II)) {
+        unsigned OldVSCnt =
+            TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+        Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+      }
+
+      if (!WaitcntVsCntInstr) {
+        WaitcntVsCntInstr = &*II;
+      } else {
+        II->eraseFromParent();
+        Modified = true;
+      }
+    }
+  }
+
+  // Updated encoding of merged waitcnt with the required wait.
+  if (WaitcntInstr) {
+    if (Wait.hasWaitExceptVsCnt()) {
+      unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+      unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
+      if (OldEnc != NewEnc) {
+        WaitcntInstr->getOperand(0).setImm(NewEnc);
+        Modified = true;
+      }
+      ScoreBrackets.applyWaitcnt(Wait);
+      Wait.VmCnt = ~0u;
+      Wait.LgkmCnt = ~0u;
+      Wait.ExpCnt = ~0u;
+
+      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+                        << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
+                        << '\n');
+    } else {
+      WaitcntInstr->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  if (WaitcntVsCntInstr) {
+    if (Wait.hasWaitVsCnt()) {
+      assert(ST->hasVscnt());
+      unsigned OldVSCnt =
+          TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+              ->getImm();
+      if (Wait.VsCnt != OldVSCnt) {
+        TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+            ->setImm(Wait.VsCnt);
+        Modified = true;
+      }
+      ScoreBrackets.applyWaitcnt(Wait);
+      Wait.VsCnt = ~0u;
+
+      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+                        << "Old Instr: " << MI
+                        << "New Instr: " << *WaitcntVsCntInstr << '\n');
+    } else {
+      WaitcntVsCntInstr->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  return Modified;
+}
+
 static bool readsVCCZ(const MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
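The merge loop above keeps the first S_WAITCNT it finds, erases any later ones, and rewrites the survivor's immediate with the combined wait. Combining is a per-counter minimum, since a smaller count is a stricter wait; for example, s_waitcnt vmcnt(0) followed by s_waitcnt lgkmcnt(0) collapses into a single s_waitcnt vmcnt(0) lgkmcnt(0). A toy model of that fold (the struct mirrors AMDGPU::Waitcnt's fields but is a sketch, not the LLVM definition):

#include <algorithm>
#include <cstdio>

// Sketch of AMDGPU::Waitcnt's combining behavior; ~0u means "no wait".
struct Waitcnt {
  unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;
  Waitcnt combined(const Waitcnt &O) const {
    // A smaller count is a stricter wait, so take the minimum per counter.
    return {std::min(VmCnt, O.VmCnt), std::min(ExpCnt, O.ExpCnt),
            std::min(LgkmCnt, O.LgkmCnt), std::min(VsCnt, O.VsCnt)};
  }
};

int main() {
  Waitcnt First, Second;
  First.VmCnt = 0;    // models s_waitcnt vmcnt(0)
  Second.LgkmCnt = 0; // models s_waitcnt lgkmcnt(0)
  Waitcnt Merged = First.combined(Second);
  printf("vmcnt=%u lgkmcnt=%u\n", Merged.VmCnt, Merged.LgkmCnt); // both 0
  return 0;
}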
@@ -833,12 +938,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
     MachineInstr *OldWaitcntInstr) {
   setForceEmitWaitcnt();
-  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
   if (MI.isMetaInstruction())
     return false;
 
   AMDGPU::Waitcnt Wait;
+  bool Modified = false;
 
   // See if this instruction has a forced S_WAITCNT VM.
   // TODO: Handle other cases of NeedsWaitcntVmBefore()
@@ -1053,32 +1158,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     }
   }
 
-  // Early-out if no wait is indicated.
-  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
-    bool Modified = false;
-    if (OldWaitcntInstr) {
-      for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
-           &*II != &MI; II = NextI, ++NextI) {
-        if (II->isDebugInstr())
-          continue;
-
-        if (TrackedWaitcntSet.count(&*II)) {
-          TrackedWaitcntSet.erase(&*II);
-          II->eraseFromParent();
-          Modified = true;
-        } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
-          int64_t Imm = II->getOperand(0).getImm();
-          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
-        } else {
-          assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
-          assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-          auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
-          ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
-        }
-      }
-    }
-    return Modified;
-  }
+  // Verify that the wait is actually needed.
+  ScoreBrackets.simplifyWaitcnt(Wait);
 
   if (ForceEmitZeroWaitcnts)
     Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
@@ -1092,57 +1173,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
   if (ForceEmitWaitcnt[VS_CNT])
     Wait.VsCnt = 0;
 
-  ScoreBrackets.applyWaitcnt(Wait);
-
-  AMDGPU::Waitcnt OldWait;
-  bool Modified = false;
-
   if (OldWaitcntInstr) {
-    for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
-         &*II != &MI; II = NextI, NextI++) {
-      if (II->isDebugInstr())
-        continue;
-
-      if (II->getOpcode() == AMDGPU::S_WAITCNT) {
-        unsigned IEnc = II->getOperand(0).getImm();
-        AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
-        OldWait = OldWait.combined(IWait);
-        if (!TrackedWaitcntSet.count(&*II))
-          Wait = Wait.combined(IWait);
-        unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
-        if (IEnc != NewEnc) {
-          II->getOperand(0).setImm(NewEnc);
-          Modified = true;
-        }
-        Wait.VmCnt = ~0u;
-        Wait.LgkmCnt = ~0u;
-        Wait.ExpCnt = ~0u;
-      } else {
-        assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
-        assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-
-        unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
-                            ->getImm();
-        OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
-        if (!TrackedWaitcntSet.count(&*II))
-          Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
-        if (Wait.VsCnt != ICnt) {
-          TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
-          Modified = true;
-        }
-        Wait.VsCnt = ~0u;
-      }
-
-      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                        << "Old Instr: " << MI
-                        << "New Instr: " << *II << '\n');
-
-      if (!Wait.hasWait())
-        return Modified;
-    }
+    // Try to merge the required wait with preexisting waitcnt instructions.
+    // Also erase redundant waitcnt.
+    Modified =
+        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
+  } else {
+    // Update waitcnt brackets after determining the required wait.
+    ScoreBrackets.applyWaitcnt(Wait);
   }
 
-  if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
+  // instruction was modified to handle the required wait.
+  if (Wait.hasWaitExceptVsCnt()) {
     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
     auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                              MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
@@ -1155,7 +1198,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
                       << "New Instr: " << *SWaitInst << '\n');
   }
 
-  if (Wait.VsCnt != ~0u) {
+  if (Wait.hasWaitVsCnt()) {
     assert(ST->hasVscnt());
 
     auto SWaitInst =
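hasWaitExceptVsCnt() and hasWaitVsCnt() replace the open-coded comparisons against ~0u that the old version used. Their definitions are outside this diff; presumably they reduce to per-field checks against the sentinel, along these lines (a sketch under that assumption, not the verbatim LLVM source):

// Sketch of the helper predicates on AMDGPU::Waitcnt; a field equal to the
// ~0u sentinel means no wait is requested for that counter.
struct Waitcnt {
  unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;
  // True if any counter sharing the S_WAITCNT encoding is requested.
  bool hasWaitExceptVsCnt() const {
    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
  }
  // True if a wait on the separate S_WAITCNT_VSCNT instruction is requested.
  bool hasWaitVsCnt() const { return VsCnt != ~0u; }
};

The split matters because vscnt travels in a separate S_WAITCNT_VSCNT instruction on targets where ST->hasVscnt() is true, while vmcnt, expcnt and lgkmcnt share one S_WAITCNT immediate.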
@@ -1430,7 +1473,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
        Iter != E;) {
     MachineInstr &Inst = *Iter;
 
-    // Track pre-existing waitcnts from earlier iterations.
+    // Track pre-existing waitcnts that were added in earlier iterations or by
+    // the memory legalizer.
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
         (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
          Inst.getOperand(0).isReg() &&