@@ -349,6 +349,16 @@ class WaitcntBrackets {
349349 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
350350 }
351351
352+ bool hasPendingGDS () const {
353+ return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
354+ }
355+
356+ unsigned getPendingGDSWait () const {
357+ return std::min (getScoreUB (DS_CNT) - LastGDS, getWaitCountMax (DS_CNT) - 1 );
358+ }
359+
360+ void setPendingGDS () { LastGDS = ScoreUBs[DS_CNT]; }
361+
352362 // Return true if there might be pending writes to the vgpr-interval by VMEM
353363 // instructions with types different from V.
354364 bool hasOtherPendingVmemTypes (RegInterval Interval, VmemType V) const {
@@ -427,6 +437,8 @@ class WaitcntBrackets {
427437 unsigned PendingEvents = 0 ;
428438 // Remember the last flat memory operation.
429439 unsigned LastFlat[NUM_INST_CNTS] = {0 };
440+ // Remember the last GDS operation.
441+ unsigned LastGDS = 0 ;
430442 // wait_cnt scores for every vgpr.
431443 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
432444 int VgprUB = -1 ;
@@ -729,6 +741,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
729741 MachineInstr *OldWaitcntInstr);
730742 void updateEventWaitcntAfter (MachineInstr &Inst,
731743 WaitcntBrackets *ScoreBrackets);
744+ bool isNextENDPGM (MachineBasicBlock::instr_iterator It,
745+ MachineBasicBlock *Block) const ;
746+ bool insertForcedWaitAfter (MachineInstr &Inst, MachineBasicBlock &Block,
747+ WaitcntBrackets &ScoreBrackets);
732748 bool insertWaitcntInBlock (MachineFunction &MF, MachineBasicBlock &Block,
733749 WaitcntBrackets &ScoreBrackets);
734750};
@@ -1682,6 +1698,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16821698 }
16831699 }
16841700
1701+ // Wait for any pending GDS instruction to complete before any
1702+ // "Always GDS" instruction.
1703+ if (TII->isAlwaysGDS (MI.getOpcode ()) && ScoreBrackets.hasPendingGDS ())
1704+ addWait (Wait, DS_CNT, ScoreBrackets.getPendingGDSWait ());
1705+
16851706 if (MI.isCall () && callWaitsOnFunctionEntry (MI)) {
16861707 // The function is going to insert a wait on everything in its prolog.
16871708 // This still needs to be careful if the call target is a load (e.g. a GOT
@@ -1986,6 +2007,64 @@ static bool isCacheInvOrWBInst(MachineInstr &Inst) {
19862007 Opc == AMDGPU::GLOBAL_WBINV;
19872008}
19882009
2010+ // Return true if the next instruction is S_ENDPGM, following fallthrough
2011+ // blocks if necessary.
2012+ bool SIInsertWaitcnts::isNextENDPGM (MachineBasicBlock::instr_iterator It,
2013+ MachineBasicBlock *Block) const {
2014+ auto BlockEnd = Block->getParent ()->end ();
2015+ auto BlockIter = Block->getIterator ();
2016+
2017+ while (true ) {
2018+ if (It.isEnd ()) {
2019+ if (++BlockIter != BlockEnd) {
2020+ It = BlockIter->instr_begin ();
2021+ continue ;
2022+ }
2023+
2024+ return false ;
2025+ }
2026+
2027+ if (!It->isMetaInstruction ())
2028+ break ;
2029+
2030+ It++;
2031+ }
2032+
2033+ assert (!It.isEnd ());
2034+
2035+ return It->getOpcode () == AMDGPU::S_ENDPGM;
2036+ }
2037+
2038+ // Add a wait after an instruction if architecture requirements mandate one.
2039+ bool SIInsertWaitcnts::insertForcedWaitAfter (MachineInstr &Inst,
2040+ MachineBasicBlock &Block,
2041+ WaitcntBrackets &ScoreBrackets) {
2042+ AMDGPU::Waitcnt Wait;
2043+ bool NeedsEndPGMCheck = false ;
2044+
2045+ if (ST->isPreciseMemoryEnabled () && Inst.mayLoadOrStore ())
2046+ Wait = WCG->getAllZeroWaitcnt (Inst.mayStore () &&
2047+ !SIInstrInfo::isAtomicRet (Inst));
2048+
2049+ if (TII->isAlwaysGDS (Inst.getOpcode ())) {
2050+ Wait.DsCnt = 0 ;
2051+ NeedsEndPGMCheck = true ;
2052+ }
2053+
2054+ ScoreBrackets.simplifyWaitcnt (Wait);
2055+
2056+ auto SuccessorIt = std::next (Inst.getIterator ());
2057+ bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2058+ /* OldWaitcntInstr=*/ nullptr );
2059+
2060+ if (Result && NeedsEndPGMCheck && isNextENDPGM (SuccessorIt, &Block)) {
2061+ BuildMI (Block, SuccessorIt, Inst.getDebugLoc (), TII->get (AMDGPU::S_NOP))
2062+ .addImm (0 );
2063+ }
2064+
2065+ return Result;
2066+ }
2067+
19892068void SIInsertWaitcnts::updateEventWaitcntAfter (MachineInstr &Inst,
19902069 WaitcntBrackets *ScoreBrackets) {
19912070 // Now look at the instruction opcode. If it is a memory access
@@ -1998,6 +2077,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
19982077 TII->hasModifiersSet (Inst, AMDGPU::OpName::gds)) {
19992078 ScoreBrackets->updateByEvent (TII, TRI, MRI, GDS_ACCESS, Inst);
20002079 ScoreBrackets->updateByEvent (TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2080+ ScoreBrackets->setPendingGDS ();
20012081 } else {
20022082 ScoreBrackets->updateByEvent (TII, TRI, MRI, LDS_ACCESS, Inst);
20032083 }
@@ -2128,6 +2208,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
21282208
21292209 StrictDom |= mergeScore (M, LastFlat[T], Other.LastFlat [T]);
21302210
2211+ if (T == DS_CNT)
2212+ StrictDom |= mergeScore (M, LastGDS, Other.LastGDS );
2213+
21312214 for (int J = 0 ; J <= VgprUB; J++)
21322215 StrictDom |= mergeScore (M, VgprScores[T][J], Other.VgprScores [T][J]);
21332216
@@ -2253,13 +2336,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
22532336
22542337 updateEventWaitcntAfter (Inst, &ScoreBrackets);
22552338
2256- if (ST->isPreciseMemoryEnabled () && Inst.mayLoadOrStore ()) {
2257- AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt (
2258- Inst.mayStore () && !SIInstrInfo::isAtomicRet (Inst));
2259- ScoreBrackets.simplifyWaitcnt (Wait);
2260- Modified |= generateWaitcnt (Wait, std::next (Inst.getIterator ()), Block,
2261- ScoreBrackets, /* OldWaitcntInstr=*/ nullptr );
2262- }
2339+ Modified |= insertForcedWaitAfter (Inst, Block, ScoreBrackets);
22632340
22642341 LLVM_DEBUG ({
22652342 Inst.print (dbgs ());
0 commit comments