@@ -121,6 +121,7 @@ struct HardwareLimits {
121121 DECL (LDS_ACCESS) /* lds read & write */ \
122122 DECL (GDS_ACCESS) /* gds read & write */ \
123123 DECL (SQ_MESSAGE) /* send message */ \
124+ DECL (SCC_WRITE) /* write to SCC from barrier */ \
124125 DECL (SMEM_ACCESS) /* scalar-memory read & write */ \
125126 DECL (SMEM_GROUP) /* scalar-memory group */ \
126127 DECL (EXP_GPR_LOCK) /* export holding on its data src */ \
@@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
149150// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
150151// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
151152// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
153+ // NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
152154// We reserve a fixed number of VGPR slots in the scoring tables for
153155// special tokens like SCMEM_LDS (needed for buffer load to LDS).
154156enum RegisterMapping {
@@ -163,6 +165,9 @@ enum RegisterMapping {
163165 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
164166 NUM_LDS_VGPRS = 9 , // One more than the stores we track.
165167 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
168+ NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
169+ // Remaining non-allocatable registers
170+ SCC = NUM_ALL_ALLOCATABLE
166171};
167172
168173// Enumerate different types of result-returning VMEM operations. Although
@@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
401406 eventMask ({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
402407 eventMask ({VMEM_SAMPLER_READ_ACCESS}),
403408 eventMask ({VMEM_BVH_READ_ACCESS}),
404- eventMask ({SMEM_ACCESS, SQ_MESSAGE}),
409+ eventMask ({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE }),
405410 eventMask ({VMEM_GROUP, SMEM_GROUP})};
406411
407412 return WaitEventMaskForInstGFX12Plus;
@@ -586,6 +591,7 @@ class SIInsertWaitcnts {
586591 WaitcntBrackets &ScoreBrackets);
587592 bool insertWaitcntInBlock (MachineFunction &MF, MachineBasicBlock &Block,
588593 WaitcntBrackets &ScoreBrackets);
594+ static bool asynchronouslyWritesSCC (unsigned Opcode);
589595};
590596
591597// This objects maintains the current score brackets of each wait counter, and
@@ -626,7 +632,12 @@ class WaitcntBrackets {
// Returns the score recorded for register slot GprNo under counter type T.
// Slots below NUM_ALL_VGPRS are read from VgprScores; slots below
// NUM_ALL_ALLOCATABLE are read from SgprScores after rebasing the index by
// NUM_ALL_VGPRS; the only other slot accepted (checked by the assert) is SCC,
// whose score lives in the single SCCScore field and is returned regardless
// of T.
// The line marked '-' below is the pre-change unconditional SGPR lookup that
// the '+' lines replace.
626632 unsigned getRegScore (int GprNo, InstCounterType T) const {
627633 if (GprNo < NUM_ALL_VGPRS)
628634 return VgprScores[T][GprNo];
629- return SgprScores[getSgprScoresIdx (T)][GprNo - NUM_ALL_VGPRS];
635+
636+ if (GprNo < NUM_ALL_ALLOCATABLE)
637+ return SgprScores[getSgprScoresIdx (T)][GprNo - NUM_ALL_VGPRS];
638+
639+ assert (GprNo == SCC);
640+ return SCCScore;
630641 }
631642
632643 bool merge (const WaitcntBrackets &Other);
@@ -646,6 +657,7 @@ class WaitcntBrackets {
646657 AMDGPU::Waitcnt &Wait) const {
647658 determineWait (T, {RegNo, RegNo + 1 }, Wait);
648659 }
660+ void tryClearSCCWriteEvent (MachineInstr *Inst);
649661
650662 void applyWaitcnt (const AMDGPU::Waitcnt &Wait);
651663 void applyWaitcnt (InstCounterType T, unsigned Count);
@@ -785,6 +797,10 @@ class WaitcntBrackets {
785797 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
786798 // X_CNT score.
787799 unsigned SgprScores[2 ][SQ_MAX_PGM_SGPRS] = {{0 }};
800+ // Reg score for SCC.
801+ unsigned SCCScore = 0 ;
802+ // The unique instruction that has an SCC write pending, if there is one.
803+ const MachineInstr *PendingSCCWrite = nullptr ;
788804 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
789805 // write to each vgpr.
790806 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0 };
@@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
820836 const MachineRegisterInfo *MRI,
821837 const SIRegisterInfo *TRI,
822838 const MachineOperand &Op) const {
839+ if (Op.getReg () == AMDGPU::SCC)
840+ return {SCC, SCC + 1 };
841+
823842 if (!TRI->isInAllocatableClass (Op.getReg ()))
824843 return {-1 , -1 };
825844
@@ -873,9 +892,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
873892 if (RegNo < NUM_ALL_VGPRS) {
874893 VgprUB = std::max (VgprUB, RegNo);
875894 VgprScores[CntTy][RegNo] = Score;
876- } else {
895+ } else if (RegNo < NUM_ALL_ALLOCATABLE) {
877896 SgprUB = std::max (SgprUB, RegNo - NUM_ALL_VGPRS);
878897 SgprScores[getSgprScoresIdx (CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
898+ } else {
899+ assert (RegNo == SCC);
900+ SCCScore = Score;
879901 }
880902 }
881903}
@@ -1086,6 +1108,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
10861108 if (Slot)
10871109 setRegScore (FIRST_LDS_VGPR, T, CurrScore);
10881110 }
1111+
1112+ if (Context->asynchronouslyWritesSCC (Inst.getOpcode ())) {
1113+ setRegScore (SCC, T, CurrScore);
1114+ PendingSCCWrite = &Inst;
1115+ }
10891116 }
10901117}
10911118
@@ -1154,6 +1181,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11541181 OS << RelScore << " :s" << J << " " ;
11551182 }
11561183 }
1184+ if (T == KM_CNT && SCCScore > 0 )
1185+ OS << SCCScore << " :scc " ;
11571186 }
11581187 OS << ' \n ' ;
11591188 }
@@ -1228,6 +1257,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
12281257 }
12291258}
12301259
1260+ void WaitcntBrackets::tryClearSCCWriteEvent (MachineInstr *Inst) {
1261+ // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1262+ // SCC has landed
1263+ if (PendingSCCWrite &&
1264+ PendingSCCWrite->getOpcode () == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1265+ PendingSCCWrite->getOperand (0 ).getImm () == Inst->getOperand (0 ).getImm ()) {
1266+ unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1267+ // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1268+ if ((PendingEvents & Context->WaitEventMaskForInst [KM_CNT]) ==
1269+ SCC_WRITE_PendingEvent) {
1270+ setScoreLB (KM_CNT, getScoreUB (KM_CNT));
1271+ }
1272+
1273+ PendingEvents &= ~SCC_WRITE_PendingEvent;
1274+ PendingSCCWrite = nullptr ;
1275+ }
1276+ }
1277+
12311278void WaitcntBrackets::applyWaitcnt (const AMDGPU::Waitcnt &Wait) {
12321279 applyWaitcnt (LOAD_CNT, Wait.LoadCnt );
12331280 applyWaitcnt (EXP_CNT, Wait.ExpCnt );
@@ -1917,6 +1964,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
19171964 Wait);
19181965 }
19191966 }
1967+ } else if (MI.getOpcode () == AMDGPU::S_BARRIER_WAIT) {
1968+ ScoreBrackets.tryClearSCCWriteEvent (&MI);
19201969 } else {
19211970 // FIXME: Should not be relying on memoperands.
19221971 // Look at the source operands of every instruction to see if
@@ -2006,6 +2055,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20062055 ScoreBrackets.determineWait (EXP_CNT, Interval, Wait);
20072056 }
20082057 ScoreBrackets.determineWait (DS_CNT, Interval, Wait);
2058+ } else if (Op.getReg () == AMDGPU::SCC) {
2059+ ScoreBrackets.determineWait (KM_CNT, Interval, Wait);
20092060 } else {
20102061 ScoreBrackets.determineWait (SmemAccessCounter, Interval, Wait);
20112062 }
@@ -2343,6 +2394,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
23432394 ScoreBrackets->updateByEvent (TII, TRI, MRI, EXP_POS_ACCESS, Inst);
23442395 else
23452396 ScoreBrackets->updateByEvent (TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2397+ } else if (asynchronouslyWritesSCC (Inst.getOpcode ())) {
2398+ ScoreBrackets->updateByEvent (TII, TRI, MRI, SCC_WRITE, Inst);
23462399 } else {
23472400 switch (Inst.getOpcode ()) {
23482401 case AMDGPU::S_SENDMSG:
@@ -2353,9 +2406,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
23532406 break ;
23542407 case AMDGPU::S_MEMTIME:
23552408 case AMDGPU::S_MEMREALTIME:
2356- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2357- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2358- case AMDGPU::S_BARRIER_LEAVE:
23592409 case AMDGPU::S_GET_BARRIER_STATE_M0:
23602410 case AMDGPU::S_GET_BARRIER_STATE_IMM:
23612411 ScoreBrackets->updateByEvent (TII, TRI, MRI, SMEM_ACCESS, Inst);
@@ -2422,6 +2472,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
24222472 if (T == DS_CNT)
24232473 StrictDom |= mergeScore (M, LastGDS, Other.LastGDS );
24242474
2475+ if (T == KM_CNT) {
2476+ StrictDom |= mergeScore (M, SCCScore, Other.SCCScore );
2477+ if (Other.hasPendingEvent (SCC_WRITE)) {
2478+ unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2479+ if (!OldEventsHasSCCWrite) {
2480+ PendingSCCWrite = Other.PendingSCCWrite ;
2481+ } else {
2482+ if (PendingSCCWrite != Other.PendingSCCWrite )
2483+ PendingSCCWrite = nullptr ;
2484+ }
2485+ }
2486+ }
2487+
24252488 for (int J = 0 ; J <= VgprUB; J++)
24262489 StrictDom |= mergeScore (M, VgprScores[T][J], Other.VgprScores [T][J]);
24272490
@@ -2453,6 +2516,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
24532516 counterTypeForInstr (Opcode).has_value ();
24542517}
24552518
2519+ bool SIInsertWaitcnts::asynchronouslyWritesSCC (unsigned Opcode) {
2520+ return Opcode == AMDGPU::S_BARRIER_LEAVE ||
2521+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
2522+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
2523+ }
2524+
24562525// Generate s_waitcnt instructions where needed.
24572526bool SIInsertWaitcnts::insertWaitcntInBlock (MachineFunction &MF,
24582527 MachineBasicBlock &Block,
0 commit comments