@@ -104,25 +104,38 @@ struct HardwareLimits {
104104 unsigned KmcntMax; // gfx12+ only.
105105};
106106
// X-macro listing every wait event exactly once. DECL is a function-like macro
// taking the event name; it is applied to each event in the order below, so
// any enum and any table generated from this list stay in sync by construction.
//
// Note: there must be no whitespace between the macro name and its parameter
// list, otherwise the preprocessor defines an object-like macro instead.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS)              /* vmem read & write */                       \
  DECL(VMEM_READ_ACCESS)         /* vmem read */                               \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  DECL(LDS_ACCESS)               /* lds read & write */                        \
  DECL(GDS_ACCESS)               /* gds read & write */                        \
  DECL(SQ_MESSAGE)               /* send message */                            \
  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
  DECL(EXP_POS_ACCESS)           /* write to export position */                \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
124+
125+ // clang-format off
126+ #define AMDGPU_EVENT_ENUM (Name ) Name,
107127enum WaitEventType {
108- VMEM_ACCESS, // vector-memory read & write
109- VMEM_READ_ACCESS, // vector-memory read
110- VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
111- VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
112- VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
113- SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
114- LDS_ACCESS, // lds read & write
115- GDS_ACCESS, // gds read & write
116- SQ_MESSAGE, // send message
117- SMEM_ACCESS, // scalar-memory read & write
118- EXP_GPR_LOCK, // export holding on its data src
119- GDS_GPR_LOCK, // GDS holding on its data and addr src
120- EXP_POS_ACCESS, // write to export position
121- EXP_PARAM_ACCESS, // write to export parameter
122- VMW_GPR_LOCK, // vector-memory write holding on its data src
123- EXP_LDS_ACCESS, // read by ldsdir counting as export
124- NUM_WAIT_EVENTS,
128+ AMDGPU_DECLARE_WAIT_EVENTS (AMDGPU_EVENT_ENUM)
129+ NUM_WAIT_EVENTS
125130};
131+ #undef AMDGPU_EVENT_ENUM
132+
133+ #define AMDGPU_EVENT_NAME (Name ) #Name,
134+ static constexpr StringLiteral WaitEventTypeName[] = {
135+ AMDGPU_DECLARE_WAIT_EVENTS (AMDGPU_EVENT_NAME)
136+ };
137+ #undef AMDGPU_EVENT_NAME
138+ // clang-format on
126139
127140// The mapping is:
128141// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -1100,6 +1113,20 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11001113 }
11011114 OS << ' \n ' ;
11021115 }
1116+
1117+ OS << " Pending Events: " ;
1118+ if (hasPendingEvent ()) {
1119+ ListSeparator LS;
1120+ for (unsigned I = 0 ; I != NUM_WAIT_EVENTS; ++I) {
1121+ if (hasPendingEvent ((WaitEventType)I)) {
1122+ OS << LS << WaitEventTypeName[I];
1123+ }
1124+ }
1125+ } else {
1126+ OS << " none" ;
1127+ }
1128+ OS << ' \n ' ;
1129+
11031130 OS << ' \n ' ;
11041131}
11051132
@@ -1265,10 +1292,15 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
12651292 MachineInstr *WaitcntInstr = nullptr ;
12661293 MachineInstr *WaitcntVsCntInstr = nullptr ;
12671294
1295+ LLVM_DEBUG (dbgs () << " PreGFX12::applyPreexistingWaitcnt at: " << *It);
1296+
12681297 for (auto &II :
12691298 make_early_inc_range (make_range (OldWaitcntInstr.getIterator (), It))) {
1270- if (II.isMetaInstruction ())
1299+ LLVM_DEBUG (dbgs () << " pre-existing iter: " << II);
1300+ if (II.isMetaInstruction ()) {
1301+ LLVM_DEBUG (dbgs () << " skipped meta instruction\n " );
12711302 continue ;
1303+ }
12721304
12731305 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode (II.getOpcode ());
12741306 bool TrySimplify = Opcode != II.getOpcode () && !OptNone;
@@ -1320,9 +1352,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13201352
13211353 LLVM_DEBUG (It == WaitcntInstr->getParent ()->end ()
13221354 ? dbgs ()
1323- << " applyPreexistingWaitcnt \n "
1355+ << " applied pre-existing waitcnt \n "
13241356 << " New Instr at block end: " << *WaitcntInstr << ' \n '
1325- : dbgs () << " applyPreexistingWaitcnt \n "
1357+ : dbgs () << " applied pre-existing waitcnt \n "
13261358 << " Old Instr: " << *It
13271359 << " New Instr: " << *WaitcntInstr << ' \n ' );
13281360 }
@@ -1336,10 +1368,10 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13361368 Wait.StoreCnt = ~0u ;
13371369
13381370 LLVM_DEBUG (It == WaitcntVsCntInstr->getParent ()->end ()
1339- ? dbgs () << " applyPreexistingWaitcnt \n "
1371+ ? dbgs () << " applied pre-existing waitcnt \n "
13401372 << " New Instr at block end: " << *WaitcntVsCntInstr
13411373 << ' \n '
1342- : dbgs () << " applyPreexistingWaitcnt \n "
1374+ : dbgs () << " applied pre-existing waitcnt \n "
13431375 << " Old Instr: " << *It
13441376 << " New Instr: " << *WaitcntVsCntInstr << ' \n ' );
13451377 }
@@ -1413,10 +1445,15 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
14131445 MachineInstr *CombinedStoreDsCntInstr = nullptr ;
14141446 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
14151447
1448+ LLVM_DEBUG (dbgs () << " GFX12Plus::applyPreexistingWaitcnt at: " << *It);
1449+
14161450 for (auto &II :
14171451 make_early_inc_range (make_range (OldWaitcntInstr.getIterator (), It))) {
1418- if (II.isMetaInstruction ())
1452+ LLVM_DEBUG (dbgs () << " pre-existing iter: " << II);
1453+ if (II.isMetaInstruction ()) {
1454+ LLVM_DEBUG (dbgs () << " skipped meta instruction\n " );
14191455 continue ;
1456+ }
14201457
14211458 MachineInstr **UpdatableInstr;
14221459
@@ -1486,10 +1523,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
14861523 Wait.DsCnt = ~0u ;
14871524
14881525 LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
1489- ? dbgs () << " applyPreexistingWaitcnt \n "
1526+ ? dbgs () << " applied pre-existing waitcnt \n "
14901527 << " New Instr at block end: "
14911528 << *CombinedLoadDsCntInstr << ' \n '
1492- : dbgs () << " applyPreexistingWaitcnt \n "
1529+ : dbgs () << " applied pre-existing waitcnt \n "
14931530 << " Old Instr: " << *It << " New Instr: "
14941531 << *CombinedLoadDsCntInstr << ' \n ' );
14951532 } else {
@@ -1511,10 +1548,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15111548 Wait.DsCnt = ~0u ;
15121549
15131550 LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
1514- ? dbgs () << " applyPreexistingWaitcnt \n "
1551+ ? dbgs () << " applied pre-existing waitcnt \n "
15151552 << " New Instr at block end: "
15161553 << *CombinedStoreDsCntInstr << ' \n '
1517- : dbgs () << " applyPreexistingWaitcnt \n "
1554+ : dbgs () << " applied pre-existing waitcnt \n "
15181555 << " Old Instr: " << *It << " New Instr: "
15191556 << *CombinedStoreDsCntInstr << ' \n ' );
15201557 } else {
@@ -1570,10 +1607,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15701607 setNoWait (Wait, CT);
15711608
15721609 LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
1573- ? dbgs () << " applyPreexistingWaitcnt \n "
1610+ ? dbgs () << " applied pre-existing waitcnt \n "
15741611 << " New Instr at block end: " << *WaitInstrs[CT]
15751612 << ' \n '
1576- : dbgs () << " applyPreexistingWaitcnt \n "
1613+ : dbgs () << " applied pre-existing waitcnt \n "
15771614 << " Old Instr: " << *It
15781615 << " New Instr: " << *WaitInstrs[CT] << ' \n ' );
15791616 } else {
@@ -2306,7 +2343,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
23062343 bool Modified = false ;
23072344
23082345 LLVM_DEBUG ({
2309- dbgs () << " *** Block" << Block.getNumber () << " ***" ;
2346+ dbgs () << " *** Begin Block: " ;
2347+ Block.printName (dbgs ());
23102348 ScoreBrackets.dump ();
23112349 });
23122350
@@ -2437,6 +2475,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24372475 Modified |= generateWaitcnt (Wait, Block.instr_end (), Block, ScoreBrackets,
24382476 OldWaitcntInstr);
24392477
2478+ LLVM_DEBUG ({
2479+ dbgs () << " *** End Block: " ;
2480+ Block.printName (dbgs ());
2481+ ScoreBrackets.dump ();
2482+ });
2483+
24402484 return Modified;
24412485}
24422486
@@ -2699,17 +2743,21 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
26992743 BlockInfo &SuccBI = SuccBII->second ;
27002744 if (!SuccBI.Incoming ) {
27012745 SuccBI.Dirty = true ;
2702- if (SuccBII <= BII)
2746+ if (SuccBII <= BII) {
2747+ LLVM_DEBUG (dbgs () << " repeat on backedge\n " );
27032748 Repeat = true ;
2749+ }
27042750 if (!MoveBracketsToSucc) {
27052751 MoveBracketsToSucc = &SuccBI;
27062752 } else {
27072753 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
27082754 }
27092755 } else if (SuccBI.Incoming ->merge (*Brackets)) {
27102756 SuccBI.Dirty = true ;
2711- if (SuccBII <= BII)
2757+ if (SuccBII <= BII) {
2758+ LLVM_DEBUG (dbgs () << " repeat on backedge\n " );
27122759 Repeat = true ;
2760+ }
27132761 }
27142762 }
27152763 if (MoveBracketsToSucc)
0 commit comments