@@ -407,8 +407,13 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
407407};
408408
409409class SIInsertWaitcnts {
410+ public:
411+ const GCNSubtarget *ST;
412+ InstCounterType SmemAccessCounter;
413+ InstCounterType MaxCounter;
414+ const unsigned *WaitEventMaskForInst;
415+
410416private:
411- const GCNSubtarget *ST = nullptr ;
412417 const SIInstrInfo *TII = nullptr ;
413418 const SIRegisterInfo *TRI = nullptr ;
414419 const MachineRegisterInfo *MRI = nullptr ;
@@ -424,8 +429,6 @@ class SIInsertWaitcnts {
424429 bool Dirty = true ;
425430 };
426431
427- InstCounterType SmemAccessCounter;
428-
429432 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
430433
431434 bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -442,7 +445,7 @@ class SIInsertWaitcnts {
442445 // message.
443446 DenseSet<MachineInstr *> ReleaseVGPRInsts;
444447
445- InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS ;
448+ HardwareLimits Limits ;
446449
447450public:
448451 SIInsertWaitcnts (MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -453,6 +456,30 @@ class SIInsertWaitcnts {
453456 (void )ForceVMCounter;
454457 }
455458
459+ unsigned getWaitCountMax (InstCounterType T) const {
460+ switch (T) {
461+ case LOAD_CNT:
462+ return Limits.LoadcntMax ;
463+ case DS_CNT:
464+ return Limits.DscntMax ;
465+ case EXP_CNT:
466+ return Limits.ExpcntMax ;
467+ case STORE_CNT:
468+ return Limits.StorecntMax ;
469+ case SAMPLE_CNT:
470+ return Limits.SamplecntMax ;
471+ case BVH_CNT:
472+ return Limits.BvhcntMax ;
473+ case KM_CNT:
474+ return Limits.KmcntMax ;
475+ case X_CNT:
476+ return Limits.XcntMax ;
477+ default :
478+ break ;
479+ }
480+ return 0 ;
481+ }
482+
456483 bool shouldFlushVmCnt (MachineLoop *ML, const WaitcntBrackets &Brackets);
457484 bool isPreheaderToFlush (MachineBasicBlock &MBB,
458485 const WaitcntBrackets &ScoreBrackets);
@@ -568,39 +595,10 @@ class SIInsertWaitcnts {
568595// "s_waitcnt 0" before use.
569596class WaitcntBrackets {
570597public:
571- WaitcntBrackets (const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
572- HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
573- InstCounterType SmemAccessCounter)
574- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
575- WaitEventMaskForInst (WaitEventMaskForInst),
576- SmemAccessCounter(SmemAccessCounter) {}
577-
578- unsigned getWaitCountMax (InstCounterType T) const {
579- switch (T) {
580- case LOAD_CNT:
581- return Limits.LoadcntMax ;
582- case DS_CNT:
583- return Limits.DscntMax ;
584- case EXP_CNT:
585- return Limits.ExpcntMax ;
586- case STORE_CNT:
587- return Limits.StorecntMax ;
588- case SAMPLE_CNT:
589- return Limits.SamplecntMax ;
590- case BVH_CNT:
591- return Limits.BvhcntMax ;
592- case KM_CNT:
593- return Limits.KmcntMax ;
594- case X_CNT:
595- return Limits.XcntMax ;
596- default :
597- break ;
598- }
599- return 0 ;
600- }
598+ WaitcntBrackets (const SIInsertWaitcnts *Context) : Context(Context) {}
601599
602600 bool isSmemCounter (InstCounterType T) const {
603- return T == SmemAccessCounter || T == X_CNT;
601+ return T == Context-> SmemAccessCounter || T == X_CNT;
604602 }
605603
606604 unsigned getSgprScoresIdx (InstCounterType T) const {
@@ -658,7 +656,7 @@ class WaitcntBrackets {
658656 return PendingEvents & (1 << E);
659657 }
660658 unsigned hasPendingEvent (InstCounterType T) const {
661- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
659+ unsigned HasPending = PendingEvents & Context-> WaitEventMaskForInst [T];
662660 assert ((HasPending != 0 ) == (getScoreRange (T) != 0 ));
663661 return HasPending;
664662 }
@@ -686,7 +684,8 @@ class WaitcntBrackets {
686684 }
687685
688686 unsigned getPendingGDSWait () const {
689- return std::min (getScoreUB (DS_CNT) - LastGDS, getWaitCountMax (DS_CNT) - 1 );
687+ return std::min (getScoreUB (DS_CNT) - LastGDS,
688+ Context->getWaitCountMax (DS_CNT) - 1 );
690689 }
691690
692691 void setPendingGDS () { LastGDS = ScoreUBs[DS_CNT]; }
@@ -710,8 +709,9 @@ class WaitcntBrackets {
710709 }
711710
712711 void setStateOnFunctionEntryOrReturn () {
713- setScoreUB (STORE_CNT, getScoreUB (STORE_CNT) + getWaitCountMax (STORE_CNT));
714- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
712+ setScoreUB (STORE_CNT,
713+ getScoreUB (STORE_CNT) + Context->getWaitCountMax (STORE_CNT));
714+ PendingEvents |= Context->WaitEventMaskForInst [STORE_CNT];
715715 }
716716
717717 ArrayRef<const MachineInstr *> getLDSDMAStores () const {
@@ -747,8 +747,8 @@ class WaitcntBrackets {
747747 if (T != EXP_CNT)
748748 return ;
749749
750- if (getScoreRange (EXP_CNT) > getWaitCountMax (EXP_CNT))
751- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax (EXP_CNT);
750+ if (getScoreRange (EXP_CNT) > Context-> getWaitCountMax (EXP_CNT))
751+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context-> getWaitCountMax (EXP_CNT);
752752 }
753753
754754 void setRegScore (int GprNo, InstCounterType T, unsigned Val) {
@@ -763,11 +763,8 @@ class WaitcntBrackets {
763763 const MachineOperand &Op, InstCounterType CntTy,
764764 unsigned Val);
765765
766- const GCNSubtarget *ST = nullptr ;
767- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
768- HardwareLimits Limits = {};
769- const unsigned *WaitEventMaskForInst;
770- InstCounterType SmemAccessCounter;
766+ const SIInsertWaitcnts *Context;
767+
771768 unsigned ScoreLBs[NUM_INST_CNTS] = {0 };
772769 unsigned ScoreUBs[NUM_INST_CNTS] = {0 };
773770 unsigned PendingEvents = 0 ;
@@ -829,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
829826
830827 RegInterval Result;
831828
832- MCRegister MCReg = AMDGPU::getMCReg (Op.getReg (), *ST);
829+ MCRegister MCReg = AMDGPU::getMCReg (Op.getReg (), *Context-> ST );
833830 unsigned RegIdx = TRI->getHWRegIndex (MCReg);
834831 assert (isUInt<8 >(RegIdx));
835832
@@ -887,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
887884// this at compile time, so we have to assume it might be applied if the
888885// instruction supports it).
889886bool WaitcntBrackets::hasPointSampleAccel (const MachineInstr &MI) const {
890- if (!ST->hasPointSampleAccel () || !SIInstrInfo::isMIMG (MI))
887+ if (!Context-> ST ->hasPointSampleAccel () || !SIInstrInfo::isMIMG (MI))
891888 return false ;
892889
893890 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo (MI.getOpcode ());
@@ -913,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
913910 const SIRegisterInfo *TRI,
914911 const MachineRegisterInfo *MRI,
915912 WaitEventType E, MachineInstr &Inst) {
916- InstCounterType T = eventCounter (WaitEventMaskForInst, E);
913+ InstCounterType T = eventCounter (Context-> WaitEventMaskForInst , E);
917914
918915 unsigned UB = getScoreUB (T);
919916 unsigned CurrScore = UB + 1 ;
@@ -1082,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
10821079}
10831080
10841081void WaitcntBrackets::print (raw_ostream &OS) const {
1082+ const GCNSubtarget *ST = Context->ST ;
1083+
10851084 OS << ' \n ' ;
1086- for (auto T : inst_counter_types (MaxCounter)) {
1085+ for (auto T : inst_counter_types (Context-> MaxCounter )) {
10871086 unsigned SR = getScoreRange (T);
10881087
10891088 switch (T) {
@@ -1197,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
11971196 // s_waitcnt instruction.
11981197 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
11991198 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat () &&
1200- !ST->hasFlatLgkmVMemCountInOrder ()) {
1199+ !Context-> ST ->hasFlatLgkmVMemCountInOrder ()) {
12011200 // If there is a pending FLAT operation, and this is a VMem or LGKM
12021201 // waitcnt and the target can report early completion, then we need
12031202 // to force a waitcnt 0.
@@ -1211,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
12111210 // If a counter has been maxed out avoid overflow by waiting for
12121211 // MAX(CounterType) - 1 instead.
12131212 unsigned NeededWait =
1214- std::min (UB - ScoreToWait, getWaitCountMax (T) - 1 );
1213+ std::min (UB - ScoreToWait, Context-> getWaitCountMax (T) - 1 );
12151214 addWait (Wait, T, NeededWait);
12161215 }
12171216 }
@@ -1239,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
12391238 setScoreLB (T, std::max (getScoreLB (T), UB - Count));
12401239 } else {
12411240 setScoreLB (T, UB);
1242- PendingEvents &= ~WaitEventMaskForInst[T];
1241+ PendingEvents &= ~Context-> WaitEventMaskForInst [T];
12431242 }
12441243}
12451244
@@ -1264,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
12641263// the decrement may go out of order.
12651264bool WaitcntBrackets::counterOutOfOrder (InstCounterType T) const {
12661265 // Scalar memory read always can go out of order.
1267- if ((T == SmemAccessCounter && hasPendingEvent (SMEM_ACCESS)) ||
1266+ if ((T == Context-> SmemAccessCounter && hasPendingEvent (SMEM_ACCESS)) ||
12681267 (T == X_CNT && hasPendingEvent (SMEM_GROUP)))
12691268 return true ;
12701269 return hasMixedPendingEvents (T);
@@ -2388,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
23882387 VgprUB = std::max (VgprUB, Other.VgprUB );
23892388 SgprUB = std::max (SgprUB, Other.SgprUB );
23902389
2391- for (auto T : inst_counter_types (MaxCounter)) {
2390+ for (auto T : inst_counter_types (Context-> MaxCounter )) {
23922391 // Merge event flags for this counter
2392+ const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst ;
23932393 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
23942394 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
23952395 if (OtherEvents & ~OldEvents)
@@ -2748,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
27482748 for (auto T : inst_counter_types ())
27492749 ForceEmitWaitcnt[T] = false ;
27502750
2751- const unsigned * WaitEventMaskForInst = WCG->getWaitEventMask ();
2751+ WaitEventMaskForInst = WCG->getWaitEventMask ();
27522752
27532753 SmemAccessCounter = eventCounter (WaitEventMaskForInst, SMEM_ACCESS);
27542754
2755- HardwareLimits Limits = {};
27562755 if (ST->hasExtendedWaitCounts ()) {
27572756 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask (IV);
27582757 Limits.DscntMax = AMDGPU::getDscntBitMask (IV);
@@ -2809,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
28092808 BuildMI (EntryBB, I, DebugLoc (), TII->get (AMDGPU::S_WAITCNT)).addImm (0 );
28102809 }
28112810
2812- auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2813- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2811+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this );
28142812 NonKernelInitialState->setStateOnFunctionEntryOrReturn ();
28152813 BlockInfos[&EntryBB].Incoming = std::move (NonKernelInitialState);
28162814
@@ -2841,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
28412839 *Brackets = *BI.Incoming ;
28422840 } else {
28432841 if (!Brackets) {
2844- Brackets = std::make_unique<WaitcntBrackets>(
2845- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2842+ Brackets = std::make_unique<WaitcntBrackets>(this );
28462843 } else {
28472844 // Reinitialize in-place. N.B. do not do this by assigning from a
28482845 // temporary because the WaitcntBrackets class is large and it could
28492846 // cause this function to use an unreasonable amount of stack space.
28502847 Brackets->~WaitcntBrackets ();
2851- new (Brackets.get ()) WaitcntBrackets (
2852- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2848+ new (Brackets.get ()) WaitcntBrackets (this );
28532849 }
28542850 }
28552851
0 commit comments