@@ -444,21 +444,24 @@ struct SchedConfig {
444444 MASK_SKIP_CLUSTER = 1U << 3 ,
445445 MASK_SKIP_HOLD = 1U << 4 ,
446446 MASK_NOT_ITERATE = 1U << 5 ,
447+ MASK_TRY_SUBTREE_SCHEDULE = 1U << 6 ,
447448 };
448449 unsigned Dump : 1 ;
449450 unsigned UseLatency : 1 ;
450451 unsigned UseMinReg : 1 ;
451452 unsigned SkipClustering : 1 ; // default 0 i.e. try min-reg with clustering
452453 unsigned SkipHoldList : 1 ; // default 0 i.e. use hold list in latency-hiding
453454 unsigned DoNotIterate : 1 ; // default 0 i.e. iterative latency-scheduling
455+ unsigned TrySubtreeSchedule : 1 ; // prefer sub-tree schedule heuristic
454456
455457 explicit SchedConfig (unsigned Config)
456458 : Dump((Config & MASK_DUMP) != 0),
457459 UseLatency((Config & MASK_LATENCY) != 0),
458460 UseMinReg((Config & MASK_MIN_REG) != 0),
459461 SkipClustering((Config & MASK_SKIP_CLUSTER) != 0),
460462 SkipHoldList((Config & MASK_SKIP_HOLD) != 0),
461- DoNotIterate((Config & MASK_NOT_ITERATE) != 0) {}
463+ DoNotIterate((Config & MASK_NOT_ITERATE) != 0),
464+ TrySubtreeSchedule((Config & MASK_TRY_SUBTREE_SCHEDULE) != 0) {}
462465};
463466
464467#define SCHED_DUMP (X ) \
@@ -512,7 +515,8 @@ class BB_Scheduler {
512515 // UpperBoundGRF is the measured max reg-pressure of this kernel before scheduling
513516 bool scheduleBlockForLatency (unsigned &MaxPressure, bool ReassignID,
514517 unsigned UpperBoundGRF);
515- void SethiUllmanScheduling (bool DoClustering);
518+ void SethiUllmanScheduling (bool DoClustering,
519+ bool UseSubtreeHeuristic = false );
516520
517521private:
518522 void LatencyScheduling (unsigned GroupingThreshold);
@@ -823,9 +827,15 @@ class SethiUllmanQueue : public QueueBase {
823827 // the max time-stamp among node uses
824828 std::vector<unsigned > LiveTS;
825829
830+ std::vector<G4_INST *> &schedule;
831+
832+ bool UseSubtreeHeuristic = false ;
833+
826834public:
827- SethiUllmanQueue (preDDD &ddd, RegisterPressure &rp, SchedConfig config)
828- : QueueBase(ddd, rp, config) {
835+ SethiUllmanQueue (preDDD &ddd, RegisterPressure &rp, SchedConfig config,
836+ std::vector<G4_INST *> &s, bool SubtreeHeuristic)
837+ : QueueBase(ddd, rp, config), schedule(s) {
838+ UseSubtreeHeuristic = SubtreeHeuristic;
829839 init ();
830840 }
831841
@@ -839,7 +849,7 @@ class SethiUllmanQueue : public QueueBase {
839849
840850 bool empty () const { return Q.empty (); }
841851
842- friend void BB_Scheduler::SethiUllmanScheduling (bool );
852+ friend void BB_Scheduler::SethiUllmanScheduling (bool , bool );
843853
844854private:
845855 // Initialize Sethi-Ullman numbers.
@@ -973,6 +983,36 @@ bool SethiUllmanQueue::compare(preNode *N1, preNode *N2) {
973983 if (SU1 > SU2)
974984 return true ;
975985
986+ if (UseSubtreeHeuristic && !schedule.empty ()) {
987+ // Select next node from queue that's a predecessor of last scheduled node.
988+ // This can retire registers early, but it can worsen latency.
989+ //
990+ // If N1 is an immediate predecessor of the last scheduled node, select it.
991+ // Otherwise, if N2 is an immediate predecessor of that node, select it.
992+ G4_INST *LastScheduled = nullptr ;
993+ for (auto RI = schedule.rbegin (); RI != schedule.rend (); ++RI) {
994+ auto *Inst = (*RI);
995+ if (Inst->isPseudoKill ())
996+ continue ;
997+ LastScheduled = Inst;
998+ break ;
999+ }
1000+ if (LastScheduled) {
1001+ auto &SuccS1 = N1->Succs ;
1002+ auto &SuccS2 = N2->Succs ;
1003+ for (auto &EdgeS1 : SuccS1) {
1004+ auto *SuccS1 = EdgeS1.getNode ();
1005+ if (SuccS1->getInst () == LastScheduled)
1006+ return false ;
1007+ }
1008+ for (auto &EdgeS2 : SuccS2) {
1009+ auto *SuccS2 = EdgeS2.getNode ();
1010+ if (SuccS2->getInst () == LastScheduled)
1011+ return true ;
1012+ }
1013+ }
1014+ }
1015+
9761016 // Otherwise, break tie with their IDs. Smaller ID means higher priority.
9771017 return N1->getID () > N2->getID ();
9781018}
@@ -1153,8 +1193,21 @@ bool BB_Scheduler::scheduleBlockForPressure(unsigned &MaxPressure,
11531193 // try clustering first
11541194 SethiUllmanScheduling (true );
11551195 if (commitIfBeneficial (MaxPressure)) {
1196+ // If MaxPressure is still > 2x Threshold, attempt subtree scheduling
1197+ // heuristic. This costs compile time, so run it only if pressure is
1198+ // very high.
1199+ bool SubtreeHeuristicChosen = false ;
1200+ if (MaxPressure > (2 * Threshold) && config.TrySubtreeSchedule ) {
1201+ ddd.reset (false );
1202+ SethiUllmanScheduling (false , true );
1203+ if (commitIfBeneficial (MaxPressure)) {
1204+ SCHED_DUMP (std::cerr << " Chose subtree heuristic\n " );
1205+ SubtreeHeuristicChosen = true ;
1206+ }
1207+ }
1208+ if (!SubtreeHeuristicChosen)
1209+ kernel.fg .builder ->getJitInfo ()->statsVerbose .minRegClusterCount ++;
11561210 SCHED_DUMP (rp.dump (ddd.getBB (), " After scheduling for presssure, " ));
1157- kernel.fg .builder ->getJitInfo ()->statsVerbose .minRegClusterCount ++;
11581211 Changed = true ;
11591212 } else {
11601213 ddd.reset (false );
@@ -1164,9 +1217,25 @@ bool BB_Scheduler::scheduleBlockForPressure(unsigned &MaxPressure,
11641217 // try not-clustering
11651218 SethiUllmanScheduling (false );
11661219 if (commitIfBeneficial (MaxPressure)) {
1220+ if (MaxPressure > (2 * Threshold) && config.TrySubtreeSchedule ) {
1221+ SethiUllmanScheduling (false , true );
1222+ commitIfBeneficial (MaxPressure);
1223+ }
11671224 SCHED_DUMP (rp.dump (ddd.getBB (), " After scheduling for presssure, " ));
11681225 kernel.fg .builder ->getJitInfo ()->statsVerbose .minRegSUCount ++;
11691226 Changed = true ;
1227+ } else if (config.TrySubtreeSchedule ) {
1228+ ddd.reset (false );
1229+ }
1230+ }
1231+
1232+ if (!Changed) {
1233+ if (MaxPressure > (2 * Threshold) && config.TrySubtreeSchedule ) {
1234+ SethiUllmanScheduling (false , true );
1235+ if (commitIfBeneficial (MaxPressure)) {
1236+ SCHED_DUMP (rp.dump (ddd.getBB (), " After scheduling for presssure, " ));
1237+ Changed = true ;
1238+ }
11701239 }
11711240 }
11721241 }
@@ -1175,9 +1244,10 @@ bool BB_Scheduler::scheduleBlockForPressure(unsigned &MaxPressure,
11751244 return Changed;
11761245}
11771246
1178- void BB_Scheduler::SethiUllmanScheduling (bool DoClustering) {
1247+ void BB_Scheduler::SethiUllmanScheduling (bool DoClustering,
1248+ bool UseSubtreeHeuristic) {
11791249 schedule.clear ();
1180- SethiUllmanQueue Q (ddd, rp, config);
1250+ SethiUllmanQueue Q (ddd, rp, config, schedule, UseSubtreeHeuristic );
11811251 Q.push (ddd.getExitNode ());
11821252
11831253 while (!Q.empty ()) {
0 commit comments