@@ -338,6 +338,17 @@ class LoopCarriedOrderDepsTracker {
   void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
                                           const LoadStoreChunk &To);
 
+  /// Add a loop-carried order dependency between \p Src and \p Dst if we
+  /// cannot prove they are independent. When \p PerformCheapCheck is true, a
+  /// lightweight dependency test (referred to as "cheap check" below) is
+  /// performed first. The cheap check is retained only to preserve the
+  /// existing behavior and is not intended for new uses.
+  ///
+  /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
+  void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
+                                 const SUnitWithMemInfo &Dst,
+                                 bool PerformCheapCheck = false);
+
   void computeDependenciesAux();
 };
 
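For context, a minimal source-level sketch (not part of this patch; the function name, array, and loop shape are invented) of the kind of loop-carried memory dependence these order edges model:

// Illustration only: the store in iteration i writes the element that the
// load reads in iteration i + 1, so the two memory accesses must keep their
// relative order across iterations when the loop is software pipelined.
void carriedDep(int *A, int N) {
  for (int i = 0; i + 1 < N; ++i)
    A[i + 1] = A[i] + 1; // load A[i], then store A[i + 1]
}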
@@ -673,7 +684,7 @@ void SwingSchedulerDAG::schedule() {
   Topo.InitDAGTopologicalSorting();
   changeDependences();
   postProcessDAG();
-  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
+  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
   LLVM_DEBUG({
     dump();
     dbgs() << "===== Loop Carried Edges Begin =====\n";
@@ -958,36 +969,44 @@ bool SUnitWithMemInfo::getUnderlyingObjects() {
 
 /// Returns true if there is a loop-carried order dependency from \p Src to \p
 /// Dst.
-static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
-                                 const SUnitWithMemInfo &Dst,
-                                 BatchAAResults &BAA,
-                                 const TargetInstrInfo *TII,
-                                 const TargetRegisterInfo *TRI) {
+static bool
+hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
+                     BatchAAResults &BAA, const TargetInstrInfo *TII,
+                     const TargetRegisterInfo *TRI,
+                     const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
   if (Src.isTriviallyDisjoint(Dst))
     return false;
   if (isSuccOrder(Src.SU, Dst.SU))
     return false;
 
   MachineInstr &SrcMI = *Src.SU->getInstr();
   MachineInstr &DstMI = *Dst.SU->getInstr();
-  // First, perform the cheaper check that compares the base register.
-  // If they are the same and the load offset is less than the store
-  // offset, then mark the dependence as loop carried potentially.
-  const MachineOperand *BaseOp1, *BaseOp2;
-  int64_t Offset1, Offset2;
-  bool Offset1IsScalable, Offset2IsScalable;
-  if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
-                                   TRI) &&
-      TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
-                                   TRI)) {
-    if (BaseOp1->isIdenticalTo(*BaseOp2) &&
-        Offset1IsScalable == Offset2IsScalable && (int)Offset1 < (int)Offset2) {
-      assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
-             "What happened to the chain edge?");
-      return true;
+  if (PerformCheapCheck) {
+    // First, perform the cheaper check that compares the base register.
+    // If they are the same and the load offset is less than the store
+    // offset, then mark the dependence as potentially loop carried.
+    //
+    // TODO: This check will be removed.
+    const MachineOperand *BaseOp1, *BaseOp2;
+    int64_t Offset1, Offset2;
+    bool Offset1IsScalable, Offset2IsScalable;
+    if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
+                                     TRI) &&
+        TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
+                                     TRI)) {
+      if (BaseOp1->isIdenticalTo(*BaseOp2) &&
+          Offset1IsScalable == Offset2IsScalable &&
+          (int)Offset1 < (int)Offset2) {
+        assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
+               "What happened to the chain edge?");
+        return true;
+      }
     }
   }
 
+  if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
+    return false;
+
   // Second, the more expensive check that uses alias analysis on the
   // base registers. If they alias, and the load offset is less than
   // the store offset, then mark the dependence as loop carried.
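As a rough illustration of the retained cheap check, here is a standalone sketch using plain stand-in types rather than the real MachineOperand/TargetInstrInfo interfaces; it only shows the shape of the comparison above, not the actual API:

#include <cstdint>

// Hypothetical stand-in for the (base, offset) pair that a target's
// getMemOperandWithOffset hook reports for a memory access.
struct SimpleMemAccess {
  unsigned BaseReg;      // register holding the base address
  int64_t Offset;        // constant byte offset from the base
  bool OffsetIsScalable; // true for scalable (e.g. vector-length) offsets
};

// Conservatively flag a potential loop-carried dependence when both accesses
// share the same base, agree on scalability, and the source offset is
// strictly smaller than the destination offset.
static bool cheapCheckMayBeLoopCarried(const SimpleMemAccess &Src,
                                       const SimpleMemAccess &Dst) {
  return Src.BaseReg == Dst.BaseReg &&
         Src.OffsetIsScalable == Dst.OffsetIsScalable &&
         Src.Offset < Dst.Offset;
}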
@@ -1056,20 +1075,34 @@ LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
   return std::nullopt;
 }
 
+void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
+    const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
+    bool PerformCheapCheck) {
+  // Avoid self-dependencies.
+  if (Src.SU == Dst.SU)
+    return;
+
+  if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
+    LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
+}
+
 void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
     const LoadStoreChunk &From, const LoadStoreChunk &To) {
-  // Add dependencies for load-to-store (WAR) from top to bottom.
+  // Add load-to-store dependencies (WAR).
   for (const SUnitWithMemInfo &Src : From.Loads)
     for (const SUnitWithMemInfo &Dst : To.Stores)
-      if (Src.SU->NodeNum < Dst.SU->NodeNum &&
-          hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
-        LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
+      // Perform the cheap check first if this is a forward dependency.
+      addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);
 
-  // TODO: The following dependencies are missed.
-  //
-  // - Dependencies for load-to-store from bottom to top.
-  // - Dependencies for store-to-load (RAW).
-  // - Dependencies for store-to-store (WAW).
+  // Add store-to-load dependencies (RAW).
+  for (const SUnitWithMemInfo &Src : From.Stores)
+    for (const SUnitWithMemInfo &Dst : To.Loads)
+      addDependenciesBetweenSUs(Src, Dst);
+
+  // Add store-to-store dependencies (WAW).
+  for (const SUnitWithMemInfo &Src : From.Stores)
+    for (const SUnitWithMemInfo &Dst : To.Stores)
+      addDependenciesBetweenSUs(Src, Dst);
 }
 
 void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
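The three loop nests above cover exactly the load/store pairings that need loop-carried order edges. A small hedged sketch (the enum and helper name are invented for illustration) of the classification they implement:

// Which kind of memory ordering a (Src, Dst) pair corresponds to. Load-to-load
// pairs need no order edge, which is why the loops above skip them.
enum class MemDepKind { WAR, RAW, WAW, None };

static MemDepKind classifyMemDep(bool SrcIsStore, bool DstIsStore) {
  if (!SrcIsStore && DstIsStore)
    return MemDepKind::WAR; // load in Src, store in Dst
  if (SrcIsStore && !DstIsStore)
    return MemDepKind::RAW; // store in Src, load in Dst
  if (SrcIsStore && DstIsStore)
    return MemDepKind::WAW; // store in Src, store in Dst
  return MemDepKind::None;  // load/load: no ordering required
}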
@@ -1116,7 +1149,7 @@ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
     for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
       LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
 
-  LCE.modifySUnits(SUnits);
+  LCE.modifySUnits(SUnits, TII);
   return LCE;
 }
 
@@ -2676,6 +2709,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     });
   } while (++NI != NE && scheduleFound);
 
+  // If a schedule is found, validate it against the validation-only
+  // dependencies.
+  if (scheduleFound)
+    scheduleFound = DDG->isValidSchedule(Schedule);
+
   // If a schedule is found, ensure non-pipelined instructions are in stage 0
   if (scheduleFound)
     scheduleFound =
@@ -4118,6 +4156,8 @@ SwingSchedulerDDG::getEdges(const SUnit *SU) const {
 
 void SwingSchedulerDDG::addEdge(const SUnit *SU,
                                 const SwingSchedulerDDGEdge &Edge) {
+  assert(!Edge.isValidationOnly() &&
+         "Validation-only edges are not expected here.");
   auto &Edges = getEdges(SU);
   if (Edge.getSrc() == SU)
     Edges.Succs.push_back(Edge);
@@ -4127,25 +4167,43 @@ void SwingSchedulerDDG::addEdge(const SUnit *SU,
 
 void SwingSchedulerDDG::initEdges(SUnit *SU) {
   for (const auto &PI : SU->Preds) {
-    SwingSchedulerDDGEdge Edge(SU, PI, false);
+    SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
+                               /*IsValidationOnly=*/false);
     addEdge(SU, Edge);
   }
 
   for (const auto &SI : SU->Succs) {
-    SwingSchedulerDDGEdge Edge(SU, SI, true);
+    SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
+                               /*IsValidationOnly=*/false);
     addEdge(SU, Edge);
   }
 }
 
 SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
-                                     SUnit *ExitSU)
+                                     SUnit *ExitSU, const LoopCarriedEdges &LCE)
     : EntrySU(EntrySU), ExitSU(ExitSU) {
   EdgesVec.resize(SUnits.size());
 
+  // Add non-loop-carried edges based on the DAG.
   initEdges(EntrySU);
   initEdges(ExitSU);
   for (auto &SU : SUnits)
     initEdges(&SU);
+
+  // Add loop-carried edges, which are not represented in the DAG.
+  for (SUnit &SU : SUnits) {
+    SUnit *Src = &SU;
+    if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
+      SDep Base(Src, SDep::Barrier);
+      Base.setLatency(1);
+      for (SUnit *Dst : *OD) {
+        SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
+                                   /*IsValidationOnly=*/true);
+        Edge.setDistance(1);
+        ValidationOnlyEdges.push_back(Edge);
+      }
+    }
+  }
 }
 
 const SwingSchedulerDDG::EdgesType &
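A minimal sketch (hypothetical types, not the actual SwingSchedulerDDG members) of the data layout the constructor above sets up: ordinary DAG edges feed the scheduler, while loop-carried order edges are kept aside with distance 1 and are only consulted once a candidate schedule exists:

#include <vector>

struct SketchEdge {
  int Src, Dst;        // node numbers of the endpoints
  int Distance;        // iterations the dependence spans (1 for loop carried)
  int Latency;         // required cycle separation
  bool ValidationOnly; // true: never handed to the scheduler, checked post hoc
};

struct SketchDDG {
  std::vector<SketchEdge> PerNodeEdges;        // built from SUnit Preds/Succs
  std::vector<SketchEdge> ValidationOnlyEdges; // built from LoopCarriedEdges
};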
@@ -4158,17 +4216,73 @@ SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
   return getEdges(SU).Succs;
 }
 
-void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
-  // Currently this function simply adds all dependencies represented by this
-  // object. After we properly handle missed dependencies, the logic here will
-  // be more complex, as currently missed edges should not be added to the DAG.
+/// Check that \p Schedule does not violate the validation-only dependencies.
+bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
+  unsigned II = Schedule.getInitiationInterval();
+
+  auto ExpandCycle = [&](SUnit *SU) {
+    int Stage = Schedule.stageScheduled(SU);
+    int Cycle = Schedule.cycleScheduled(SU);
+    return Cycle + (Stage * II);
+  };
+
+  for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
+    SUnit *Src = Edge.getSrc();
+    SUnit *Dst = Edge.getDst();
+    if (!Src->isInstr() || !Dst->isInstr())
+      continue;
+    int CycleSrc = ExpandCycle(Src);
+    int CycleDst = ExpandCycle(Dst);
+    int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
+    if (CycleSrc > MaxLateStart) {
+      LLVM_DEBUG({
+        dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
+               << Dst->NodeNum << "\n";
+      });
+      return false;
+    }
+  }
+  return true;
+}
+
+void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
+                                    const TargetInstrInfo *TII) {
   for (SUnit &SU : SUnits) {
     SUnit *Src = &SU;
     if (auto *OrderDep = getOrderDepOrNull(Src)) {
       SDep Dep(Src, SDep::Barrier);
       Dep.setLatency(1);
-      for (SUnit *Dst : *OrderDep)
-        Dst->addPred(Dep);
+      for (SUnit *Dst : *OrderDep) {
+        SUnit *From = Src;
+        SUnit *To = Dst;
+        if (From->NodeNum > To->NodeNum)
+          std::swap(From, To);
+
+        // Add a forward edge if the following conditions are met:
+        //
+        // - The instruction of the source node (FromMI) may read memory.
+        // - The instruction of the target node (ToMI) may modify memory, but
+        //   does not read it.
+        // - Neither instruction is a global barrier.
+        // - The load appears before the store in the original basic block.
+        // - There are no barrier or store instructions between the two nodes.
+        // - The target node is unreachable from the source node in the
+        //   current DAG.
+        //
+        // TODO: These conditions are inherited from a previous implementation,
+        // and some may no longer be necessary. For now, we conservatively
+        // retain all of them to avoid regressions, but the logic could
+        // potentially be simplified.
+        MachineInstr *FromMI = From->getInstr();
+        MachineInstr *ToMI = To->getInstr();
+        if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
+            !TII->isGlobalMemoryObject(FromMI) &&
+            !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
+          SDep Pred = Dep;
+          Pred.setSUnit(Src);
+          Dst->addPred(Pred);
+        }
+      }
     }
   }
 }
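To make the isValidSchedule arithmetic above concrete, here is a small self-contained example with made-up numbers (the II, stage, and cycle values are invented for illustration, not taken from the patch):

#include <cassert>

// Absolute cycle of an instruction once stages are expanded.
static int expandCycle(int Cycle, int Stage, int II) {
  return Cycle + Stage * II;
}

// A validation-only edge is honored when the source does not start later than
// CycleDst + Distance * II - Latency.
static bool edgeHonored(int CycleSrc, int CycleDst, int Distance, int Latency,
                        int II) {
  return CycleSrc <= CycleDst + Distance * II - Latency;
}

int main() {
  const int II = 4;
  int CycleSrc = expandCycle(/*Cycle=*/1, /*Stage=*/2, II); // 1 + 2 * 4 = 9
  int CycleDst = expandCycle(/*Cycle=*/2, /*Stage=*/1, II); // 2 + 1 * 4 = 6
  // MaxLateStart = 6 + 1 * 4 - 1 = 9, and 9 <= 9, so this edge is satisfied.
  assert(edgeHonored(CycleSrc, CycleDst, /*Distance=*/1, /*Latency=*/1, II));
  return 0;
}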