@@ -265,6 +265,82 @@ struct SUnitWithMemInfo {
265265 bool getUnderlyingObjects ();
266266};
267267
268+ // / Add loop-carried chain dependencies. This class handles the same type of
269+ // / dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
270+ // / account dependencies across iterations.
271+ class LoopCarriedOrderDepsTracker {
272+ // Type of instruction that is relevant to order-dependencies
273+ enum class InstrTag {
274+ Barrier = 0 , // /< A barrier event instruction.
275+ LoadOrStore = 1 , // /< An instruction that may load or store memory, but is
276+ // /< not a barrier event.
277+ FPExceptions = 2 , // /< An instruction that does not match above, but may
278+ // /< raise floatin-point exceptions.
279+ };
280+
281+ struct TaggedSUnit : PointerIntPair<SUnit *, 2 > {
282+ TaggedSUnit (SUnit *SU, InstrTag Tag)
283+ : PointerIntPair<SUnit *, 2 >(SU, unsigned (Tag)) {}
284+
285+ InstrTag getTag () const { return InstrTag (getInt ()); }
286+ };
287+
288+ // / Holds loads and stores with memory related information.
289+ struct LoadStoreChunk {
290+ SmallVector<SUnitWithMemInfo, 4 > Loads;
291+ SmallVector<SUnitWithMemInfo, 4 > Stores;
292+
293+ void append (SUnit *SU);
294+ };
295+
296+ SwingSchedulerDAG *DAG;
297+ BatchAAResults *BAA;
298+ std::vector<SUnit> &SUnits;
299+
300+ // / The size of SUnits, for convenience.
301+ const unsigned N;
302+
303+ // / Loop-carried Edges.
304+ std::vector<BitVector> LoopCarried;
305+
306+ // / Instructions related to chain dependencies. They are one of the
307+ // / following:
308+ // /
309+ // / 1. Barrier event.
310+ // / 2. Load, but neither a barrier event, invariant load, nor may load trap
311+ // / value.
312+ // / 3. Store, but not a barrier event.
313+ // / 4. None of them, but may raise floating-point exceptions.
314+ // /
315+ // / This is used when analyzing loop-carried dependencies that access global
316+ // / barrier instructions.
317+ std::vector<TaggedSUnit> TaggedSUnits;
318+
319+ const TargetInstrInfo *TII = nullptr ;
320+ const TargetRegisterInfo *TRI = nullptr ;
321+
322+ public:
323+ LoopCarriedOrderDepsTracker (SwingSchedulerDAG *SSD, BatchAAResults *BAA,
324+ const TargetInstrInfo *TII,
325+ const TargetRegisterInfo *TRI);
326+
327+ // / The main function to compute loop-carried order-dependencies.
328+ void computeDependencies ();
329+
330+ const BitVector &getLoopCarried (unsigned Idx) const {
331+ return LoopCarried[Idx];
332+ }
333+
334+ private:
335+ // / Tags to \p SU if the instruction may affect the order-dependencies.
336+ std::optional<InstrTag> getInstrTag (SUnit *SU) const ;
337+
338+ void addLoopCarriedDepenenciesForChunks (const LoadStoreChunk &From,
339+ const LoadStoreChunk &To);
340+
341+ void computeDependenciesAux ();
342+ };
343+
268344} // end anonymous namespace
269345
270346// / The "main" function for implementing Swing Modulo Scheduling.
@@ -592,13 +668,19 @@ void SwingSchedulerDAG::setMAX_II() {
592668// / scheduling part of the Swing Modulo Scheduling algorithm.
593669void SwingSchedulerDAG::schedule () {
594670 buildSchedGraph (AA);
595- addLoopCarriedDependences ();
671+ const LoopCarriedEdges LCE = addLoopCarriedDependences ();
596672 updatePhiDependences ();
597673 Topo.InitDAGTopologicalSorting ();
598674 changeDependences ();
599675 postProcessDAG ();
600676 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
601- LLVM_DEBUG (dump ());
677+ LLVM_DEBUG ({
678+ dump ();
679+ dbgs () << " ===== Loop Carried Edges Begin =====\n " ;
680+ for (SUnit &SU : SUnits)
681+ LCE.dump (&SU, TRI, &MRI);
682+ dbgs () << " ===== Loop Carried Edges End =====\n " ;
683+ });
602684
603685 NodeSetType NodeSets;
604686 findCircuits (NodeSets);
@@ -831,15 +913,6 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
831913 return false ;
832914}
833915
834- // / Return true if the instruction causes a chain between memory
835- // / references before and after it.
836- static bool isDependenceBarrier (MachineInstr &MI) {
837- return MI.isCall () || MI.mayRaiseFPException () ||
838- MI.hasUnmodeledSideEffects () ||
839- (MI.hasOrderedMemoryRef () &&
840- (!MI.mayLoad () || !MI.isDereferenceableInvariantLoad ()));
841- }
842-
843916SUnitWithMemInfo::SUnitWithMemInfo (SUnit *SU) : SU(SU) {
844917 if (!getUnderlyingObjects ())
845918 return ;
@@ -940,28 +1013,111 @@ static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
9401013 return false ;
9411014}
9421015
1016+ void LoopCarriedOrderDepsTracker::LoadStoreChunk::append (SUnit *SU) {
1017+ const MachineInstr *MI = SU->getInstr ();
1018+ if (!MI->mayLoadOrStore ())
1019+ return ;
1020+ (MI->mayStore () ? Stores : Loads).emplace_back (SU);
1021+ }
1022+
1023+ LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker (
1024+ SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1025+ const TargetRegisterInfo *TRI)
1026+ : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1027+ LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1028+
1029+ void LoopCarriedOrderDepsTracker::computeDependencies () {
1030+ // Traverse all instructions and extract only what we are targetting.
1031+ for (auto &SU : SUnits) {
1032+ auto Tagged = getInstrTag (&SU);
1033+
1034+ // This instruction has no loop-carried order-dependencies.
1035+ if (!Tagged)
1036+ continue ;
1037+ TaggedSUnits.emplace_back (&SU, *Tagged);
1038+ }
1039+
1040+ computeDependenciesAux ();
1041+ }
1042+
1043+ std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1044+ LoopCarriedOrderDepsTracker::getInstrTag (SUnit *SU) const {
1045+ MachineInstr *MI = SU->getInstr ();
1046+ if (TII->isGlobalMemoryObject (MI))
1047+ return InstrTag::Barrier;
1048+
1049+ if (MI->mayStore () ||
1050+ (MI->mayLoad () && !MI->isDereferenceableInvariantLoad ()))
1051+ return InstrTag::LoadOrStore;
1052+
1053+ if (MI->mayRaiseFPException ())
1054+ return InstrTag::FPExceptions;
1055+
1056+ return std::nullopt ;
1057+ }
1058+
1059+ void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks (
1060+ const LoadStoreChunk &From, const LoadStoreChunk &To) {
1061+ // Add dependencies for load-to-store (WAR) from top to bottom.
1062+ for (const SUnitWithMemInfo &Src : From.Loads )
1063+ for (const SUnitWithMemInfo &Dst : To.Stores )
1064+ if (Src.SU ->NodeNum < Dst.SU ->NodeNum &&
1065+ hasLoopCarriedMemDep (Src, Dst, *BAA, TII, TRI))
1066+ LoopCarried[Src.SU ->NodeNum ].set (Dst.SU ->NodeNum );
1067+
1068+ // TODO: The following dependencies are missed.
1069+ //
1070+ // - Dependencies for load-to-store from bottom to top.
1071+ // - Dependencies for store-to-load (RAW).
1072+ // - Dependencies for store-to-store (WAW).
1073+ }
1074+
1075+ void LoopCarriedOrderDepsTracker::computeDependenciesAux () {
1076+ SmallVector<LoadStoreChunk, 2 > Chunks (1 );
1077+ for (const auto &TSU : TaggedSUnits) {
1078+ InstrTag Tag = TSU.getTag ();
1079+ SUnit *SU = TSU.getPointer ();
1080+ switch (Tag) {
1081+ case InstrTag::Barrier:
1082+ Chunks.emplace_back ();
1083+ break ;
1084+ case InstrTag::LoadOrStore:
1085+ Chunks.back ().append (SU);
1086+ break ;
1087+ case InstrTag::FPExceptions:
1088+ // TODO: Handle this properly.
1089+ break ;
1090+ }
1091+ }
1092+
1093+ // Add dependencies between memory operations. If there are one or more
1094+ // barrier events between two memory instructions, we don't add a
1095+ // loop-carried dependence for them.
1096+ for (const LoadStoreChunk &Chunk : Chunks)
1097+ addLoopCarriedDepenenciesForChunks (Chunk, Chunk);
1098+
1099+ // TODO: If there are multiple barrier instructions, dependencies from the
1100+ // last barrier instruction (or load/store below it) to the first barrier
1101+ // instruction (or load/store above it).
1102+ }
1103+
9431104// / Add a chain edge between a load and store if the store can be an
9441105// / alias of the load on a subsequent iteration, i.e., a loop carried
9451106// / dependence. This code is very similar to the code in ScheduleDAGInstrs
9461107// / but that code doesn't create loop carried dependences.
947- void SwingSchedulerDAG::addLoopCarriedDependences () {
948- SmallVector<SUnitWithMemInfo, 4 > PendingLoads;
949- for (auto &SU : SUnits) {
950- MachineInstr &MI = *SU.getInstr ();
951- if (isDependenceBarrier (MI))
952- PendingLoads.clear ();
953- else if (MI.mayLoad ()) {
954- PendingLoads.emplace_back (&SU);
955- } else if (MI.mayStore ()) {
956- SUnitWithMemInfo Store (&SU);
957- for (const SUnitWithMemInfo &Load : PendingLoads)
958- if (hasLoopCarriedMemDep (Load, Store, BAA, TII, TRI)) {
959- SDep Dep (Load.SU , SDep::Barrier);
960- Dep.setLatency (1 );
961- SU.addPred (Dep);
962- }
963- }
964- }
1108+ // / TODO: Also compute output-dependencies.
1109+ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences () {
1110+ LoopCarriedEdges LCE;
1111+
1112+ // Add loop-carried order-dependencies
1113+ LoopCarriedOrderDepsTracker LCODTracker (this , &BAA, TII, TRI);
1114+ LCODTracker.computeDependencies ();
1115+ for (unsigned I = 0 ; I != SUnits.size (); I++)
1116+ for (const int Succ : LCODTracker.getLoopCarried (I).set_bits ())
1117+ LCE.OrderDeps [&SUnits[I]].insert (&SUnits[Succ]);
1118+
1119+ LCE.modifySUnits (SUnits);
1120+ return LCE;
9651121}
9661122
9671123// / Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
@@ -4001,3 +4157,37 @@ const SwingSchedulerDDG::EdgesType &
40014157SwingSchedulerDDG::getOutEdges (const SUnit *SU) const {
40024158 return getEdges (SU).Succs ;
40034159}
4160+
4161+ void LoopCarriedEdges::modifySUnits (std::vector<SUnit> &SUnits) {
4162+ // Currently this function simply adds all dependencies represented by this
4163+ // object. After we properly handle missed dependencies, the logic here will
4164+ // be more complex, as currently missed edges should not be added to the DAG.
4165+ for (SUnit &SU : SUnits) {
4166+ SUnit *Src = &SU;
4167+ if (auto *OrderDep = getOrderDepOrNull (Src)) {
4168+ SDep Dep (Src, SDep::Barrier);
4169+ Dep.setLatency (1 );
4170+ for (SUnit *Dst : *OrderDep)
4171+ Dst->addPred (Dep);
4172+ }
4173+ }
4174+ }
4175+
4176+ void LoopCarriedEdges::dump (SUnit *SU, const TargetRegisterInfo *TRI,
4177+ const MachineRegisterInfo *MRI) const {
4178+ const auto *Order = getOrderDepOrNull (SU);
4179+
4180+ if (!Order)
4181+ return ;
4182+
4183+ const auto DumpSU = [](const SUnit *SU) {
4184+ std::ostringstream OSS;
4185+ OSS << " SU(" << SU->NodeNum << " )" ;
4186+ return OSS.str ();
4187+ };
4188+
4189+ dbgs () << " Loop carried edges from " << DumpSU (SU) << " \n "
4190+ << " Order\n " ;
4191+ for (SUnit *Dst : *Order)
4192+ dbgs () << " " << DumpSU (Dst) << " \n " ;
4193+ }
0 commit comments