@@ -356,26 +356,25 @@ class LoopInterchangeLegality {
356356 SmallVector<PHINode *, 8 > InnerLoopInductions;
357357};
358358
359+ using CostMapTy = DenseMap<const Loop *, std::pair<unsigned , CacheCostTy>>;
360+
359361// / LoopInterchangeProfitability checks if it is profitable to interchange the
360362// / loop.
361363class LoopInterchangeProfitability {
362364public:
363365 LoopInterchangeProfitability (Loop *Outer, Loop *Inner, ScalarEvolution *SE,
364- OptimizationRemarkEmitter *ORE)
365- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
366+ OptimizationRemarkEmitter *ORE,
367+ const std::optional<CostMapTy> &CM)
368+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE), CostMap(CM) {}
366369
367370 // / Check if the loop interchange is profitable.
368371 bool isProfitable (const Loop *InnerLoop, const Loop *OuterLoop,
369372 unsigned InnerLoopId, unsigned OuterLoopId,
370- CharMatrix &DepMatrix,
371- const DenseMap<const Loop *, unsigned > &CostMap,
372- std::unique_ptr<CacheCost> &CC);
373+ CharMatrix &DepMatrix);
373374
374375private:
375376 int getInstrOrderCost ();
376- std::optional<bool > isProfitablePerLoopCacheAnalysis (
377- const DenseMap<const Loop *, unsigned > &CostMap,
378- std::unique_ptr<CacheCost> &CC);
377+ std::optional<bool > isProfitablePerLoopCacheAnalysis ();
379378 std::optional<bool > isProfitablePerInstrOrderCost ();
380379 std::optional<bool > isProfitableForVectorization (unsigned InnerLoopId,
381380 unsigned OuterLoopId,
@@ -388,6 +387,8 @@ class LoopInterchangeProfitability {
388387
389388 // / Interface to emit optimization remarks.
390389 OptimizationRemarkEmitter *ORE;
390+
391+ const std::optional<CostMapTy> &CostMap;
391392};
392393
393394// / LoopInterchangeTransform interchanges the loop.
@@ -497,11 +498,13 @@ struct LoopInterchange {
497498 // indicates the loop should be placed as the innermost loop.
498499 //
499500 // For the old pass manager CacheCost would be null.
500- DenseMap< const Loop *, unsigned > CostMap;
501+ std::optional<CostMapTy > CostMap = std:: nullopt ;
501502 if (CC != nullptr ) {
503+ CostMap = CostMapTy ();
502504 const auto &LoopCosts = CC->getLoopCosts ();
503505 for (unsigned i = 0 ; i < LoopCosts.size (); i++) {
504- CostMap[LoopCosts[i].first ] = i;
506+ const auto &Cost = LoopCosts[i];
507+ (*CostMap)[Cost.first ] = std::make_pair (i, Cost.second );
505508 }
506509 }
507510 // We try to achieve the globally optimal memory access for the loopnest,
@@ -537,7 +540,7 @@ struct LoopInterchange {
537540 bool processLoop (Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
538541 unsigned OuterLoopId,
539542 std::vector<std::vector<char >> &DependencyMatrix,
540- const DenseMap< const Loop *, unsigned > &CostMap) {
543+ const std::optional<CostMapTy > &CostMap) {
541544 LLVM_DEBUG (dbgs () << " Processing InnerLoopId = " << InnerLoopId
542545 << " and OuterLoopId = " << OuterLoopId << " \n " );
543546 LoopInterchangeLegality LIL (OuterLoop, InnerLoop, SE, ORE);
@@ -546,9 +549,9 @@ struct LoopInterchange {
546549 return false ;
547550 }
548551 LLVM_DEBUG (dbgs () << " Loops are legal to interchange\n " );
549- LoopInterchangeProfitability LIP (OuterLoop, InnerLoop, SE, ORE);
552+ LoopInterchangeProfitability LIP (OuterLoop, InnerLoop, SE, ORE, CostMap );
550553 if (!LIP.isProfitable (InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
551- DependencyMatrix, CostMap, CC )) {
554+ DependencyMatrix)) {
552555 LLVM_DEBUG (dbgs () << " Interchanging loops not profitable.\n " );
553556 return false ;
554557 }
@@ -1127,29 +1130,60 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
11271130}
11281131
11291132std::optional<bool >
1130- LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis (
1131- const DenseMap<const Loop *, unsigned > &CostMap,
1132- std::unique_ptr<CacheCost> &CC) {
1133+ LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis () {
11331134 // This is the new cost model returned from loop cache analysis.
11341135 // A smaller index means the loop should be placed an outer loop, and vice
11351136 // versa.
1136- if (CostMap.contains (InnerLoop) && CostMap.contains (OuterLoop)) {
1137- unsigned InnerIndex = 0 , OuterIndex = 0 ;
1138- InnerIndex = CostMap.find (InnerLoop)->second ;
1139- OuterIndex = CostMap.find (OuterLoop)->second ;
1140- LLVM_DEBUG (dbgs () << " InnerIndex = " << InnerIndex
1141- << " , OuterIndex = " << OuterIndex << " \n " );
1142- if (InnerIndex < OuterIndex)
1143- return std::optional<bool >(true );
1144- assert (InnerIndex != OuterIndex && " CostMap should assign unique "
1145- " numbers to each loop" );
1146- if (CC->getLoopCost (*OuterLoop) == CC->getLoopCost (*InnerLoop))
1147- return std::nullopt ;
1148- return std::optional<bool >(false );
1149- }
1150- return std::nullopt ;
1137+ if (!CostMap.has_value ())
1138+ return std::nullopt ;
1139+
1140+ auto InnerIte = CostMap->find (InnerLoop);
1141+ auto OuterIte = CostMap->find (OuterLoop);
1142+ if (InnerIte == CostMap->end () || OuterIte == CostMap->end ())
1143+ return std::nullopt ;
1144+
1145+ const auto &[InnerIndex, InnerCost] = InnerIte->second ;
1146+ const auto &[OuterIndex, OuterCost] = OuterIte->second ;
1147+ LLVM_DEBUG (dbgs () << " InnerIndex = " << InnerIndex
1148+ << " , OuterIndex = " << OuterIndex << " \n " );
1149+ assert (InnerIndex != OuterIndex && " CostMap should assign unique "
1150+ " numbers to each loop" );
1151+
1152+ if (InnerCost == OuterCost)
1153+ return std::nullopt ;
1154+
1155+ return InnerIndex < OuterIndex;
11511156}
11521157
1158+ // This function doesn't satisfy transitivity. Consider the following case.
1159+ //
1160+ // ```
1161+ // for (int k = 0; k < N; k++) {
1162+ // for (int j = 0; j < N; j++) {
1163+ // for (int i = 0; i < N; i++) {
1164+ // dst0[i][j][k] += aa[i][j] + bb[i][j] + cc[j][k];
1165+ // dst1[k][j][i] += dd[i][j] + ee[i][j] + ff[j][k];
1166+ // }
1167+ // }
1168+ // }
1169+ //
1170+ // ```
1171+ //
1172+ // The getInstrOrderCost will return the following value.
1173+ //
1174+ // Outer | Inner | Cost
1175+ // -------+-------+------
1176+ // k | j | -2
1177+ // j | i | -4
1178+ // k | i | 0
1179+ //
1180+ // This means that this function says interchanging (k, j) loops and (j, i)
1181+ // loops are profitable, but not (k, i). The root cause of this is that the
1182+ // getInstrOrderCost only see the loops we are checking. We can resolve this if
1183+ // we also consider the order going through other inductions. As for the above
1184+ // case, we can induce that interchanging `k` and `i` is profitable (it is
1185+ // better to move the `k` loop to inner position) by `bb[i][j]` and `cc[j][k]`.
1186+ // However, such accurate calculation is expensive, so that we don't do it.
11531187std::optional<bool >
11541188LoopInterchangeProfitability::isProfitablePerInstrOrderCost () {
11551189 // Legacy cost model: this is rough cost estimation algorithm. It counts the
@@ -1184,11 +1218,27 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
11841218 return std::optional<bool >(!DepMatrix.empty ());
11851219}
11861220
1187- bool LoopInterchangeProfitability::isProfitable (
1188- const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1189- unsigned OuterLoopId, CharMatrix &DepMatrix,
1190- const DenseMap<const Loop *, unsigned > &CostMap,
1191- std::unique_ptr<CacheCost> &CC) {
1221+ // The bubble-sort fashion algorithm is adopted to sort the loop nest, so the
1222+ // comparison function should ideally induce a strict weak ordering required by
1223+ // some standard C++ libraries. In particular, isProfitable should hold the
1224+ // following properties.
1225+ //
1226+ // Asymmetry: If isProfitable(a, b) is true then isProfitable(b, a) is false.
1227+ // Transitivity: If both isProfitable(a, b) and isProfitable(b, c) is true then
1228+ // isProfitable(a, c) is true.
1229+ //
1230+ // The most important thing is not to make unprofitable interchange. From this
1231+ // point of view, asymmetry is important. This is because if both
1232+ // isProfitable(a, b) and isProfitable(b, a) are true, then an unprofitable
1233+ // transformation (one of them) will be performed. On the other hand, a lack of
1234+ // transitivity might cause some optimization opportunities to be lost, but
1235+ // won't trigger an unprofitable one. Moreover, guaranteeing transitivity is
1236+ // expensive. Therefore, isProfitable only holds the asymmetry.
1237+ bool LoopInterchangeProfitability::isProfitable (const Loop *InnerLoop,
1238+ const Loop *OuterLoop,
1239+ unsigned InnerLoopId,
1240+ unsigned OuterLoopId,
1241+ CharMatrix &DepMatrix) {
11921242 // isProfitable() is structured to avoid endless loop interchange.
11931243 // If loop cache analysis could decide the profitability then,
11941244 // profitability check will stop and return the analysis result.
@@ -1197,15 +1247,14 @@ bool LoopInterchangeProfitability::isProfitable(
11971247 // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
11981248 // analysis the profitability then only, isProfitableForVectorization
11991249 // will decide.
1200- std::optional<bool > shouldInterchange =
1201- isProfitablePerLoopCacheAnalysis (CostMap, CC);
1202- if (!shouldInterchange.has_value ()) {
1203- shouldInterchange = isProfitablePerInstrOrderCost ();
1204- if (!shouldInterchange.has_value ())
1205- shouldInterchange =
1250+ std::optional<bool > ShouldInterchange = isProfitablePerLoopCacheAnalysis ();
1251+ if (!ShouldInterchange.has_value ()) {
1252+ ShouldInterchange = isProfitablePerInstrOrderCost ();
1253+ if (!ShouldInterchange.has_value ())
1254+ ShouldInterchange =
12061255 isProfitableForVectorization (InnerLoopId, OuterLoopId, DepMatrix);
12071256 }
1208- if (!shouldInterchange .has_value ()) {
1257+ if (!ShouldInterchange .has_value ()) {
12091258 ORE->emit ([&]() {
12101259 return OptimizationRemarkMissed (DEBUG_TYPE, " InterchangeNotProfitable" ,
12111260 InnerLoop->getStartLoc (),
@@ -1214,7 +1263,8 @@ bool LoopInterchangeProfitability::isProfitable(
12141263 " interchange." ;
12151264 });
12161265 return false ;
1217- } else if (!shouldInterchange.value ()) {
1266+ }
1267+ if (!ShouldInterchange.value ()) {
12181268 ORE->emit ([&]() {
12191269 return OptimizationRemarkMissed (DEBUG_TYPE, " InterchangeNotProfitable" ,
12201270 InnerLoop->getStartLoc (),
0 commit comments