diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 70e9eee5339a7..08446ccaa9fca 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -17,8 +17,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -70,6 +70,13 @@ namespace { using LoopVector = SmallVector; +/// A list of direction vectors. Each entry represents a direction vector +/// corresponding to one or more dependencies existing in the loop nest. The +/// length of all direction vectors is equal and is N + 1, where N is the depth +/// of the loop nest. The first N elements correspond to the dependency +/// direction of each N loops. The last one indicates whether this entry is +/// forward dependency ('<') or not ('*'). The term "forward" aligns with what +/// is defined in LoopAccessAnalysis. // TODO: Check if we can use a sparse matrix here. using CharMatrix = std::vector>; @@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef Rules) { static void printDepMatrix(CharMatrix &DepMatrix) { for (auto &Row : DepMatrix) { - for (auto D : Row) + // Drop the last element because it is a flag indicating whether this is + // forward dependency or not, which doesn't affect the legality check. + for (char D : drop_end(Row)) LLVM_DEBUG(dbgs() << D << " "); LLVM_DEBUG(dbgs() << "\n"); } } + +/// Return true if \p Src appears before \p Dst in the same basic block. +/// Precondition: \p Src and \Dst are distinct instructions within the same +/// basic block. +static bool inThisOrder(const Instruction *Src, const Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && Src != Dst && + "Expected Src and Dst to be different instructions in the same BB"); + + bool FoundSrc = false; + for (const Instruction &I : *(Src->getParent())) { + if (&I == Src) { + FoundSrc = true; + continue; + } + if (&I == Dst) + return FoundSrc; + } + + llvm_unreachable("Dst not found"); +} #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, @@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; } ValueVector::iterator I, IE, J, JE; - StringSet<> Seen; + + // Manage direction vectors that are already seen. Map each direction vector + // to an index of DepMatrix at which it is stored. + StringMap Seen; for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { for (J = I, JE = MemInstr.end(); J != JE; ++J) { @@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Dep.push_back('I'); } + // Test whether the dependency is forward or not. + bool IsKnownForward = true; + if (Src->getParent() != Dst->getParent()) { + // In general, when Src and Dst are in different BBs, the execution + // order of them within a single iteration is not guaranteed. Treat + // conservatively as not-forward dependency in this case. + IsKnownForward = false; + } else { + // Src and Dst are in the same BB. If they are the different + // instructions, Src should appear before Dst in the BB as they are + // stored to MemInstr in that order. + assert((Src == Dst || inThisOrder(Src, Dst)) && + "Unexpected instructions"); + + // If the Dependence object is reversed (due to normalization), it + // represents the dependency from Dst to Src, meaning it is a backward + // dependency. Otherwise it should be a forward dependency. + bool IsReversed = D->getSrc() != Src; + if (IsReversed) + IsKnownForward = false; + } + + // Initialize the last element. Assume forward dependencies only; it + // will be updated later if there is any non-forward dependency. + Dep.push_back('<'); + + // The last element should express the "summary" among one or more + // direction vectors whose first N elements are the same (where N is + // the depth of the loop nest). Hence we exclude the last element from + // the Seen map. + auto [Ite, Inserted] = Seen.try_emplace( + StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size()); + // Make sure we only add unique entries to the dependency matrix. - if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) + if (Inserted) DepMatrix.push_back(Dep); + + // If we cannot prove that this dependency is forward, change the last + // element of the corresponding entry. Since a `[... *]` dependency + // includes a `[... <]` dependency, we do not need to keep both and + // change the existing entry instead. + if (!IsKnownForward) + DepMatrix[Ite->second].back() = '*'; } } } @@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, continue; // Check if the direction vector is lexicographically positive (or zero) - // for both before/after exchanged. - if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) + // for both before/after exchanged. Ignore the last element because it + // doesn't affect the legality. + if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) return false; std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); - if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) + if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) return false; } return true; @@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { for (const auto &Dep : DepMatrix) { char Dir = Dep[LoopId]; - if (Dir != 'I' && Dir != '=') - return false; + char DepType = Dep.back(); + assert((DepType == '<' || DepType == '*') && + "Unexpected element in dependency vector"); + + // There are no loop-carried dependencies. + if (Dir == '=' || Dir == 'I') + continue; + + // DepType being '<' means that this direction vector represents a forward + // dependency. In principle, a loop with '<' direction can be vectorized in + // this case. + if (Dir == '<' && DepType == '<') + continue; + + // We cannot prove that the loop is vectorizable. + return false; } return true; } std::optional LoopInterchangeProfitability::isProfitableForVectorization( unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // If the outer loop is not loop independent it is not profitable to move - // this to inner position, since doing so would not enable inner loop - // parallelism. + // If the outer loop cannot be vectorized, it is not profitable to move this + // to inner position. if (!canVectorize(DepMatrix, OuterLoopId)) return false; - // If inner loop has dependence and outer loop is loop independent then it is - // profitable to interchange to enable inner loop parallelism. + // If the inner loop cannot be vectorized but the outer loop can be, then it + // is profitable to interchange to enable inner loop parallelism. if (!canVectorize(DepMatrix, InnerLoopId)) return true; diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll index 9c113d4570e4d..4194849784054 100644 --- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll +++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll @@ -5,6 +5,8 @@ @A = dso_local global [256 x [256 x float]] zeroinitializer @B = dso_local global [256 x [256 x float]] zeroinitializer @C = dso_local global [256 x [256 x float]] zeroinitializer +@D = global [256 x [256 x [256 x float]]] zeroinitializer +@E = global [256 x [256 x [256 x float]]] zeroinitializer ; Check that the below loops are exchanged for vectorization. ; @@ -64,15 +66,13 @@ exit: ; for (int j = 1; j < 256; j++) ; A[i][j-1] = A[i][j] + B[i][j]; ; -; FIXME: These loops are exchanged at this time due to the problem in -; profitability heuristic calculation for vectorization. -; CHECK: --- !Passed +; CHECK: --- !Missed ; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange. define void @interchange_unnecesasry_for_vectorization() { entry: br label %for.i.header @@ -103,3 +103,134 @@ for.i.inc: exit: ret void } + +; Check that the below loops are exchanged to allow innermost loop +; vectorization. We cannot vectorize the j-loop because it has a lexically +; backward dependency, but the i-loop can be vectorized because all the +; loop-carried dependencies are lexically forward. LoopVectorize currently only +; vectorizes innermost loop, hence move the i-loop to that position. +; +; for (int i = 0; i < 255; i++) { +; for (int j = 1; j < 256; j++) { +; A[i][j] = A[i][j-1] + B[i][j]; +; C[i][j] += C[i+1][j]; +; } +; } +; + +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: interchange_necessary_for_vectorization2 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +define void @interchange_necessary_for_vectorization2() { +entry: + br label %for.i.header + +for.i.header: + %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ] + %i.inc = add i64 %i, 1 + br label %for.j.body + +for.j.body: + %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ] + %j.dec = add i64 %j, -1 + %a.load.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j.dec + %b.index = getelementptr [256 x [256 x float]], ptr @B, i64 0, i64 %i, i64 %j + %c.load.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i.inc, i64 %j + %c.store.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j + %a = load float, ptr %a.load.index + %b = load float, ptr %b.index + %c0 = load float, ptr %c.load.index + %c1 = load float, ptr %c.store.index + %add.0 = fadd float %a, %b + %a.store.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j + store float %add.0, ptr %a.store.index + %add.1 = fadd float %c0, %c1 + store float %add.1, ptr %c.store.index + %j.next = add i64 %j, 1 + %cmp.j = icmp eq i64 %j.next, 256 + br i1 %cmp.j, label %for.i.inc, label %for.j.body + +for.i.inc: + %i.next = add i64 %i, 1 + %cmp.i = icmp eq i64 %i.next, 255 + br i1 %cmp.i, label %exit, label %for.i.header + +exit: + ret void +} + +; Check that no interchange is performed for the following loop. Interchanging +; the j-loop and k-loop makes the innermost loop vectorizble, since the j-loop +; has only forward dependencies. However, at the moment, a loop body consisting +; of multiple BBs is handled pesimistically. Hence the j-loop isn't moved to +; the innermost place. +; +; for (int i = 0; i < 255; i++) { +; for (int j = 0; j < 255; j++) { +; for (int k = 0; k < 128; k++) { +; E[i][j][k] = D[i+1][j+1][2*k]; +; if (cond) +; D[i][j][k+1] = 1.0; +; } +; } + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: InterchangeNotProfitable +; CHECK-NEXT: Function: multiple_BBs_in_loop +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization. +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: InterchangeNotProfitable +; CHECK-NEXT: Function: multiple_BBs_in_loop +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization. +define void @multiple_BBs_in_loop() { +entry: + br label %for.i.header + +for.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ] + %i.inc = add i64 %i, 1 + br label %for.j.header + +for.j.header: + %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ] + %j.inc = add i64 %j, 1 + br label %for.k.body + +for.k.body: + %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ] + %k.inc = add i64 %k, 1 + %k.2 = mul i64 %k, 2 + %d.index = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i.inc, i64 %j.inc, i64 %k.2 + %e.index = getelementptr [256 x [256 x [256 x float]]], ptr @E, i64 0, i64 %i, i64 %j, i64 %k + %d.load = load float, ptr %d.index + store float %d.load, ptr %e.index + %cond = freeze i1 undef + br i1 %cond, label %if.then, label %for.k.inc + +if.then: + %d.index2 = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i, i64 %j, i64 %k.inc + store float 1.0, ptr %d.index2 + br label %for.k.inc + +for.k.inc: + %cmp.k = icmp eq i64 %k.inc, 128 + br i1 %cmp.k, label %for.j.inc, label %for.k.body + +for.j.inc: + %cmp.j = icmp eq i64 %j.inc, 255 + br i1 %cmp.j, label %for.i.inc, label %for.j.header + +for.i.inc: + %cmp.i = icmp eq i64 %i.inc, 255 + br i1 %cmp.i, label %exit, label %for.i.header + +exit: + ret void +}