-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LoopInterchange] Consider forward/backward dependency in vectorize heuristic #133672
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
b1c4248
8f4f814
cad4db9
42a19fb
6a0a868
4f5a8c0
ced443b
211be9e
c21efda
e324e62
e8ef29a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -17,8 +17,8 @@ | |||||
#include "llvm/ADT/SmallSet.h" | ||||||
#include "llvm/ADT/SmallVector.h" | ||||||
#include "llvm/ADT/Statistic.h" | ||||||
#include "llvm/ADT/StringMap.h" | ||||||
#include "llvm/ADT/StringRef.h" | ||||||
#include "llvm/ADT/StringSet.h" | ||||||
#include "llvm/Analysis/DependenceAnalysis.h" | ||||||
#include "llvm/Analysis/LoopCacheAnalysis.h" | ||||||
#include "llvm/Analysis/LoopInfo.h" | ||||||
|
@@ -70,6 +70,13 @@ namespace { | |||||
|
||||||
using LoopVector = SmallVector<Loop *, 8>; | ||||||
|
||||||
/// A list of direction vectors. Each entry represents a direction vector | ||||||
/// corresponding to one or more dependencies existing in the loop nest. The | ||||||
/// length of all direction vectors is equal and is N + 1, where N is the depth | ||||||
/// of the loop nest. The first N elements correspond to the dependency | ||||||
/// direction of each N loops. The last one indicates whether this entry is | ||||||
/// forward dependency ('<') or not ('*'). The term "forward" aligns with what | ||||||
/// is defined in LoopAccessAnalysis. | ||||||
// TODO: Check if we can use a sparse matrix here. | ||||||
using CharMatrix = std::vector<std::vector<char>>; | ||||||
|
||||||
|
@@ -126,11 +133,32 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) { | |||||
|
||||||
static void printDepMatrix(CharMatrix &DepMatrix) { | ||||||
for (auto &Row : DepMatrix) { | ||||||
for (auto D : Row) | ||||||
ArrayRef<char> RowRef(Row); | ||||||
|
||||||
// Drop the last element because it is a flag indicating whether this is | ||||||
// forward dependency or not, which doesn't affect the legality check. | ||||||
for (auto D : RowRef.drop_back()) | ||||||
LLVM_DEBUG(dbgs() << D << " "); | ||||||
LLVM_DEBUG(dbgs() << "\n"); | ||||||
} | ||||||
} | ||||||
|
||||||
static bool inThisOrder(const Instruction *Src, const Instruction *Dst) { | ||||||
Meinersbur marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
assert(Src->getParent() == Dst->getParent() && Src != Dst && | ||||||
"Expected Src and Dst to be different instructions in the same BB"); | ||||||
|
||||||
bool FoundSrc = false; | ||||||
for (const Instruction &I : *(Src->getParent())) { | ||||||
if (&I == Src) { | ||||||
FoundSrc = true; | ||||||
continue; | ||||||
} | ||||||
if (&I == Dst) | ||||||
return FoundSrc; | ||||||
} | ||||||
|
||||||
llvm_unreachable("Dst not found"); | ||||||
} | ||||||
#endif | ||||||
|
||||||
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, | ||||||
|
@@ -174,7 +202,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, | |||||
return false; | ||||||
} | ||||||
ValueVector::iterator I, IE, J, JE; | ||||||
StringSet<> Seen; | ||||||
|
||||||
// Manage direction vectors that are already seen. Map each direction vector | ||||||
// to an index of DepMatrix at which it is stored. | ||||||
StringMap<unsigned> Seen; | ||||||
|
||||||
for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { | ||||||
for (J = I, JE = MemInstr.end(); J != JE; ++J) { | ||||||
|
@@ -187,6 +218,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, | |||||
// Track Output, Flow, and Anti dependencies. | ||||||
if (auto D = DI->depends(Src, Dst)) { | ||||||
assert(D->isOrdered() && "Expected an output, flow or anti dep."); | ||||||
|
||||||
// If the direction vector is negative, normalize it to | ||||||
// make it non-negative. | ||||||
if (D->normalize(SE)) | ||||||
|
@@ -228,9 +260,47 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, | |||||
Dep.push_back('I'); | ||||||
} | ||||||
|
||||||
// Test whether the dependency is forward or not. | ||||||
bool IsKnownForward = true; | ||||||
if (Src->getParent() != Dst->getParent()) { | ||||||
// In general, when Src and Dst are in different BBs, the execution | ||||||
// order of them within a single iteration is not guaranteed. Treat | ||||||
// conservatively as not-forward dependency in this case. | ||||||
IsKnownForward = false; | ||||||
} else { | ||||||
// Src and Dst are in the same BB. If they are the different | ||||||
// instructions, Src should appear before Dst in the BB as they are | ||||||
// stored to MemInstr in that order. | ||||||
assert((Src == Dst || inThisOrder(Src, Dst)) && | ||||||
"Unexpected instructions"); | ||||||
|
||||||
// If the Dependence object is reversed (due to normalization), it | ||||||
// represents the dependency from Dst to Src, meaning it is a backward | ||||||
// dependency. Otherwise it should be a forward dependency. | ||||||
bool IsReversed = D->getSrc() != Src; | ||||||
if (IsReversed) | ||||||
IsKnownForward = false; | ||||||
} | ||||||
|
||||||
// Initialize the last element. | ||||||
Meinersbur marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
Dep.push_back('<'); | ||||||
|
||||||
// The last element should express the "summary" among one or more | ||||||
// direction vectors whose first N elements are the same (where N is | ||||||
// the depth of the loop nest). Hence we exclude the last element from | ||||||
// the Seen map. | ||||||
auto [Ite, Inserted] = Seen.try_emplace( | ||||||
StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size()); | ||||||
|
||||||
// Make sure we only add unique entries to the dependency matrix. | ||||||
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) | ||||||
if (Inserted) | ||||||
DepMatrix.push_back(Dep); | ||||||
|
||||||
// If we cannot prove that this dependency is forward, change the last | ||||||
// element of the corresponding entry. Note that the existing entry in | ||||||
Meinersbur marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
// DepMatrix can be modified. | ||||||
if (!IsKnownForward) | ||||||
DepMatrix[Ite->second].back() = '*'; | ||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -281,11 +351,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, | |||||
continue; | ||||||
|
||||||
// Check if the direction vector is lexicographically positive (or zero) | ||||||
// for both before/after exchanged. | ||||||
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) | ||||||
// for both before/after exchanged. Ignore the last element because it | ||||||
// doesn't affect the legality. | ||||||
Meinersbur marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) | ||||||
return false; | ||||||
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); | ||||||
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) | ||||||
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) | ||||||
return false; | ||||||
} | ||||||
return true; | ||||||
|
@@ -1334,21 +1405,34 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { | |||||
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { | ||||||
for (const auto &Dep : DepMatrix) { | ||||||
char Dir = Dep[LoopId]; | ||||||
if (Dir != 'I' && Dir != '=') | ||||||
return false; | ||||||
char DepType = Dep.back(); | ||||||
assert((DepType == '<' || DepType == '*') && | ||||||
"Unexpected element in dependency vector"); | ||||||
|
||||||
// There are no loop-carried dependencies. | ||||||
if (Dir == '=' || Dir == 'I') | ||||||
continue; | ||||||
|
||||||
// DepType being '<' means that this direction vector represents a forward | ||||||
// dependency. In principle, a loop with '<' direction can be vectorized in | ||||||
// this case. | ||||||
if (Dir == '<' && DepType == '<') | ||||||
continue; | ||||||
|
||||||
// We cannot prove that the loop is vectorizable. | ||||||
return false; | ||||||
} | ||||||
return true; | ||||||
} | ||||||
|
||||||
std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( | ||||||
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { | ||||||
// If the outer loop is not loop independent it is not profitable to move | ||||||
// this to inner position, since doing so would not enable inner loop | ||||||
// parallelism. | ||||||
// If the outer loop cannot be vectorized, it is not profitable to move this | ||||||
// to inner position. | ||||||
if (!canVectorize(DepMatrix, OuterLoopId)) | ||||||
return false; | ||||||
|
||||||
// If inner loop has dependence and outer loop is loop independent then it is | ||||||
// If inner loop cannot be vectorized and outer loop can be then it is | ||||||
|
// If inner loop cannot be vectorized and outer loop can be then it is | |
// If the inner loop cannot be vectorized but the outer loop can be then it is |
[grammar]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
By the way, this would be nitpicky as well, but do you think the original comment is accurate? What I'm trying to say is that even if canVectorize
were a perfectly accurate function (neither false-positive nor false-negative), I'm starting to think that interchanging the loops here is not necessarily profitable for enabling inner loop parallelism. For example, in the following code:
for (int i = 1; i < N; i++)
for (int j = 0; j < N; j++)
for (int k = 0; k < N; k++) {
// Assume f and g don't have side effects
use(A[i][j][f(k)]);
A[i + 1][j][g(k)] = ...;
}
For the k-loop, canVectorize
would return false
if f
and g
are sufficiently complex. However, in principle, parallelizing the k-loop still seems legal in the original one. Therefore, a more accurate comment might be something like "... can be profitable to interchange the loops to enable inner loop parallelism"? (Apparently, I wrote the original comment too, so either past me or present me is wrong...)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is "sufficiently complex"? If DA returns "confused" then canVectorize
has to return false. If it returns [< = *]
the dependency is carried by the outermost loop, it does not matter what the inner loop does.
I actually don't know/understand why canVectorize
does not look at the parent loop dependencies. Possibly because what the outer loops are changes with interchange. At least the loops that are surrounded by both, outer+inner could be considered.
The case you mention is interesting because it is a counterexample to the assumption that if canVectorize
is pessimistic (never says a loop can be vectorized even though LoopVectorize will not for some reason), it will not cause loop exchanges that would not happen if it was not pessimistic. Anyway, in this case the j-loop looks more likely to be vectorized profitably because f(k)
/g(k)
indices would require more complex memory accesses. LoopVectorize can better handle i
as a "strided access pattern".
I think the comment itself is correct: If the outer one could be vectorized (if moved to the inner position) but the current inner one cannot, swap the outer one to the vectorizable position. For "vectorizable" it just assumes the definition of canVectorize
. Generally, even if a loop is vectorizable in terms of dependencies, LoopVectorize may still consider it unprofitable to vectorize because of the instructions it contains, or the code may actually run slower after vectorization, so "profitable" was never in absolute terms and hopefully understood as such by the reader. "can" does not add new information here unless we would mention such concrete situations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is "sufficiently complex"? If DA returns "confused" then
canVectorize
has to return false. If it returns[< = *]
the dependency is carried by the outermost loop, it does not matter what the inner loop does.
I tried to say the latter one. Just as you mentioned, I was assuming a case where DA returns [< = *]
.
I hadn't really been conscious of it, but as you pointed out, this is a case where pessimistic heuristics lead to an interchange that wouldn't have happened if they hadn't been pessimistic (and in this specific case, moving the j-loop would be profitable for vectorization because the memory access pattern is simpler). I personally think that the interchange should not happen in this case, since we currently don't take the vectorization cost into account. Checking dependencies of the surrounding loops seems basically like a good idea, but I'm not confident whether that might lead to other unintended transformations. Using the same cost model as LoopVectorize seems like an ideal solution, but it feels challenging.
For "vectorizable" it just assumes the definition of
canVectorize
.
As for the comment here, this explanation made the most sense to me. Thanks for clarifying!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I personally think that the interchange should not happen in this case, since we currently don't take the vectorization cost into account.
I agree, but there are limits on what we can do. In the end, it is just a heuristic.
Checking dependencies of the surrounding loops seems basically like a good idea, but I'm not confident whether that might lead to other unintended transformations. Using the same cost model as LoopVectorize seems like an ideal solution, but it feels challenging.
This is a common problem that LoopDistribute also has: It is intended to enable vectorization of one or more distributed loops, but does not know whether they actually are vectorized. In other words, it has no cost model. Because of this, it does not do anything unless explicitly told to do so.
Using the profitability heuristic from LoopVectorize itself, even if it was easy, might also not be what we want: Its computational cost is immense (building an entire new IR representation called VPlan), which we would not want to pay speculatively on all loops without actually vectorizing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unless you have a universal cost model that takes everything into account and predicts the execution time, each pass needs its own heuristic for what it is optimizing for. E.g. the vectorizer optimizes cycles but does not consider cache effects.
When you put it that way, it hardly seems feasible (well, if it were feasible, it would probably have been done already).
No typo; the patch tries to teach DependenceAnalysis to determine dependencies after loop fusion has taken place without applying loop fusion. Now also do that for interchange, distribution, vectorization, ....
After reading this comment, I noticed that the patch introduces additional analysis for loop fusion even though the client doesn't require it. I initially expected an argument to be added (such as depends(Src, Dst, /*ForFusion=*/true)
), but that doesn't seem to be the case. Though, controlling the analysis behavior via flags could complicate caching and reusing results across different passes.
By the way, I've recently been reading DependenceAnalysis.cpp, and noticed that it's already quite complex and potentially buggy. I'm fairly certain it should be refactored before adding any new features.
UnrollAndJam is disabled by default. Its heuristic also does not take vectorization into account, but tries to maximize L1i cache usage.
Optimal outcome would be if the vectorizer supported outer-loop vectorization.
I don't know much about the details of the UnrollAndJam pass, but it appears to work (unintentionally?) as if outer-loop vectorization is applied in some cases, especially when combined with the SLPVectorizer (of course, I needed to specify the unroll count explicitly by pragma). So, I just thought that it might make more sense to enhance UnrollAndJam instead of interchange, for cases where outer-loop is vectorizable but inner-loop is not. And, as you said, it would be the best solution to support outer-loop vectorization in the vectorizer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After reading this comment, I noticed that the patch introduces additional analysis for loop fusion even though the client doesn't require it. I initially expected an argument to be added (such as
depends(Src, Dst, /*ForFusion=*/true)
), but that doesn't seem to be the case. Though, controlling the analysis behavior via flags could complicate caching and reusing results across different passes.
Whether it is for fusion is not yet decided when calling depends, but FullDependence
stores the analysis for both.
By the way, I've recently been reading DependenceAnalysis.cpp, and noticed that it's already quite complex and potentially buggy. I'm fairly certain it should be refactored before adding any new features.
The principle is straightforward; when processing one of the two fused loops, process them as the same. Since an expression can only be in one of the loops, no ambiguity arises. Only when processing the relationship between two statements you need to decide whether you want to treat them as the same or sequential loops.
I am not sure refactoring helps. Big part of why it is difficult to understand is the math. The Pair
also makes it look complex, but it is just matching the access subscript dimensions after delinearization. But I am also not very happy about adding special cases to an already complex analysis. If you do loop fusion, you may want to support more cases than loops that have exactly the same trip count.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whether it is for fusion is not yet decided when calling depends, but
FullDependence
stores the analysis for both.
IIUC, FullDependence
objects are not cached anywhere. DependenceInfo
is nearly stateless. Furthermore, DependenceInfo::depends
returns a unique_ptr
, hence we cannot cache the result as it is. That is, I think we know whether the caller is fusion or not when calling DependenceInfo::depends
.
I am not sure refactoring helps. Big part of why it is difficult to understand is the math. The
Pair
also makes it look complex, but it is just matching the access subscript dimensions after delinearization. But I am also not very happy about adding special cases to an already complex analysis. If you do loop fusion, you may want to support more cases than loops that have exactly the same trip count.
I agree that we can't do much about the mathematical complexity, but I believe the code could be made simpler. It looks to me like there's a fair amount of code duplication, especially when the same processes are executed for SrcXXX
and DstXXX
(e.g., here). I'm not sure whether this duplication makes the code harder to understand, but I do think it hurts maintainability. I don't believe "Don't Repeat Yourself" is always the right principle, but in this case, I think there are parts of the logic where it does apply.
However, I think the most significant problem is that we don't take wrapping into account. The approach in #116632 seems incorrect to me. We probably need to be more pessimistic with respect to wrapping. I think it makes sense to insert checks for wrap flags where necessary, which would complicate the code. I'm not sure if #146383 applies in that case, but generally speaking, adding a new feature could increase the number of factors we need to consider.
In fact, there's a case where DependenceAnalysis misses a dependency, probably due to ignoring wraps, as shown below (godbolt: https://godbolt.org/z/hsxWve8s6).
; for (i = 0; i < 4; i++)
; a[i & 1][i & 1] = 0;
define void @f(ptr %a) {
entry:
br label %loop
loop:
%i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
%and = and i64 %i, 1
%idx = getelementptr [4 x [4 x i8]], ptr %a, i64 0, i64 %and, i64 %and
store i8 0, ptr %idx
%i.next = add i64 %i, 1
%exitcond.not = icmp slt i64 %i.next, 8
br i1 %exitcond.not, label %loop, label %exit
exit:
ret void
}
Printing analysis 'Dependence Analysis' for function 'f':
Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
da analyze - none!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you create an issue # for that case? (Or I can do so.) It doesn't look nsw/nuw related though; the subscripts are well within i64 range.
I remember having had issues with #116632 but apparently I have been convinced otherwise.
Would be looking forward to cleanup PRs on DA.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, sorry, the issue already exists: #148435 (comment)
I think this is a kind of wrapping problem. IIRC, the %and
is represented as {false,+,true}<%loop>
, which would wrap. But DA casts it to i64 and ultimately overlooks the wrapping.
(While I'm at it, I'll share the other issues I found: #149977, #149501, #149991).
Would be looking forward to cleanup PRs on DA.
👍
Uh oh!
There was an error while loading. Please reload this page.