Skip to content
Merged
85 changes: 71 additions & 14 deletions llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
Expand Down Expand Up @@ -119,7 +119,11 @@ static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {

static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
for (auto D : Row)
ArrayRef<char> RowRef(Row);

// Drop the last element because it is a flag indicating whether the row is
// "lexically forward", which doesn't affect the legality check.
for (auto D : RowRef.drop_back())
LLVM_DEBUG(dbgs() << D << " ");
LLVM_DEBUG(dbgs() << "\n");
}
Expand Down Expand Up @@ -167,7 +171,20 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
return false;
}
ValueVector::iterator I, IE, J, JE;
StringSet<> Seen;

// Manage direction vectors that are already seen. Map each direction vector
// to an index of DepMatrix at which it is stored.
StringMap<unsigned> Seen;

// The i-th element is set iff all dependencies corresponding to the i-th
// direction vector in DepMatrix are "lexically forward". The notion
// "lexically forward" aligns with what is defined in LAA
// (LoopAccessAnalysis).
//
// We deem a dependence lexically forward if we can prove that the
// destination instruction is always executed after the source instruction
// within each iteration.
BitVector IsForwardFlags;

for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
Expand All @@ -180,10 +197,22 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
// Track Output, Flow, and Anti dependencies.
if (auto D = DI->depends(Src, Dst)) {
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
bool IsForward = true;

// If Src and Dst are in the same BB, Src is always executed before Dst
// in the same loop iteration. If not, we must check whether one BB
// dominates the other to determine if Src and Dst are executed in this
// order. At the moment, we don't perform such check.
if (Src->getParent() != Dst->getParent())
IsForward = false;

// If the direction vector is negative, normalize it to
// make it non-negative.
if (D->normalize(SE))
bool Normalized = D->normalize(SE);
if (Normalized) {
LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
IsForward = false;
Copy link
Member

@Meinersbur Meinersbur Jul 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some deduction steps needed for this; please add an explanation in a comment:

  1. If Src and Dst are not in the same BB, line 207 will consider it "not forward"
  2. If Src and Dst are in the same BB, DI->depends will be called with Src being the first, Dst the second in the BB due to how we iterate over the instructions, i.e. assuming a forward dependency
    2a. dependence vector is positive: assumption was true
    2b. dependence vector is negative: If it actually is not a forward dependency, the dependence vector will be negative, and D->normalize reverses the dependency (to make the dependence vector positive). That is, it becomes a backward dependence and D->normalize returns true.
    2c. If the dependency vector is 0, i.e. the dependency is not loop carried, D->normalize will not reverse the dependency. Because we called DI->depends with execution order of Src/Dst, we have a forward dependency
  3. If Src==Dst (e.g. a single StoreInst depending on itself from a previous iteration, a WAW-dependency), the concept of forward/backward dependency is ill-defined. I think we should optimistically assume a forward dependency
    3a. dependence vector is positive: DI->depends(Src, Dst) probably can only return a positive dependence vector(?) that does not need to be normalized
    3b. dependence vector is negative: probably cannot happen as discussed (add assertion?)
    3c. dependence vector is zero: By atomicity of an instruction, cannot happen

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I'll add comments.

3. If Src==Dst (e.g two StoreInst of a WAW-dependency), the concept of forward/backward dependency is ill-defined. I think we should optimistically assume a forward dependency
3a. dependence vector is positive: DI->depends(Src, Dst) probably can only return a positive dependence vector(?) that does not need to be normalized
3b. dependence vector is negative: probably cannot happen as discussed (add assertion?)
3c. dependence vector is zero: By atomicity of an instruction, cannot happen

I believe 3b cannot happen.

This is a bit of a tangent, but seeing this reminded me of something. Recently, I’ve been thinking that maybe [* >] should actually be normalized to [* <] (if doing so, I think 3b can happen). If you don’t mind, I'd like to hear what you think about it.

Copy link
Member

@Meinersbur Meinersbur Jul 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming > here means the dependence-vector part of it (since your current encoding puts * for backward dependencies):

An analysis returning [* >] is unlikely, but could be possible because pessimizing [< >] to [* >] should be conservatively correct. It is not reversed though, because FullDependence::isDirectionNegative stops at *-like dependencies [1].

Footnotes

  1. exactly for that reason

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming > here means the dependence-vector part of it (since your current encoding puts * for backward dependencies):

Correct, I was talking about the original dependence vector returned by DI->depends.

An analysis returning [* >] is unlikely, but could be possible because pessimizing [< >] to [* >] should be conservatively correct. It is not reversed though, because FullDependence::isDirectionNegative stops at *-like dependencies.

What I meant is that it might be more convenient for LoopInterchange if [* >] is normalized to [* <]. In other words, it may be useful if FullDependence::isDirectionNegative doesn’t stop at *-like dependencies.

I don’t think it’s very rare to have a dependence vector with * as its head element. For example, consider a case where the outermost loop has scalar dependencies (I don't know if ), like in the following example (I found such cases while investigating TSVC):

for (int n_times = 0; n_times < NTIMES; ++n_times)
  for (int i = 0; i < N; ++i)
    for (int j = 1; j < M; ++j)
      aa[j][i] = aa[j - 1][i] + 1; // This statement itself doesn't depend on `n_times`

The direction vector in the above example is [* = >]. Interchanging the i-loop and j-loop is legal (I believe), but it is currently rejected because [= >] is lexicographically negative. Alternatively, if the outermost one is not counted as a loop, the direction vector would be normalized to [= <] and the interchange would be legal.

So, I'm thinking that it may be better if direction vectors like [* = >] were normalized to [* = <]. This could probably be done by changing FullDependence::isDirectionNegative so that it doesn't stop at *. And what I'd like to ask is: Are there any concerns that come to mind? One thing that comes to my mind is that it can change the type of the dependence (flow/anti/output), but it is not very important for LoopInterchange, which is currently the only client of Dependence::normalize.

Copy link
Member

@Meinersbur Meinersbur Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

* may just mean "not analyzable", and as mentioned, it might effectively be <, but DA was not able to detect it as such. Reversing the dependence vector would be wrong.

* might also mean "sometimes <, sometimes >" depending on control flow. In that case there is no single correct normalization of the dependence vector; both (as-is and reversed) would be time-negative in some cases.

for (int i = 0; i < 100; ++i) {
  A[99 - i] = ..;
  use(A[i]); // flow dependency with i >= 50, anti-dependency with i < 50
}

Since, because of this, one cannot assume that the dependency vector is positive even after normalization, it could be considered a heuristic, and it might be reasonable to assume that the * is a non-detected = direction due to symmetry of < and >. Could we do that in a different patch? It feels risky.

A better modeling might actually be that a FullDependence represents two dependencies: From Src to Dst and the anti-dependency from Dst to Src. Either one may actually be empty, because with [= <] there is no dependency from Dst to Src, and with [= >] there is no dependency from Src to Dst. But with [*], neither direction would be ruled out and one has to pessimistically assume both.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your opinion! This is something I've been thinking about lately, and I don't intend to include this change in the current patch. As the discussion touched on a similar topic, I took the opportunity to raise a related question, just in case you happen to know any background or historical context behind it. Since there appear to be multiple approaches and potentially some edge cases, I'll give it some more thought. Your input was really helpful, thanks again!

* might also mean "sometimes <, sometimes >" depending on control flow. In that case there is no single correct normalization of the dependence vector

Considering this, it makes me wonder if the existence of the function normalize might be a bit misleading...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering this, it makes me wonder if the existence of the function normalize might be a bit misleading...

Definitely. When introduced I thought that callers should be able to handle the direction as-is since the caller has chosen Src and Dst. normalize retroactively swaps the arguments. But it also makes some sense since you do not want to call DA::depends again with Src/Dst swapped, paying the computational cost again.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But it also makes some sense since you do not want to call DA::depends again with Src/Dst swapped, paying the computational cost again.

If I don't miss anything, simply copying the object looks to resolve that issue. Since there's a unique_ptr member in FullDependence, I don't think we can copy this as-is, but it probably doesn't need to be a unique_ptr.

Coming back to the original topic, added comments, and moved the process toward the bottom of the while-loop.

3b. dependence vector is negative: probably cannot happen as discussed (add assertion?)

I considered adding the assertion earlier, but realized it should be done right after calling normalize, so I didn’t add it at that point.

}
LLVM_DEBUG(StringRef DepType =
D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
dbgs() << "Found " << DepType
Expand Down Expand Up @@ -221,13 +250,28 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Dep.push_back('I');
}

auto [Ite, Inserted] = Seen.try_emplace(
StringRef(Dep.data(), Dep.size()), DepMatrix.size());

// Make sure we only add unique entries to the dependency matrix.
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
if (Inserted) {
DepMatrix.push_back(Dep);
IsForwardFlags.push_back(true);
}
if (!IsForward)
IsForwardFlags.reset(Ite->second);
}
}
}

assert(DepMatrix.size() == IsForwardFlags.size() &&
"Dependency matrix and IsForwardVec should have the same size.");

// If all dependencies corresponding to a direction vector are forward, encode
// it to '<', otherwise to '*'.
for (unsigned I = 0; I != DepMatrix.size(); I++)
DepMatrix[I].push_back(IsForwardFlags[I] ? '<' : '*');

return true;
}

Expand Down Expand Up @@ -276,11 +320,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
continue;

// Check if the direction vector is lexicographically positive (or zero)
// for both before/after exchanged.
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
// for both before/after exchanged. Ignore the last element because it
// doesn't affect the legality.
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
}
return true;
Expand Down Expand Up @@ -1222,21 +1267,33 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
for (unsigned I = 0; I != DepMatrix.size(); I++) {
char Dir = DepMatrix[I][LoopId];
if (Dir != 'I' && Dir != '=')
return false;
char DepType = DepMatrix[I].back();
assert((DepType == '<' || DepType == '*') &&
"Unexpected element in dependency vector");

// There are no loop-carried dependencies.
if (Dir == '=' || Dir == 'I')
continue;

// If both Dir and DepType are '<', it means that the all dependencies are
// lexically forward. Such dependencies don't prevent vectorization.
if (Dir == '<' && DepType == '<')
continue;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A similar fact holds when Dir is > and all dependencies are lexically backward? (even if this is true, I don't intend to address it in this PR).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no; that's actually impossible. D->normalize(SE) would have reversed it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intend to reverse the last element at the same time. Even so, is it still impossible?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When Dir is >, it is reversed by ->normalize() independently of the last element that DependenceAnalysis does not even know about.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was considering a case where the original dependence vector is something like [> <] (which will be normalized to [< >]). In this case, representing a backward dependency like [< > >] instead of [< > *] looked reasonable to me in some situations, but I couldn't come up with any particularly useful examples...


// We cannot prove that the loop is vectorizable.
return false;
}
return true;
}

std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
// If the outer loop is not loop independent it is not profitable to move
// this to inner position, since doing so would not enable inner loop
// parallelism.
// If the outer loop cannot be vectorized, it is not profitable to move this
// to inner position.
if (!canVectorize(DepMatrix, OuterLoopId))
return false;

// If inner loop has dependence and outer loop is loop independent then it is
// If inner loop cannot be vectorized and outer loop can be then it is
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// If inner loop cannot be vectorized and outer loop can be then it is
// If the inner loop cannot be vectorized but the outer loop can be then it is

[grammar]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

By the way, this would be nitpicky as well, but do you think the original comment is accurate? What I'm trying to say is that even if canVectorize were a perfectly accurate function (neither false-positive nor false-negative), I'm starting to think that interchanging the loops here is not necessarily profitable for enabling inner loop parallelism. For example, in the following code:

for (int i = 1; i < N; i++)
  for (int j = 0; j < N; j++)
    for (int k = 0; k < N; k++) {
      // Assume f and g don't have side effects
      use(A[i][j][f(k)]);
      A[i + 1][j][g(k)] = ...;
    }

For the k-loop, canVectorize would return false if f and g are sufficiently complex. However, in principle, parallelizing the k-loop still seems legal in the original one. Therefore, a more accurate comment might be something like "... can be profitable to interchange the loops to enable inner loop parallelism"? (Apparently, I wrote the original comment too, so either past me or present me is wrong...)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is "sufficiently complex"? If DA returns "confused" then canVectorize has to return false. If it returns [< = *] the dependency is carried by the outermost loop, it does not matter what the inner loop does.
I actually don't know/understand why canVectorize does not look at the parent loop dependencies. Possibly because what the outer loops are changes with interchange. At least the loops that are surrounded by both, outer+inner could be considered.

The case you mention is interesting because it is a counterexample to the assumption that if canVectorize is pessimistic (never says a loop can be vectorized even though LoopVectorize will not for some reason), it will not cause loop exchanges that would not happen if it was not pessimistic. Anyway, in this case the j-loop looks more likely to be vectorized profitably because f(k)/g(k) indices would require more complex memory accesses. LoopVectorize can better handle i as a "strided access pattern".

I think the comment itself is correct: If the outer one could be vectorized (if moved to the inner position) but the current inner one cannot, swap the outer one to the vectorizable position. For "vectorizable" it just assumes the definition of canVectorize. Generally, even if a loop is vectorizable in terms of dependencies, LoopVectorize may still consider it unprofitable to vectorize because of the instructions it contains, or the code may actually run slower after vectorization, so "profitable" was never meant in absolute terms and is hopefully understood as such by the reader. "can" does not add new information here unless we would mention such concrete situations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is "sufficiently complex"? If DA returns "confused" then canVectorize has to return false. If it returns [< = *] the dependency is carried by the outermost loop, it does not matter what the inner loop does.

I tried to say the latter one. Just as you mentioned, I was assuming a case where DA returns [< = *].

I hadn't really been conscious of it, but as you pointed out, this is a case where pessimistic heuristics lead to an interchange that wouldn't have happened if they hadn't been pessimistic (and in this specific case, moving the j-loop would be profitable for vectorization because the memory access pattern is simpler) I personally think that the interchange should not happen in this case, since we currently don't take the vectorization cost into account. Checking dependencies of the surrounding loops seems basically like a good idea, but I'm not confident whether that might lead to other unintended transformations. Using the same cost model as LoopVectorize seems like an ideal solution, but it feels challenging.

For "vectorizable" it just assumes the definition of canVectorize.

As for the comment here, this explanation made the most sense to me. Thanks for clarifying!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally think that the interchange should not happen in this case, since we currently don't take the vectorization cost into account.

I agree, but there are limits on what we can do. In the end it is just a heuristic.

Checking dependencies of the surrounding loops seems basically like a good idea, but I'm not confident whether that might lead to other unintended transformations. Using the same cost model as LoopVectorize seems like an ideal solution, but it feels challenging.

This is a common problem that LoopDistribute also has: It is intended to enable vectorization on one or more distributed loops, but does not know whether they actually are vectorized. In other words, it has no cost model. Because of this, it does not do anything unless explicitly told to do so.

Using the profitability heuristic from LoopVectorize itself, even if it was easy, might also not be what we want: Its computational cost is immense (building an entire new IR representation called VPlan), something we would not do speculatively on all loops without actually vectorizing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless you have a universal cost model that takes everything into account and predicts the execution time, each pass needs its own heuristic for what it is optimizing for. E.g. the vectorizer optimizes cycles but does not consider cache effects.

When you put it that way, it hardly seems feasible (well, if it were feasible, it would probably have been done already).

No typo; the patch tries to teach DependenceAnalysis to determine dependencies after loop fusion has taken place without applying loop fusion. Now also do that for interchange, distribution, vectorization, ....

After reading this comment, I noticed that the patch introduces additional analysis for loop fusion even though the client doesn't require it. I initially expected an argument to be added (such as depends(Src, Dst, /*ForFusion=*/true)), but that doesn't seem to be the case. Though, controlling the analysis behavior via flags could complicate caching and reusing results across different passes.

By the way, I've recently been reading DependenceAnalysis.cpp, and noticed that it's already quite complex and potentially buggy. I'm fairly certain it should be refactored before adding any new features.

UnrollAndJam is disabled by default. Its heuristic also does not take vectorization into account, but tries to maximize L1i cache usage.

Optimal outcome would be if the vectorizer supported outer-loop vectorization.

I don't know much about the details of the UnrollAndJam pass, but it appears to work (unintentionally?) as if outer-loop vectorization is applied in some cases, especially when combined with the SLPVectorizer (of course, I needed to specify the unroll count explicitly by pragma). So, I just thought that it might make more sense to enhance UnrollAndJam instead of interchange, for cases where outer-loop is vectorizable but inner-loop is not. And, as you said, it would be the best solution to support outer-loop vectorization in the vectorizer.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After reading this comment, I noticed that the patch introduces additional analysis for loop fusion even though the client doesn't require it. I initially expected an argument to be added (such as depends(Src, Dst, /*ForFusion=*/true)), but that doesn't seem to be the case. Though, controlling the analysis behavior via flags could complicate caching and reusing results across different passes.

Whether it is for fusion is not yet decided when calling depends, but FullDependence stores the analysis for both.

By the way, I've recently been reading DependenceAnalysis.cpp, and noticed that it's already quite complex and potentially buggy. I'm fairly certain it should be refactored before adding any new features.

The principle is straightforward; when processing one of the two fused loops, process them as the same. Since an expression can only be in one of the loops, no ambiguity arises. Only when processing the relationship between two statements you need to decide whether you want to treat them as the same or sequential loops.

I am not sure refactoring helps. A big part of why it is difficult to understand is the math. The Pair also makes it look complex, but it is just matching the access subscript dimensions after delinearization. But I am also not very happy about adding special cases to an already complex analysis. If you do loop fusion, you may want to support more cases than loops that have exactly the same trip count.

Copy link
Contributor Author

@kasuga-fj kasuga-fj Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whether it is for fusion is not yet decided when calling depends, but FullDependence stores the analysis for both.

IIUC, FullDependence objects are not cached anywhere. DependenceInfo is nearly stateless. Furthermore, DependenceInfo::depends returns a unique_ptr, hence we cannot cache the result as it is. That is, I think we know whether the caller is fusion or not when calling DependenceInfo::depends.

I am not sure refactoring helps. A big part of why it is difficult to understand is the math. The Pair also makes it look complex, but it is just matching the access subscript dimensions after delinearization. But I am also not very happy about adding special cases to an already complex analysis. If you do loop fusion, you may want to support more cases than loops that have exactly the same trip count.

I agree that we can't do much about the mathematical complexity, but I believe the code could be made simpler. It looks to me like there's a fair amount of code duplication, especially when the same processes are executed for SrcXXX and DstXXX (e.g., here). I'm not sure whether this duplication makes the code harder to understand, but I do think it hurts maintainability. I don't believe "Don't Repeat Yourself" is always the right principle, but in this case, I think there are parts of the logic where it does apply.

However, I think the most significant problem is that we don't take wrapping into account. The approach in #116632 seems incorrect to me. We probably need to be more pessimistic with respect to wrapping. I think it makes sense to insert checks for wrap flags where necessary, which would complicate the code. I'm not sure if #146383 applies in that case, but generally speaking, adding a new feature could increase the number of factors we need to consider.

In fact, there's a case where DependenceAnalysis misses a dependency, probably due to ignoring wraps, as shown below (godbolt: https://godbolt.org/z/hsxWve8s6).

; for (i = 0; i < 4; i++)
;   a[i & 1][i & 1] = 0;
define void @f(ptr %a) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %and = and i64 %i, 1
  %idx = getelementptr [4 x [4 x i8]], ptr %a, i64 0, i64 %and, i64 %and
  store i8 0, ptr %idx
  %i.next = add i64 %i, 1
  %exitcond.not = icmp slt i64 %i.next, 8
  br i1 %exitcond.not, label %loop, label %exit

exit:
  ret void
}
Printing analysis 'Dependence Analysis' for function 'f':
Src:  store i8 0, ptr %idx, align 1 --> Dst:  store i8 0, ptr %idx, align 1
  da analyze - none!

Copy link
Member

@Meinersbur Meinersbur Jul 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you create an issue # for that case? (Or I can do so.) It doesn't look nsw/nuw related though; the subscripts are well within i64 range.

I remember having had issues with #116632 but apparently I have been convinced otherwise.

Would be looking forward to cleanup PRs on DA.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, sorry, the issue already exists: #148435 (comment)

I think this is a kind of wrapping problem. IIRC, the %and is represented as {false,+,true}<%loop>, which would wrap. But DA casts it to i64 and ultimately overlooks the wrapping.

(While I'm at it, I'll share the other issues I found: #149977, #149501, #149991).

Would be looking forward to cleanup PRs on DA.

👍

// profitable to interchange to enable inner loop parallelism.
if (!canVectorize(DepMatrix, InnerLoopId))
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
@E = dso_local global [256 x [256 x [256 x float]]] zeroinitializer

; Check that the below loops are exchanged for vectorization.
;
Expand Down Expand Up @@ -64,15 +66,13 @@ exit:
; for (int j = 1; j < 256; j++)
; A[i][j-1] = A[i][j] + B[i][j];
;
; FIXME: These loops are exchanged at this time due to the problem in
; profitability heuristic calculation for vectorization.

; CHECK: --- !Passed
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
define void @interchange_unnecesasry_for_vectorization() {
entry:
br label %for.i.header
Expand Down Expand Up @@ -103,3 +103,135 @@ for.i.inc:
exit:
ret void
}

; Check that the below loops are exchanged to allow innermost loop
; vectorization. We cannot vectorize the j-loop because it has a lexically
; backward dependency, but the i-loop can be vectorized because all the
; loop-carried dependencies are lexically forward.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 1; j < 256; j++) {
; A[i][j] = A[i][j-1] + B[i][j];
; C[i][j] += C[i+1][j];
; }
; }
;

; CHECK: --- !Passed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Function: interchange_necessary_for_vectorization2
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
; Two-level loop nest (outer induction variable %i, inner %j) whose whole body
; lives in the single basic block %for.j.body, so every pair of memory
; accesses below shares one basic block.
define void @interchange_necessary_for_vectorization2() {
entry:
br label %for.i.header

; Outer-loop header. %i.inc (= %i + 1) is used as the row index of the C load.
; NOTE(review): %i starts at 1 here, while the C-like pseudo-code preceding
; this function starts i at 0 -- confirm which one is intended.
for.i.header:
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
%i.inc = add nsw i64 %i, 1
br label %for.j.body

; Inner-loop body; %j starts at 1 and %j.dec is %j - 1.
for.j.body:
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
%j.dec = add nsw i64 %j, -1
; Addresses: A[i][j-1], B[i][j], C[i+1][j] and C[i][j].
%a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
%b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
%c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
%c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
%a = load float, ptr %a.load.index, align 4
%b = load float, ptr %b.index, align 4
%c0 = load float, ptr %c.load.index, align 4
%c1 = load float, ptr %c.store.index, align 4
; A[i][j] = A[i][j-1] + B[i][j]
%add.0 = fadd float %a, %b
%a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
store float %add.0, ptr %a.store.index, align 4
; C[i][j] = C[i+1][j] + C[i][j]
%add.1 = fadd float %c0, %c1
store float %add.1, ptr %c.store.index, align 4
; The inner loop exits once %j.next reaches 256.
%j.next = add nuw nsw i64 %j, 1
%cmp.j = icmp eq i64 %j.next, 256
br i1 %cmp.j, label %for.i.inc, label %for.j.body

; Outer-loop latch; the outer loop exits once %i.next reaches 255.
for.i.inc:
%i.next = add nuw nsw i64 %i, 1
%cmp.i = icmp eq i64 %i.next, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}

; Check that no interchange is performed for the following loop. The j-loop is
; vectorizable because all the dependencies are lexically forward. However, at
; the moment, we don't analyze an execution order between instructions in
; different BBs, so fail to determine that the j-loop is vectorizable.
; Therefore, no exchange is performed.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 0; j < 255; j++) {
; for (int k = 0; k < 128; k++) {
; E[i][j][k] = D[i+1][j+1][2*k];
; if (cond)
; D[i][j][k+1] += 1.0;
; }
; }

; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: multiple_BBs_in_loop
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: multiple_BBs_in_loop
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; Three-level loop nest (%i, %j, %k) whose innermost body is split across two
; basic blocks: %for.k.body always runs, %if.then runs only when %cond is true.
; The accesses to @D therefore sit in different basic blocks.
define void @multiple_BBs_in_loop() {
entry:
br label %for.i.header

; Outermost loop header; %i.inc doubles as the i+1 subscript and the next
; value of %i.
for.i.header:
%i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
%i.inc = add nsw i64 %i, 1
br label %for.j.header

; Middle loop header; %j.inc doubles as the j+1 subscript and the next value
; of %j.
for.j.header:
%j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
%j.inc = add nsw i64 %j, 1
br label %for.k.body

; Unconditional part of the innermost body: E[i][j][k] = D[i+1][j+1][2*k].
for.k.body:
%k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
%k.inc = add nsw i64 %k, 1
%k.2 = mul nsw i64 %k, 2
%d.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i.inc, i64 %j.inc, i64 %k.2
%e.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @E, i64 %i, i64 %j, i64 %k
%d.load = load float, ptr %d.index, align 4
store float %d.load, ptr %e.index, align 4
; The branch condition is a frozen undef, i.e. an arbitrary i1 value.
%cond = freeze i1 undef
br i1 %cond, label %if.then, label %for.k.inc

; Conditional part of the innermost body: D[i][j][k+1] += 1.0.
if.then:
%d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i, i64 %j, i64 %k.inc
%d.load2 = load float, ptr %d.index2, align 4
%add = fadd float %d.load2, 1.0
store float %add, ptr %d.index2, align 4
br label %for.k.inc

; Innermost latch; the k-loop exits once %k.inc reaches 128.
for.k.inc:
%cmp.k = icmp eq i64 %k.inc, 128
br i1 %cmp.k, label %for.j.inc, label %for.k.body

; Middle latch; the j-loop exits once %j.inc reaches 255.
for.j.inc:
%cmp.j = icmp eq i64 %j.inc, 255
br i1 %cmp.j, label %for.i.inc, label %for.j.header

; Outermost latch; the i-loop exits once %i.inc reaches 255.
for.i.inc:
%cmp.i = icmp eq i64 %i.inc, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}
Loading