Skip to content

Commit b75530f

Browse files
authored
[LoopInterchange] Consider forward/backward dependency in vectorize heuristic (#133672)
The vectorization heuristic of LoopInterchange attempts to move a vectorizable loop to the innermost position. Before this patch, a loop was deemed vectorizable if there are no loop-carried dependencies induced by the loop. This patch extends the vectorization heuristic by introducing the concept of forward and backward dependencies, inspired by LoopAccessAnalysis. Specifically, an additional element is appended to each direction vector to indicate whether it represents a forward dependency (`<`) or not (`*`). Among these, only the forward dependencies (i.e., those whose last element is `<`) affect the vectorization heuristic. Accordingly, the check is conservative, and dependencies are considered forward only when this can be proven. Currently, we only support perfectly nested loops whose body consists of a single basic block. For other cases, dependencies are pessimistically treated as non-forward.
1 parent 81bbe98 commit b75530f

File tree

2 files changed

+236
-19
lines changed

2 files changed

+236
-19
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 100 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
#include "llvm/ADT/SmallSet.h"
1818
#include "llvm/ADT/SmallVector.h"
1919
#include "llvm/ADT/Statistic.h"
20+
#include "llvm/ADT/StringMap.h"
2021
#include "llvm/ADT/StringRef.h"
21-
#include "llvm/ADT/StringSet.h"
2222
#include "llvm/Analysis/DependenceAnalysis.h"
2323
#include "llvm/Analysis/LoopCacheAnalysis.h"
2424
#include "llvm/Analysis/LoopInfo.h"
@@ -70,6 +70,13 @@ namespace {
7070

7171
using LoopVector = SmallVector<Loop *, 8>;
7272

73+
/// A list of direction vectors. Each entry represents a direction vector
74+
/// corresponding to one or more dependencies existing in the loop nest. The
75+
/// length of all direction vectors is equal and is N + 1, where N is the depth
76+
/// of the loop nest. The first N elements correspond to the dependency
77+
/// direction of each N loops. The last one indicates whether this entry is
78+
/// a forward dependency ('<') or not ('*'). The term "forward" aligns with what
79+
/// is defined in LoopAccessAnalysis.
7380
// TODO: Check if we can use a sparse matrix here.
7481
using CharMatrix = std::vector<std::vector<char>>;
7582

@@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
126133

127134
static void printDepMatrix(CharMatrix &DepMatrix) {
128135
for (auto &Row : DepMatrix) {
129-
for (auto D : Row)
136+
// Drop the last element because it is a flag indicating whether this is
137+
// a forward dependency or not, which doesn't affect the legality check.
138+
for (char D : drop_end(Row))
130139
LLVM_DEBUG(dbgs() << D << " ");
131140
LLVM_DEBUG(dbgs() << "\n");
132141
}
133142
}
143+
144+
/// Return true if \p Src appears before \p Dst in the same basic block.
145+
/// Precondition: \p Src and \p Dst are distinct instructions within the same
146+
/// basic block.
147+
static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
148+
assert(Src->getParent() == Dst->getParent() && Src != Dst &&
149+
"Expected Src and Dst to be different instructions in the same BB");
150+
151+
bool FoundSrc = false;
152+
for (const Instruction &I : *(Src->getParent())) {
153+
if (&I == Src) {
154+
FoundSrc = true;
155+
continue;
156+
}
157+
if (&I == Dst)
158+
return FoundSrc;
159+
}
160+
161+
llvm_unreachable("Dst not found");
162+
}
134163
#endif
135164

136165
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
@@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
174203
return false;
175204
}
176205
ValueVector::iterator I, IE, J, JE;
177-
StringSet<> Seen;
206+
207+
// Manage direction vectors that are already seen. Map each direction vector
208+
// to an index of DepMatrix at which it is stored.
209+
StringMap<unsigned> Seen;
178210

179211
for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
180212
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
228260
Dep.push_back('I');
229261
}
230262

263+
// Test whether the dependency is forward or not.
264+
bool IsKnownForward = true;
265+
if (Src->getParent() != Dst->getParent()) {
266+
// In general, when Src and Dst are in different BBs, the execution
267+
// order of them within a single iteration is not guaranteed. Treat
268+
// conservatively as not-forward dependency in this case.
269+
IsKnownForward = false;
270+
} else {
271+
// Src and Dst are in the same BB. If they are different
272+
// instructions, Src should appear before Dst in the BB as they are
273+
// stored to MemInstr in that order.
274+
assert((Src == Dst || inThisOrder(Src, Dst)) &&
275+
"Unexpected instructions");
276+
277+
// If the Dependence object is reversed (due to normalization), it
278+
// represents the dependency from Dst to Src, meaning it is a backward
279+
// dependency. Otherwise it should be a forward dependency.
280+
bool IsReversed = D->getSrc() != Src;
281+
if (IsReversed)
282+
IsKnownForward = false;
283+
}
284+
285+
// Initialize the last element. Assume forward dependencies only; it
286+
// will be updated later if there is any non-forward dependency.
287+
Dep.push_back('<');
288+
289+
// The last element should express the "summary" among one or more
290+
// direction vectors whose first N elements are the same (where N is
291+
// the depth of the loop nest). Hence we exclude the last element from
292+
// the Seen map.
293+
auto [Ite, Inserted] = Seen.try_emplace(
294+
StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
295+
231296
// Make sure we only add unique entries to the dependency matrix.
232-
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
297+
if (Inserted)
233298
DepMatrix.push_back(Dep);
299+
300+
// If we cannot prove that this dependency is forward, change the last
301+
// element of the corresponding entry. Since a `[... *]` dependency
302+
// includes a `[... <]` dependency, we do not need to keep both and
303+
// change the existing entry instead.
304+
if (!IsKnownForward)
305+
DepMatrix[Ite->second].back() = '*';
234306
}
235307
}
236308
}
@@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
281353
continue;
282354

283355
// Check if the direction vector is lexicographically positive (or zero)
284-
// for both before/after exchanged.
285-
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
356+
// for both before/after exchanged. Ignore the last element because it
357+
// doesn't affect the legality.
358+
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
286359
return false;
287360
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
288-
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
361+
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
289362
return false;
290363
}
291364
return true;
@@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
13341407
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
13351408
for (const auto &Dep : DepMatrix) {
13361409
char Dir = Dep[LoopId];
1337-
if (Dir != 'I' && Dir != '=')
1338-
return false;
1410+
char DepType = Dep.back();
1411+
assert((DepType == '<' || DepType == '*') &&
1412+
"Unexpected element in dependency vector");
1413+
1414+
// There are no loop-carried dependencies.
1415+
if (Dir == '=' || Dir == 'I')
1416+
continue;
1417+
1418+
// DepType being '<' means that this direction vector represents a forward
1419+
// dependency. In principle, a loop with '<' direction can be vectorized in
1420+
// this case.
1421+
if (Dir == '<' && DepType == '<')
1422+
continue;
1423+
1424+
// We cannot prove that the loop is vectorizable.
1425+
return false;
13391426
}
13401427
return true;
13411428
}
13421429

13431430
std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
13441431
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1345-
// If the outer loop is not loop independent it is not profitable to move
1346-
// this to inner position, since doing so would not enable inner loop
1347-
// parallelism.
1432+
// If the outer loop cannot be vectorized, it is not profitable to move this
1433+
// to inner position.
13481434
if (!canVectorize(DepMatrix, OuterLoopId))
13491435
return false;
13501436

1351-
// If inner loop has dependence and outer loop is loop independent then it is
1352-
// profitable to interchange to enable inner loop parallelism.
1437+
// If the inner loop cannot be vectorized but the outer loop can be, then it
1438+
// is profitable to interchange to enable inner loop parallelism.
13531439
if (!canVectorize(DepMatrix, InnerLoopId))
13541440
return true;
13551441

llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll

Lines changed: 136 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
@A = dso_local global [256 x [256 x float]] zeroinitializer
66
@B = dso_local global [256 x [256 x float]] zeroinitializer
77
@C = dso_local global [256 x [256 x float]] zeroinitializer
8+
@D = global [256 x [256 x [256 x float]]] zeroinitializer
9+
@E = global [256 x [256 x [256 x float]]] zeroinitializer
810

911
; Check that the below loops are exchanged for vectorization.
1012
;
@@ -64,15 +66,13 @@ exit:
6466
; for (int j = 1; j < 256; j++)
6567
; A[i][j-1] = A[i][j] + B[i][j];
6668
;
67-
; FIXME: These loops are exchanged at this time due to the problem in
68-
; profitability heuristic calculation for vectorization.
6969

70-
; CHECK: --- !Passed
70+
; CHECK: --- !Missed
7171
; CHECK-NEXT: Pass: loop-interchange
72-
; CHECK-NEXT: Name: Interchanged
72+
; CHECK-NEXT: Name: InterchangeNotProfitable
7373
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
7474
; CHECK-NEXT: Args:
75-
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
75+
; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
7676
define void @interchange_unnecesasry_for_vectorization() {
7777
entry:
7878
br label %for.i.header
@@ -103,3 +103,134 @@ for.i.inc:
103103
exit:
104104
ret void
105105
}
106+
107+
; Check that the below loops are exchanged to allow innermost loop
108+
; vectorization. We cannot vectorize the j-loop because it has a lexically
109+
; backward dependency, but the i-loop can be vectorized because all the
110+
; loop-carried dependencies are lexically forward. LoopVectorize currently only
111+
; vectorizes the innermost loop, hence move the i-loop to that position.
112+
;
113+
; for (int i = 0; i < 255; i++) {
114+
; for (int j = 1; j < 256; j++) {
115+
; A[i][j] = A[i][j-1] + B[i][j];
116+
; C[i][j] += C[i+1][j];
117+
; }
118+
; }
119+
;
120+
121+
; CHECK: --- !Passed
122+
; CHECK-NEXT: Pass: loop-interchange
123+
; CHECK-NEXT: Name: Interchanged
124+
; CHECK-NEXT: Function: interchange_necessary_for_vectorization2
125+
; CHECK-NEXT: Args:
126+
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
127+
define void @interchange_necessary_for_vectorization2() {
128+
entry:
129+
br label %for.i.header
130+
131+
for.i.header:
132+
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
133+
%i.inc = add i64 %i, 1
134+
br label %for.j.body
135+
136+
for.j.body:
137+
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
138+
%j.dec = add i64 %j, -1
139+
%a.load.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j.dec
140+
%b.index = getelementptr [256 x [256 x float]], ptr @B, i64 0, i64 %i, i64 %j
141+
%c.load.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i.inc, i64 %j
142+
%c.store.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
143+
%a = load float, ptr %a.load.index
144+
%b = load float, ptr %b.index
145+
%c0 = load float, ptr %c.load.index
146+
%c1 = load float, ptr %c.store.index
147+
%add.0 = fadd float %a, %b
148+
%a.store.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j
149+
store float %add.0, ptr %a.store.index
150+
%add.1 = fadd float %c0, %c1
151+
store float %add.1, ptr %c.store.index
152+
%j.next = add i64 %j, 1
153+
%cmp.j = icmp eq i64 %j.next, 256
154+
br i1 %cmp.j, label %for.i.inc, label %for.j.body
155+
156+
for.i.inc:
157+
%i.next = add i64 %i, 1
158+
%cmp.i = icmp eq i64 %i.next, 255
159+
br i1 %cmp.i, label %exit, label %for.i.header
160+
161+
exit:
162+
ret void
163+
}
164+
165+
; Check that no interchange is performed for the following loop. Interchanging
166+
; the j-loop and k-loop makes the innermost loop vectorizable, since the j-loop
167+
; has only forward dependencies. However, at the moment, a loop body consisting
168+
; of multiple BBs is handled pessimistically. Hence the j-loop isn't moved to
169+
; the innermost place.
170+
;
171+
; for (int i = 0; i < 255; i++) {
172+
; for (int j = 0; j < 255; j++) {
173+
; for (int k = 0; k < 128; k++) {
174+
; E[i][j][k] = D[i+1][j+1][2*k];
175+
; if (cond)
176+
; D[i][j][k+1] = 1.0;
177+
; }
178+
; }
179+
180+
; CHECK: --- !Missed
181+
; CHECK-NEXT: Pass: loop-interchange
182+
; CHECK-NEXT: Name: InterchangeNotProfitable
183+
; CHECK-NEXT: Function: multiple_BBs_in_loop
184+
; CHECK-NEXT: Args:
185+
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
186+
; CHECK: --- !Missed
187+
; CHECK-NEXT: Pass: loop-interchange
188+
; CHECK-NEXT: Name: InterchangeNotProfitable
189+
; CHECK-NEXT: Function: multiple_BBs_in_loop
190+
; CHECK-NEXT: Args:
191+
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
192+
define void @multiple_BBs_in_loop() {
193+
entry:
194+
br label %for.i.header
195+
196+
for.i.header:
197+
%i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
198+
%i.inc = add i64 %i, 1
199+
br label %for.j.header
200+
201+
for.j.header:
202+
%j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
203+
%j.inc = add i64 %j, 1
204+
br label %for.k.body
205+
206+
for.k.body:
207+
%k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
208+
%k.inc = add i64 %k, 1
209+
%k.2 = mul i64 %k, 2
210+
%d.index = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i.inc, i64 %j.inc, i64 %k.2
211+
%e.index = getelementptr [256 x [256 x [256 x float]]], ptr @E, i64 0, i64 %i, i64 %j, i64 %k
212+
%d.load = load float, ptr %d.index
213+
store float %d.load, ptr %e.index
214+
%cond = freeze i1 undef
215+
br i1 %cond, label %if.then, label %for.k.inc
216+
217+
if.then:
218+
%d.index2 = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i, i64 %j, i64 %k.inc
219+
store float 1.0, ptr %d.index2
220+
br label %for.k.inc
221+
222+
for.k.inc:
223+
%cmp.k = icmp eq i64 %k.inc, 128
224+
br i1 %cmp.k, label %for.j.inc, label %for.k.body
225+
226+
for.j.inc:
227+
%cmp.j = icmp eq i64 %j.inc, 255
228+
br i1 %cmp.j, label %for.i.inc, label %for.j.header
229+
230+
for.i.inc:
231+
%cmp.i = icmp eq i64 %i.inc, 255
232+
br i1 %cmp.i, label %exit, label %for.i.header
233+
234+
exit:
235+
ret void
236+
}

0 commit comments

Comments
 (0)