Skip to content

Commit 427c182

Browse files
authored
[unroll-and-jam] Document dependencies_multidims.ll and fix loop bounds (NFC) (#156578)
Add detailed comments explaining why each function should/shouldn't be unroll-and-jammed based on memory access patterns and dependencies. Fix loop bounds to ensure array accesses are within array bounds: * sub_sub_less: j starts from 1 (not 0) to ensure j-1 >= 0 * sub_sub_less_3d: k starts from 1 (not 0) to ensure k-1 >= 0 * sub_sub_outer_scalar: j starts from 1 (not 0) to ensure j-1 >= 0
1 parent 6be7cf0 commit 427c182

File tree

1 file changed

+101
-17
lines changed

1 file changed

+101
-17
lines changed

llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll

Lines changed: 101 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
; RUN: opt -da-disable-delinearization-checks -passes=loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
22
; RUN: opt -da-disable-delinearization-checks -aa-pipeline=basic-aa -passes='loop-unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
33

4-
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
5-
6-
; XFAIL: *
7-
; The transformation seems to have succeeded "accidentally". It should be fixed
8-
; by PR #156578.
9-
104
; CHECK-LABEL: sub_sub_less
115
; CHECK: %j = phi
126
; CHECK-NOT: %j.1 = phi
7+
;
8+
; sub_sub_less should NOT be unroll-and-jammed due to a loop-carried dependency.
9+
; Memory accesses:
10+
; - A[i][j] = 1 (write to current iteration)
11+
; - A[i+1][j-1] = add (write to next i iteration, previous j iteration)
12+
; The dependency: A[i+1][j-1] from iteration (i,j) may conflict with A[i'][j']
13+
; from a later iteration when i'=i+1 and j'=j-1, creating a backward dependency
14+
; in the j dimension that prevents safe unroll-and-jam.
1315
define void @sub_sub_less(ptr noalias nocapture %A, i32 %N, ptr noalias nocapture readonly %B) {
1416
entry:
1517
%cmp = icmp sgt i32 %N, 0
@@ -20,7 +22,7 @@ for.outer:
2022
br label %for.inner
2123

2224
for.inner:
23-
%j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
25+
%j = phi i32 [ %add6, %for.inner ], [ 1, %for.outer ]
2426
%sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
2527
%arrayidx5 = getelementptr inbounds i32, ptr %B, i32 %j
2628
%0 = load i32, ptr %arrayidx5, align 4
@@ -51,6 +53,31 @@ cleanup:
5153
; CHECK: %j.1 = phi
5254
; CHECK: %j.2 = phi
5355
; CHECK: %j.3 = phi
56+
;
57+
; sub_sub_eq SHOULD be unroll-and-jammed (count=4) as it's safe.
58+
; Memory accesses:
59+
; - A[i][j] = 1 (write to current iteration)
60+
; - A[i+1][j] = add (write to next i iteration, same j iteration)
61+
; No dependency conflict: When unroll-and-jamming with count=4, the i loop
62+
; iterations (i, i+1, i+2, i+3) are unrolled and their j loops are jammed
63+
; together. Unroll-and-jam factor 4:
64+
;
65+
; for (int i = 0; i < N; i += 4)
66+
; for (int j = 0; j < N; ++j) {
67+
; // i iteration
68+
; A[i][j] = 1; A[i+1][j] = sum_i;
69+
; // i+1 iteration
70+
; A[i+1][j] = 1; A[i+2][j] = sum_i1;
71+
; // i+2 iteration
72+
; A[i+2][j] = 1; A[i+3][j] = sum_i2;
73+
; // i+3 iteration
74+
; A[i+3][j] = 1; A[i+4][j] = sum_i3;
75+
; }
76+
;
77+
; A[i+1][j] from iteration i doesn't conflict with A[i'][j'] from unrolled
78+
; iterations since each unrolled i iteration accesses its own row i+1, i+2, i+3.
79+
; j' values are identical, but accesses happen to different rows in the same j
80+
; iteration before moving to the next j value.
5481
define void @sub_sub_eq(ptr noalias nocapture %A, i32 %N, ptr noalias nocapture readonly %B) {
5582
entry:
5683
%cmp = icmp sgt i32 %N, 0
@@ -92,6 +119,29 @@ cleanup:
92119
; CHECK: %j.1 = phi
93120
; CHECK: %j.2 = phi
94121
; CHECK: %j.3 = phi
122+
;
123+
; sub_sub_more SHOULD be unroll-and-jammed (count=4) as it's safe.
124+
; Memory accesses:
125+
; - A[i][j] = 1 (write to current iteration)
126+
; - A[i+1][j+1] = add (write to next i iteration, next j iteration)
127+
; No dependency conflict: The forward dependency pattern (next i, next j)
128+
; is safe. Unroll-and-jam factor 4:
129+
;
130+
; for (int i = 0; i < N; i += 4)
131+
; for (int j = 0; j < N; ++j) {
132+
; // i iteration
133+
; A[i][j] = 1; A[i+1][j+1] = sum_i;
134+
; // i+1 iteration
135+
; A[i+1][j] = 1; A[i+2][j+1] = sum_i1;
136+
; // i+2 iteration
137+
; A[i+2][j] = 1; A[i+3][j+1] = sum_i2;
138+
; // i+3 iteration
139+
; A[i+3][j] = 1; A[i+4][j+1] = sum_i3;
140+
; }
141+
;
142+
; A[i+1][j+1] from iteration i accesses row i+1 and column j+1, which is
143+
; disjoint from the accesses in the same iteration. The forward dependency
144+
; pattern doesn't create conflicts between unrolled i iterations.
95145
define void @sub_sub_more(ptr noalias nocapture %A, i32 %N, ptr noalias nocapture readonly %B) {
96146
entry:
97147
%cmp = icmp sgt i32 %N, 0
@@ -130,12 +180,21 @@ cleanup:
130180
; CHECK-LABEL: sub_sub_less_3d
131181
; CHECK: %k = phi
132182
; CHECK-NOT: %k.1 = phi
133-
183+
;
184+
; sub_sub_less_3d should NOT be unroll-and-jammed due to a loop-carried dependency.
185+
; Memory accesses:
186+
; - A3d[i][j][k] = 5 (write to current iteration)
187+
; - A3d[i+1][j][k-1] = 10 (write to next i iteration, previous k iteration)
188+
; The dependency: A[i+1][j][k-1] from iteration (i,j,k) may conflict with
189+
; A[i'][j'][k'] from a later iteration when i'=i+1 and k'=k-1, creating a
190+
; backward dependency in the k dimension that prevents safe unroll-and-jam.
191+
; This is a 3D version of the same pattern as sub_sub_less.
192+
;
134193
; for (long i = 0; i < 100; ++i)
135194
; for (long j = 0; j < 100; ++j)
136-
; for (long k = 0; k < 100; ++k) {
137-
; A[i][j][k] = 0;
138-
; A[i+1][j][k-1] = 0;
195+
; for (long k = 1; k < 100; ++k) {
196+
; A[i][j][k] = 5;
197+
; A[i+1][j][k-1] = 10;
139198
; }
140199

141200
define void @sub_sub_less_3d(ptr noalias %A) {
@@ -151,13 +210,13 @@ for.j:
151210
br label %for.k
152211

153212
for.k:
154-
%k = phi i32 [ 0, %for.j ], [ %inc.k, %for.k ]
213+
%k = phi i32 [ 1, %for.j ], [ %inc.k, %for.k ]
155214
%arrayidx = getelementptr inbounds [100 x [100 x i32]], ptr %A, i32 %i, i32 %j, i32 %k
156-
store i32 0, ptr %arrayidx, align 4
215+
store i32 5, ptr %arrayidx, align 4
157216
%add.i = add nsw i32 %i, 1
158217
%sub.k = add nsw i32 %k, -1
159218
%arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %A, i32 %add.i, i32 %j, i32 %sub.k
160-
store i32 0, ptr %arrayidx2, align 4
219+
store i32 10, ptr %arrayidx2, align 4
161220
%inc.k = add nsw i32 %k, 1
162221
%cmp.k = icmp slt i32 %inc.k, 100
163222
br i1 %cmp.k, label %for.k, label %for.j.latch
@@ -178,8 +237,33 @@ for.end:
178237

179238
; CHECK-LABEL: sub_sub_outer_scalar
180239
; CHECK: %k = phi
181-
; CHECK-NOT: %k.1 = phi
182-
240+
; CHECK: %k.1 = phi
241+
; CHECK: %k.2 = phi
242+
; CHECK: %k.3 = phi
243+
;
244+
; sub_sub_outer_scalar SHOULD be unroll-and-jammed (count=4) as it's safe.
245+
; Memory accesses:
246+
; - load from A[j][k] (read from current j iteration)
247+
; - store to A[j-1][k] (write to previous j iteration)
248+
; The dependency: reading A[j][k] and writing A[j-1][k] creates a backward
249+
; dependency, but execution order is preserved. Unroll-and-jam factor 4:
250+
;
251+
; for (int i = 0; i < 100; i++)
252+
; for (int j = 1; j < 100; j += 4)
253+
; for (int k = 0; k < 100; k++) {
254+
; // j iteration
255+
; temp0 = A[j][k]; A[j-1][k] = temp0;
256+
; // j+1 iteration
257+
; temp1 = A[j+1][k]; A[j][k] = temp1;
258+
; // j+2 iteration
259+
; temp2 = A[j+2][k]; A[j+1][k] = temp2;
260+
; // j+3 iteration
261+
; temp3 = A[j+3][k]; A[j+2][k] = temp3;
262+
; }
263+
;
264+
; All k iterations for each j iteration (including j+1, j+2, j+3) are completed
265+
; before moving to the next j group, so j+1's store to A[j][k] doesn't conflict
266+
; with j's load from A[j][k]: in each jammed k iteration that load still executes before the store.
183267
define void @sub_sub_outer_scalar(ptr %A) {
184268
entry:
185269
br label %for.i
@@ -189,7 +273,7 @@ for.i:
189273
br label %for.j
190274

191275
for.j:
192-
%j = phi i64 [ 0, %for.i ], [ %inc.j, %for.j.latch ]
276+
%j = phi i64 [ 1, %for.i ], [ %inc.j, %for.j.latch ]
193277
br label %for.k
194278

195279
for.k:

0 commit comments

Comments
 (0)