11; RUN: opt -da-disable-delinearization-checks -passes=loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
22; RUN: opt -da-disable-delinearization-checks -aa-pipeline=basic-aa -passes='loop-unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
33
4- target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
5-
6- ; XFAIL: *
7- ; The transformation seems to have succeeded "accidentally". It should be fixed
8- ; by PR #156578.
9-
104; CHECK-LABEL: sub_sub_less
115; CHECK: %j = phi
126; CHECK-NOT: %j.1 = phi
7+ ;
8+ ; sub_sub_less should NOT be unroll-and-jammed due to a loop-carried dependency.
9+ ; Memory accesses:
10+ ; - A[i][j] = 1 (write to current iteration)
11+ ; - A[i+1][j-1] = add (write to next i iteration, previous j iteration)
12+ ; The dependency: A[i+1][j-1] from iteration (i,j) may conflict with A[i'][j']
13+ ; from a later iteration when i'=i+1 and j'=j-1, creating a backward dependency
14+ ; in the j dimension that prevents safe unroll-and-jam.
1315define void @sub_sub_less (ptr noalias nocapture %A , i32 %N , ptr noalias nocapture readonly %B ) {
1416entry:
1517 %cmp = icmp sgt i32 %N , 0
@@ -20,7 +22,7 @@ for.outer:
2022 br label %for.inner
2123
2224for.inner:
23- %j = phi i32 [ %add6 , %for.inner ], [ 0 , %for.outer ]
25+ %j = phi i32 [ %add6 , %for.inner ], [ 1 , %for.outer ]
2426 %sum = phi i32 [ %add , %for.inner ], [ 0 , %for.outer ]
2527 %arrayidx5 = getelementptr inbounds i32 , ptr %B , i32 %j
2628 %0 = load i32 , ptr %arrayidx5 , align 4
@@ -51,6 +53,31 @@ cleanup:
5153; CHECK: %j.1 = phi
5254; CHECK: %j.2 = phi
5355; CHECK: %j.3 = phi
56+ ;
57+ ; sub_sub_eq SHOULD be unroll-and-jammed (count=4) as it's safe.
58+ ; Memory accesses:
59+ ; - A[i][j] = 1 (write to current iteration)
60+ ; - A[i+1][j] = add (write to next i iteration, same j iteration)
61+ ; No dependency conflict: When unroll-and-jamming with count=4, the i loop
62+ ; iterations (i, i+1, i+2, i+3) are unrolled and their j loops are jammed
63+ ; together. Unroll-and-jam factor 4:
64+ ;
65+ ; for (int i = 0; i < N; i += 4)
66+ ; for (int j = 0; j < N; ++j) {
67+ ; // i iteration
68+ ; A[i][j] = 1; A[i+1][j] = sum_i;
69+ ; // i+1 iteration
70+ ; A[i+1][j] = 1; A[i+2][j] = sum_i1;
71+ ; // i+2 iteration
72+ ; A[i+2][j] = 1; A[i+3][j] = sum_i2;
73+ ; // i+3 iteration
74+ ; A[i+3][j] = 1; A[i+4][j] = sum_i3;
75+ ; }
76+ ;
77+ ; The store to A[i+1][j] from iteration i and the store to A[i+1][j] from
78+ ; iteration i+1 hit the same element, but both use the same j. After jamming
79+ ; they land in the same j iteration with iteration i's store still first, so
80+ ; the original order of the two writes is preserved.
5481define void @sub_sub_eq (ptr noalias nocapture %A , i32 %N , ptr noalias nocapture readonly %B ) {
5582entry:
5683 %cmp = icmp sgt i32 %N , 0
@@ -92,6 +119,29 @@ cleanup:
92119; CHECK: %j.1 = phi
93120; CHECK: %j.2 = phi
94121; CHECK: %j.3 = phi
122+ ;
123+ ; sub_sub_more SHOULD be unroll-and-jammed (count=4) as it's safe.
124+ ; Memory accesses:
125+ ; - A[i][j] = 1 (write to current iteration)
126+ ; - A[i+1][j+1] = add (write to next i iteration, next j iteration)
127+ ; No dependency conflict: The forward dependency pattern (i+1 in the i
128+ ; dimension, j+1 in the j dimension) is safe. Unroll-and-jam factor 4:
129+ ;
130+ ; for (int i = 0; i < N; i += 4)
131+ ; for (int j = 0; j < N; ++j) {
132+ ; // i iteration
133+ ; A[i][j] = 1; A[i+1][j+1] = sum_i;
134+ ; // i+1 iteration
135+ ; A[i+1][j] = 1; A[i+2][j+1] = sum_i1;
136+ ; // i+2 iteration
137+ ; A[i+2][j] = 1; A[i+3][j+1] = sum_i2;
138+ ; // i+3 iteration
139+ ; A[i+3][j] = 1; A[i+4][j+1] = sum_i3;
140+ ; }
141+ ;
142+ ; A[i+1][j+1] from iteration i accesses row i+1 and column j+1, which is
143+ ; disjoint from the accesses in the same iteration. The forward dependency
144+ ; pattern doesn't create conflicts between unrolled i iterations.
95145define void @sub_sub_more (ptr noalias nocapture %A , i32 %N , ptr noalias nocapture readonly %B ) {
96146entry:
97147 %cmp = icmp sgt i32 %N , 0
@@ -130,12 +180,21 @@ cleanup:
130180; CHECK-LABEL: sub_sub_less_3d
131181; CHECK: %k = phi
132182; CHECK-NOT: %k.1 = phi
133-
183+ ;
184+ ; sub_sub_less_3d should NOT be unroll-and-jammed due to a loop-carried dependency.
185+ ; Memory accesses:
186+ ; - A[i][j][k] = 5 (write to current iteration)
187+ ; - A[i+1][j][k-1] = 10 (write to next i iteration, previous k iteration)
188+ ; The dependency: A[i+1][j][k-1] from iteration (i,j,k) may conflict with
189+ ; A[i'][j'][k'] from a later iteration when i'=i+1 and k'=k-1, creating a
190+ ; backward dependency in the k dimension that prevents safe unroll-and-jam.
191+ ; This is a 3D version of the same pattern as sub_sub_less.
192+ ;
134193; for (long i = 0; i < 100; ++i)
135194; for (long j = 0; j < 100; ++j)
136- ; for (long k = 0 ; k < 100; ++k) {
137- ; A[i][j][k] = 0 ;
138- ; A[i+1][j][k-1] = 0 ;
195+ ; for (long k = 1 ; k < 100; ++k) {
196+ ; A[i][j][k] = 5 ;
197+ ; A[i+1][j][k-1] = 10 ;
139198; }
140199
141200define void @sub_sub_less_3d (ptr noalias %A ) {
@@ -151,13 +210,13 @@ for.j:
151210 br label %for.k
152211
153212for.k:
154- %k = phi i32 [ 0 , %for.j ], [ %inc.k , %for.k ]
213+ %k = phi i32 [ 1 , %for.j ], [ %inc.k , %for.k ]
155214 %arrayidx = getelementptr inbounds [100 x [100 x i32 ]], ptr %A , i32 %i , i32 %j , i32 %k
156- store i32 0 , ptr %arrayidx , align 4
215+ store i32 5 , ptr %arrayidx , align 4
157216 %add.i = add nsw i32 %i , 1
158217 %sub.k = add nsw i32 %k , -1
159218 %arrayidx2 = getelementptr inbounds [100 x [100 x i32 ]], ptr %A , i32 %add.i , i32 %j , i32 %sub.k
160- store i32 0 , ptr %arrayidx2 , align 4
219+ store i32 10 , ptr %arrayidx2 , align 4
161220 %inc.k = add nsw i32 %k , 1
162221 %cmp.k = icmp slt i32 %inc.k , 100
163222 br i1 %cmp.k , label %for.k , label %for.j.latch
@@ -178,8 +237,33 @@ for.end:
178237
179238; CHECK-LABEL: sub_sub_outer_scalar
180239; CHECK: %k = phi
181- ; CHECK-NOT: %k.1 = phi
182-
240+ ; CHECK: %k.1 = phi
241+ ; CHECK: %k.2 = phi
242+ ; CHECK: %k.3 = phi
243+ ;
244+ ; sub_sub_outer_scalar SHOULD be unroll-and-jammed (count=4) as it's safe.
245+ ; Memory accesses:
246+ ; - load from A[j][k] (read from current j iteration)
247+ ; - store to A[j-1][k] (write to previous j iteration)
248+ ; The dependency: reading A[j][k] and writing A[j-1][k] creates a backward
249+ ; dependency, but execution order is preserved. Unroll-and-jam factor 4:
250+ ;
251+ ; for (int i = 0; i < 100; i++)
252+ ; for (int j = 1; j < 100; j += 4)
253+ ; for (int k = 0; k < 100; k++) {
254+ ; // j iteration
255+ ; temp0 = A[j][k]; A[j-1][k] = temp0;
256+ ; // j+1 iteration
257+ ; temp1 = A[j+1][k]; A[j][k] = temp1;
258+ ; // j+2 iteration
259+ ; temp2 = A[j+2][k]; A[j+1][k] = temp2;
260+ ; // j+3 iteration
261+ ; temp3 = A[j+3][k]; A[j+2][k] = temp3;
262+ ; }
263+ ;
264+ ; All four copies (j, j+1, j+2, j+3) share one k loop. In each k iteration, j's
265+ ; load from A[j][k] still executes before j+1's store to A[j][k], exactly as in
266+ ; the original loop nest, so the anti-dependency between them is preserved.
183267define void @sub_sub_outer_scalar (ptr %A ) {
184268entry:
185269 br label %for.i
@@ -189,7 +273,7 @@ for.i:
189273 br label %for.j
190274
191275for.j:
192- %j = phi i64 [ 0 , %for.i ], [ %inc.j , %for.j.latch ]
276+ %j = phi i64 [ 1 , %for.i ], [ %inc.j , %for.j.latch ]
193277 br label %for.k
194278
195279for.k:
0 commit comments