55@A = dso_local global [256 x [256 x float ]] zeroinitializer
66@B = dso_local global [256 x [256 x float ]] zeroinitializer
77@C = dso_local global [256 x [256 x float ]] zeroinitializer
8- @D = dso_local global [256 x [256 x [256 x float ]]] zeroinitializer
9- @E = dso_local global [256 x [256 x [256 x float ]]] zeroinitializer
8+ @D = global [256 x [256 x [256 x float ]]] zeroinitializer
9+ @E = global [256 x [256 x [256 x float ]]] zeroinitializer
1010
1111; Check that the below loops are exchanged for vectorization.
1212;
@@ -107,7 +107,8 @@ exit:
107107; Check that the below loops are exchanged to allow innermost loop
108108; vectorization. We cannot vectorize the j-loop because it has a lexically
109109; backward dependency, but the i-loop can be vectorized because all the
110- ; loop-carried dependencies are lexically forward.
110+ ; loop-carried dependencies are lexically forward. LoopVectorize currently only
111+ ; vectorizes the innermost loop, hence move the i-loop to that position.
111112;
112113; for (int i = 0; i < 255; i++) {
113114; for (int j = 1; j < 256; j++) {
@@ -129,50 +130,50 @@ entry:
129130
130131for.i.header:
131132 %i = phi i64 [ 1 , %entry ], [ %i.next , %for.i.inc ]
132- %i.inc = add nsw i64 %i , 1
133+ %i.inc = add i64 %i , 1
133134 br label %for.j.body
134135
135136for.j.body:
136137 %j = phi i64 [ 1 , %for.i.header ], [ %j.next , %for.j.body ]
137- %j.dec = add nsw i64 %j , -1
138- %a.load.index = getelementptr nuw inbounds [256 x [256 x float ]], ptr @A , i64 %i , i64 %j.dec
139- %b.index = getelementptr nuw inbounds [256 x [256 x float ]], ptr @B , i64 %i , i64 %j
140- %c.load.index = getelementptr nuw inbounds [256 x [256 x float ]], ptr @C , i64 %i.inc , i64 %j
141- %c.store.index = getelementptr nuw inbounds [256 x [256 x float ]], ptr @C , i64 %i , i64 %j
142- %a = load float , ptr %a.load.index , align 4
143- %b = load float , ptr %b.index , align 4
144- %c0 = load float , ptr %c.load.index , align 4
145- %c1 = load float , ptr %c.store.index , align 4
138+ %j.dec = add i64 %j , -1
139+ %a.load.index = getelementptr [256 x [256 x float ]], ptr @A , i64 0 , i64 %i , i64 %j.dec
140+ %b.index = getelementptr [256 x [256 x float ]], ptr @B , i64 0 , i64 %i , i64 %j
141+ %c.load.index = getelementptr [256 x [256 x float ]], ptr @C , i64 0 , i64 %i.inc , i64 %j
142+ %c.store.index = getelementptr [256 x [256 x float ]], ptr @C , i64 0 , i64 %i , i64 %j
143+ %a = load float , ptr %a.load.index
144+ %b = load float , ptr %b.index
145+ %c0 = load float , ptr %c.load.index
146+ %c1 = load float , ptr %c.store.index
146147 %add.0 = fadd float %a , %b
147- %a.store.index = getelementptr nuw inbounds [256 x [256 x float ]], ptr @A , i64 %i , i64 %j
148- store float %add.0 , ptr %a.store.index , align 4
148+ %a.store.index = getelementptr [256 x [256 x float ]], ptr @A , i64 0 , i64 %i , i64 %j
149+ store float %add.0 , ptr %a.store.index
149150 %add.1 = fadd float %c0 , %c1
150- store float %add.1 , ptr %c.store.index , align 4
151- %j.next = add nuw nsw i64 %j , 1
151+ store float %add.1 , ptr %c.store.index
152+ %j.next = add i64 %j , 1
152153 %cmp.j = icmp eq i64 %j.next , 256
153154 br i1 %cmp.j , label %for.i.inc , label %for.j.body
154155
155156for.i.inc:
156- %i.next = add nuw nsw i64 %i , 1
157+ %i.next = add i64 %i , 1
157158 %cmp.i = icmp eq i64 %i.next , 255
158159 br i1 %cmp.i , label %exit , label %for.i.header
159160
160161exit:
161162 ret void
162163}
163164
164- ; Check that no interchange is performed for the following loop. The j-loop is
165- ; vectorizable because all the dependencies are lexically forward. However, at
166- ; the moment, we don't analyze an execution order between instructions in
167- ; different BBs, so fail to determine that the j-loop is vectorizable.
168- ; Therefore, no exchange is performed .
165+ ; Check that no interchange is performed for the following loop. Interchanging
166+ ; the j-loop and k-loop makes the innermost loop vectorizable, since the j-loop
167+ ; has only forward dependencies. However, at the moment, a loop body consisting
168+ ; of multiple BBs is handled pessimistically. Hence the j-loop isn't moved to
169+ ; the innermost place .
169170;
170171; for (int i = 0; i < 255; i++) {
171172; for (int j = 0; j < 255; j++) {
172173; for (int k = 0; k < 128; k++) {
173174; E[i][j][k] = D[i+1][j+1][2*k];
174175; if (cond)
175- ; D[i][j][k+1] + = 1.0;
176+ ; D[i][j][k+1] = 1.0;
176177; }
177178; }
178179
@@ -194,30 +195,28 @@ entry:
194195
195196for.i.header:
196197 %i = phi i64 [ 0 , %entry ], [ %i.inc , %for.i.inc ]
197- %i.inc = add nsw i64 %i , 1
198+ %i.inc = add i64 %i , 1
198199 br label %for.j.header
199200
200201for.j.header:
201202 %j = phi i64 [ 0 , %for.i.header ], [ %j.inc , %for.j.inc ]
202- %j.inc = add nsw i64 %j , 1
203+ %j.inc = add i64 %j , 1
203204 br label %for.k.body
204205
205206for.k.body:
206207 %k = phi i64 [ 0 , %for.j.header ], [ %k.inc , %for.k.inc ]
207- %k.inc = add nsw i64 %k , 1
208- %k.2 = mul nsw i64 %k , 2
209- %d.index = getelementptr nuw inbounds [256 x [256 x [256 x float ]]], ptr @D , i64 %i.inc , i64 %j.inc , i64 %k.2
210- %e.index = getelementptr nuw inbounds [256 x [256 x [256 x float ]]], ptr @E , i64 %i , i64 %j , i64 %k
211- %d.load = load float , ptr %d.index , align 4
212- store float %d.load , ptr %e.index , align 4
208+ %k.inc = add i64 %k , 1
209+ %k.2 = mul i64 %k , 2
210+ %d.index = getelementptr [256 x [256 x [256 x float ]]], ptr @D , i64 0 , i64 %i.inc , i64 %j.inc , i64 %k.2
211+ %e.index = getelementptr [256 x [256 x [256 x float ]]], ptr @E , i64 0 , i64 %i , i64 %j , i64 %k
212+ %d.load = load float , ptr %d.index
213+ store float %d.load , ptr %e.index
213214 %cond = freeze i1 undef
214215 br i1 %cond , label %if.then , label %for.k.inc
215216
216217if.then:
217- %d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float ]]], ptr @D , i64 %i , i64 %j , i64 %k.inc
218- %d.load2 = load float , ptr %d.index2 , align 4
219- %add = fadd float %d.load2 , 1 .0
220- store float %add , ptr %d.index2 , align 4
218+ %d.index2 = getelementptr [256 x [256 x [256 x float ]]], ptr @D , i64 0 , i64 %i , i64 %j , i64 %k.inc
219+ store float 1 .0 , ptr %d.index2
221220 br label %for.k.inc
222221
223222for.k.inc:
0 commit comments