@@ -48,22 +48,13 @@ entry:
4848vector.ph: ; preds = %entry
4949 %n.rnd.up = add i32 %N , 3
5050 %n.vec = and i32 %n.rnd.up , -4
51- %trip.count.minus.1 = add i32 %N , -1
52- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
53- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
5451 br label %vector.body
5552
5653vector.body: ; preds = %vector.body, %vector.ph
5754 %index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
5855 %vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
59- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
60- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
61- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
6256 %tmp = getelementptr inbounds i32 , i32* %a , i32 %index
63-
64- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
6557 %tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
66-
6758 %tmp2 = bitcast i32* %tmp to <4 x i32 >*
6859 %wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
6960 %tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -147,22 +138,13 @@ entry:
147138vector.ph: ; preds = %entry
148139 %n.rnd.up = add i32 %N , 3
149140 %n.vec = and i32 %n.rnd.up , -4
150- %trip.count.minus.1 = add i32 %N , -1
151- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
152- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
153141 br label %vector.body
154142
155143vector.body: ; preds = %vector.body, %vector.ph
156144 %index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
157145 %vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
158- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
159- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
160- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
161146 %tmp = getelementptr inbounds i32 , i32* %a , i32 %index
162-
163- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
164147 %tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
165-
166148 %tmp2 = bitcast i32* %tmp to <4 x i32 >*
167149 %wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
168150 %tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -205,13 +187,12 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
205187; CHECK-NEXT: cmp.w r12, #0
206188; CHECK-NEXT: beq .LBB2_4
207189; CHECK-NEXT: @ %bb.1: @ %vector.ph
208- ; CHECK-NEXT: add.w r4, r12, #3
209- ; CHECK-NEXT: vmov.i32 q1, #0x0
210- ; CHECK-NEXT: bic r4, r4, #3
211- ; CHECK-NEXT: sub.w lr, r4, #4
190+ ; CHECK-NEXT: add.w lr, r12, #3
212191; CHECK-NEXT: movs r4, #1
192+ ; CHECK-NEXT: bic lr, lr, #3
193+ ; CHECK-NEXT: vmov.i32 q1, #0x0
194+ ; CHECK-NEXT: sub.w lr, lr, #4
213195; CHECK-NEXT: add.w lr, r4, lr, lsr #2
214- ; CHECK-NEXT: movs r4, #0
215196; CHECK-NEXT: dls lr, lr
216197; CHECK-NEXT: .LBB2_2: @ %vector.body
217198; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -222,12 +203,11 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
222203; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
223204; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
224205; CHECK-NEXT: vsub.i32 q1, q2, q1
225- ; CHECK-NEXT: adds r4 , #4
206+ ; CHECK-NEXT: sub.w r12, r12 , #4
226207; CHECK-NEXT: vpsttt
227208; CHECK-NEXT: vcmpt.i32 eq, q1, zr
228209; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
229210; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
230- ; CHECK-NEXT: sub.w r12, r12, #4
231211; CHECK-NEXT: vmul.i32 q1, q2, q1
232212; CHECK-NEXT: vadd.i32 q1, q1, q0
233213; CHECK-NEXT: le lr, .LBB2_2
@@ -249,22 +229,13 @@ entry:
249229vector.ph: ; preds = %entry
250230 %n.rnd.up = add i32 %N , 3
251231 %n.vec = and i32 %n.rnd.up , -4
252- %trip.count.minus.1 = add i32 %N , -1
253- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
254- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
255232 br label %vector.body
256233
257234vector.body: ; preds = %vector.body, %vector.ph
258235 %index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
259236 %vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
260- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
261- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
262- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
263237 %tmp = getelementptr inbounds i32 , i32* %a , i32 %index
264-
265- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
266238 %tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
267-
268239 %tmp2 = bitcast i32* %tmp to <4 x i32 >*
269240 %wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
270241 %tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -304,13 +275,12 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
304275; CHECK-NEXT: cmp.w r12, #0
305276; CHECK-NEXT: beq .LBB3_4
306277; CHECK-NEXT: @ %bb.1: @ %vector.ph
307- ; CHECK-NEXT: add.w r4, r12, #3
308- ; CHECK-NEXT: vmov.i32 q1, #0x0
309- ; CHECK-NEXT: bic r4, r4, #3
310- ; CHECK-NEXT: sub.w lr, r4, #4
278+ ; CHECK-NEXT: add.w lr, r12, #3
311279; CHECK-NEXT: movs r4, #1
280+ ; CHECK-NEXT: bic lr, lr, #3
281+ ; CHECK-NEXT: vmov.i32 q1, #0x0
282+ ; CHECK-NEXT: sub.w lr, lr, #4
312283; CHECK-NEXT: add.w lr, r4, lr, lsr #2
313- ; CHECK-NEXT: movs r4, #0
314284; CHECK-NEXT: dls lr, lr
315285; CHECK-NEXT: .LBB3_2: @ %vector.body
316286; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -326,9 +296,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
326296; CHECK-NEXT: vcmpt.i32 ne, q1, zr
327297; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
328298; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
329- ; CHECK-NEXT: adds r4, #4
330- ; CHECK-NEXT: vmul.i32 q1, q2, q1
331299; CHECK-NEXT: sub.w r12, r12, #4
300+ ; CHECK-NEXT: vmul.i32 q1, q2, q1
332301; CHECK-NEXT: vadd.i32 q1, q1, q0
333302; CHECK-NEXT: le lr, .LBB3_2
334303; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -348,22 +317,13 @@ entry:
348317vector.ph: ; preds = %entry
349318 %n.rnd.up = add i32 %N , 3
350319 %n.vec = and i32 %n.rnd.up , -4
351- %trip.count.minus.1 = add i32 %N , -1
352- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
353- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
354320 br label %vector.body
355321
356322vector.body: ; preds = %vector.body, %vector.ph
357323 %index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
358324 %vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
359- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
360- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
361- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
362325 %tmp = getelementptr inbounds i32 , i32* %a , i32 %index
363-
364- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
365326 %tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
366-
367327 %tmp2 = bitcast i32* %tmp to <4 x i32 >*
368328 %wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
369329 %tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -402,11 +362,9 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
402362; CHECK-NEXT: it eq
403363; CHECK-NEXT: popeq {r7, pc}
404364; CHECK-NEXT: .LBB4_1: @ %bb3
405- ; CHECK-NEXT: movs r3, #0
406365; CHECK-NEXT: dlstp.32 lr, r2
407366; CHECK-NEXT: .LBB4_2: @ %bb9
408367; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
409- ; CHECK-NEXT: adds r3, #4
410368; CHECK-NEXT: vldrw.u32 q0, [r1], #16
411369; CHECK-NEXT: vpt.i32 ne, q0, zr
412370; CHECK-NEXT: vldrwt.u32 q1, [r0]
@@ -423,21 +381,12 @@ bb:
423381bb3: ; preds = %bb
424382 %tmp4 = add i32 %arg2 , 3
425383 %tmp5 = and i32 %tmp4 , -4
426- %tmp6 = add i32 %arg2 , -1
427- %tmp7 = insertelement <4 x i32 > undef , i32 %tmp6 , i32 0
428- %tmp8 = shufflevector <4 x i32 > %tmp7 , <4 x i32 > undef , <4 x i32 > zeroinitializer
429384 br label %bb9
430385
431386bb9: ; preds = %bb9, %bb3
432387 %tmp10 = phi i32 [ 0 , %bb3 ], [ %tmp25 , %bb9 ]
433- %tmp11 = insertelement <4 x i32 > undef , i32 %tmp10 , i32 0
434- %tmp12 = shufflevector <4 x i32 > %tmp11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
435- %tmp13 = add <4 x i32 > %tmp12 , <i32 0 , i32 1 , i32 2 , i32 3 >
436388 %tmp14 = getelementptr inbounds i32 , i32* %arg1 , i32 %tmp10
437-
438- ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
439389 %tmp15 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %tmp10 , i32 %arg2 )
440-
441390 %tmp16 = bitcast i32* %tmp14 to <4 x i32 >*
442391 %tmp17 = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp16 , i32 4 , <4 x i1 > %tmp15 , <4 x i32 > undef )
443392 %tmp18 = icmp ne <4 x i32 > %tmp17 , zeroinitializer
@@ -464,15 +413,13 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
464413; CHECK-NEXT: it eq
465414; CHECK-NEXT: popeq {r7, pc}
466415; CHECK-NEXT: .LBB5_1: @ %bb4
467- ; CHECK-NEXT: mov.w r12, #0
468416; CHECK-NEXT: dlstp.32 lr, r3
469417; CHECK-NEXT: .LBB5_2: @ %bb12
470418; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
471419; CHECK-NEXT: vldrw.u32 q0, [r0]
472420; CHECK-NEXT: vptt.i32 ne, q0, zr
473421; CHECK-NEXT: vcmpt.s32 le, q0, r2
474422; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
475- ; CHECK-NEXT: add.w r12, r12, #4
476423; CHECK-NEXT: vmul.i32 q0, q1, q0
477424; CHECK-NEXT: vpst
478425; CHECK-NEXT: vstrwt.32 q0, [r0], #16
@@ -486,23 +433,14 @@ bb:
486433bb4: ; preds = %bb
487434 %tmp5 = add i32 %arg3 , 3
488435 %tmp6 = and i32 %tmp5 , -4
489- %tmp7 = add i32 %arg3 , -1
490- %tmp8 = insertelement <4 x i32 > undef , i32 %tmp7 , i32 0
491- %tmp9 = shufflevector <4 x i32 > %tmp8 , <4 x i32 > undef , <4 x i32 > zeroinitializer
492436 %tmp10 = insertelement <4 x i32 > undef , i32 %arg2 , i32 0
493437 %tmp11 = shufflevector <4 x i32 > %tmp10 , <4 x i32 > undef , <4 x i32 > zeroinitializer
494438 br label %bb12
495439
496440bb12: ; preds = %bb12, %bb4
497441 %tmp13 = phi i32 [ 0 , %bb4 ], [ %tmp30 , %bb12 ]
498- %tmp14 = insertelement <4 x i32 > undef , i32 %tmp13 , i32 0
499- %tmp15 = shufflevector <4 x i32 > %tmp14 , <4 x i32 > undef , <4 x i32 > zeroinitializer
500- %tmp16 = add <4 x i32 > %tmp15 , <i32 0 , i32 1 , i32 2 , i32 3 >
501442 %tmp17 = getelementptr inbounds i32 , i32* %arg , i32 %tmp13
502-
503- ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
504443 %tmp18 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %tmp13 , i32 %arg3 )
505-
506444 %tmp19 = bitcast i32* %tmp17 to <4 x i32 >*
507445 %tmp20 = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp19 , i32 4 , <4 x i1 > %tmp18 , <4 x i32 > undef )
508446 %tmp21 = icmp ne <4 x i32 > %tmp20 , zeroinitializer
0 commit comments