@@ -321,26 +321,29 @@ end:
321321 ret void ;
322322}
323323
324- define arm_aapcs_vfpcc void @non_gatscat_use1 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec ) {
324+ define arm_aapcs_vfpcc void @non_gatscat_use1 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec , < 4 x i32 >* %x ) {
325325; CHECK-LABEL: non_gatscat_use1:
326326; CHECK: @ %bb.0: @ %vector.ph
327- ; CHECK-NEXT: .vsave {d8, d9}
328- ; CHECK-NEXT: vpush {d8, d9}
329- ; CHECK-NEXT: adr r3, .LCPI7_0
330- ; CHECK-NEXT: vmov.i32 q0, #0x8
331- ; CHECK-NEXT: vldrw.u32 q2, [r3]
327+ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
328+ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
329+ ; CHECK-NEXT: adr.w r12, .LCPI7_0
330+ ; CHECK-NEXT: vmov.i32 q0, #0x9
331+ ; CHECK-NEXT: vldrw.u32 q3, [r12]
332332; CHECK-NEXT: vmov.i32 q1, #0xc
333+ ; CHECK-NEXT: vmov.i32 q2, #0x8
333334; CHECK-NEXT: .LBB7_1: @ %vector.body
334335; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
335- ; CHECK-NEXT: vadd.i32 q3, q2, q0
336- ; CHECK-NEXT: vmlas.u32 q2, q1, r0
337- ; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
336+ ; CHECK-NEXT: vadd.i32 q4, q3, q2
337+ ; CHECK-NEXT: vmul.i32 q5, q3, q0
338+ ; CHECK-NEXT: vmlas.u32 q3, q1, r0
338339; CHECK-NEXT: subs r2, #4
339- ; CHECK-NEXT: vmov q2, q3
340- ; CHECK-NEXT: vstrb.8 q4, [r1], #16
340+ ; CHECK-NEXT: vldrw.u32 q6, [q3, #24]
341+ ; CHECK-NEXT: vmov q3, q4
342+ ; CHECK-NEXT: vstrw.32 q5, [r3]
343+ ; CHECK-NEXT: vstrb.8 q6, [r1], #16
341344; CHECK-NEXT: bne .LBB7_1
342345; CHECK-NEXT: @ %bb.2: @ %end
343- ; CHECK-NEXT: vpop {d8, d9}
346+ ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
344347; CHECK-NEXT: bx lr
345348; CHECK-NEXT: .p2align 4
346349; CHECK-NEXT: @ %bb.3:
@@ -364,6 +367,7 @@ vector.body: ; preds = %vector.body, %vecto
364367 %4 = bitcast i32* %3 to <4 x i32 >*
365368 store <4 x i32 > %wide.masked.gather , <4 x i32 >* %4 , align 4
366369 %non_gatscat_use = mul <4 x i32 > %0 , <i32 3 , i32 3 , i32 3 , i32 3 >
370+ store <4 x i32 > %non_gatscat_use , <4 x i32 >* %x , align 4
367371 %index.next = add i32 %index , 4
368372 %vec.ind.next = add <4 x i32 > %vec.ind , <i32 8 , i32 8 , i32 8 , i32 8 >
369373 %5 = icmp eq i32 %index.next , %n.vec
@@ -373,26 +377,31 @@ end:
373377 ret void ;
374378}
375379
376- define arm_aapcs_vfpcc void @non_gatscat_use2 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec ) {
380+ define arm_aapcs_vfpcc void @non_gatscat_use2 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec , < 4 x i32 >* %x ) {
377381; CHECK-LABEL: non_gatscat_use2:
378382; CHECK: @ %bb.0: @ %vector.ph
379- ; CHECK-NEXT: .vsave {d8, d9}
380- ; CHECK-NEXT: vpush {d8, d9}
381- ; CHECK-NEXT: adr r3, .LCPI8_0
382- ; CHECK-NEXT: vmov.i32 q0, #0x8
383- ; CHECK-NEXT: vldrw.u32 q2, [r3]
384- ; CHECK-NEXT: vmov.i32 q1, #0xc
383+ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
384+ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
385+ ; CHECK-NEXT: adr.w r12, .LCPI8_0
386+ ; CHECK-NEXT: vmov.i32 q0, #0x12
387+ ; CHECK-NEXT: vldrw.u32 q4, [r12]
388+ ; CHECK-NEXT: vmov.i32 q1, #0x9
389+ ; CHECK-NEXT: vmov.i32 q2, #0x8
390+ ; CHECK-NEXT: vmov.i32 q3, #0xc
385391; CHECK-NEXT: .LBB8_1: @ %vector.body
386392; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
387- ; CHECK-NEXT: vadd.i32 q3, q2, q0
388- ; CHECK-NEXT: vmlas.u32 q2, q1, r0
389- ; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
393+ ; CHECK-NEXT: vadd.i32 q5, q4, q2
394+ ; CHECK-NEXT: vmul.i32 q6, q4, q1
395+ ; CHECK-NEXT: vmlas.u32 q4, q3, r0
390396; CHECK-NEXT: subs r2, #4
391- ; CHECK-NEXT: vmov q2, q3
392- ; CHECK-NEXT: vstrb.8 q4, [r1], #16
397+ ; CHECK-NEXT: vldrw.u32 q7, [q4, #24]
398+ ; CHECK-NEXT: vadd.i32 q4, q6, q0
399+ ; CHECK-NEXT: vstrw.32 q4, [r3]
400+ ; CHECK-NEXT: vmov q4, q5
401+ ; CHECK-NEXT: vstrb.8 q7, [r1], #16
393402; CHECK-NEXT: bne .LBB8_1
394403; CHECK-NEXT: @ %bb.2: @ %end
395- ; CHECK-NEXT: vpop {d8, d9}
404+ ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
396405; CHECK-NEXT: bx lr
397406; CHECK-NEXT: .p2align 4
398407; CHECK-NEXT: @ %bb.3:
@@ -416,6 +425,7 @@ vector.body: ; preds = %vector.body, %vecto
416425 %4 = bitcast i32* %3 to <4 x i32 >*
417426 store <4 x i32 > %wide.masked.gather , <4 x i32 >* %4 , align 4
418427 %non_gatscat_use = mul <4 x i32 > %1 , <i32 3 , i32 3 , i32 3 , i32 3 >
428+ store <4 x i32 > %non_gatscat_use , <4 x i32 >* %x , align 4
419429 %index.next = add i32 %index , 4
420430 %vec.ind.next = add <4 x i32 > %vec.ind , <i32 8 , i32 8 , i32 8 , i32 8 >
421431 %5 = icmp eq i32 %index.next , %n.vec
@@ -844,12 +854,12 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
844854; CHECK-NEXT: add.w r8, r7, #10
845855; CHECK-NEXT: adr r7, .LCPI11_0
846856; CHECK-NEXT: ldr r1, [sp, #96]
847- ; CHECK-NEXT: vdup.32 q1, r2
848- ; CHECK-NEXT: vldrw.u32 q0, [r7]
857+ ; CHECK-NEXT: vdup.32 q0, r2
858+ ; CHECK-NEXT: vldrw.u32 q1, [r7]
849859; CHECK-NEXT: mov.w r10, #0
850860; CHECK-NEXT: mov.w r9, #6
851861; CHECK-NEXT: movs r6, #11
852- ; CHECK-NEXT: vshl.i32 q1, q1, #2
862+ ; CHECK-NEXT: vshl.i32 q0, q0, #2
853863; CHECK-NEXT: movs r5, #0
854864; CHECK-NEXT: .LBB11_1: @ %for.body10.i
855865; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -884,10 +894,10 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
884894; CHECK-NEXT: mul r4, r11, r6
885895; CHECK-NEXT: vdup.32 q3, r5
886896; CHECK-NEXT: vdup.32 q2, r7
887- ; CHECK-NEXT: vadd.i32 q4, q0, r4
897+ ; CHECK-NEXT: vadd.i32 q4, q1, r4
888898; CHECK-NEXT: vmla.u32 q3, q4, r2
889899; CHECK-NEXT: adds r4, #113
890- ; CHECK-NEXT: vadd.i32 q4, q0, r4
900+ ; CHECK-NEXT: vadd.i32 q4, q1, r4
891901; CHECK-NEXT: mov r4, r8
892902; CHECK-NEXT: vmla.u32 q2, q4, r2
893903; CHECK-NEXT: .LBB11_5: @ %vector.body
@@ -897,8 +907,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
897907; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
898908; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
899909; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
900- ; CHECK-NEXT: vadd.i32 q5, q2, q1
901- ; CHECK-NEXT: vadd.i32 q4, q3, q1
910+ ; CHECK-NEXT: vadd.i32 q5, q2, q0
911+ ; CHECK-NEXT: vadd.i32 q4, q3, q0
902912; CHECK-NEXT: subs r4, #4
903913; CHECK-NEXT: vadd.i32 q2, q6, r2
904914; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
0 commit comments