-
Notifications
You must be signed in to change notification settings - Fork 15k
ARM: Enable terminal rule #165958
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/arsenm/x86/enable-terminal-rule
Are you sure you want to change the base?
ARM: Enable terminal rule #165958
Conversation
|
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-llvm-regalloc Author: Matt Arsenault (arsenm) Changes: Patch is 43.47 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/165958.diff — 14 Files Affected:
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 4a0883cc662e7..34baa3108402c 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -377,6 +377,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
bool isRWPI() const;
bool useMachineScheduler() const { return UseMISched; }
+ bool enableTerminalRule() const override { return true; }
bool useMachinePipeliner() const { return UseMIPipeliner; }
bool hasMinSize() const { return OptMinSize; }
bool isThumb1Only() const { return isThumb() && !hasThumb2(); }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
index 79665af17ef58..9632469261f4d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
@@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) {
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #126
-; CHECK-NEXT: adr r2, .LCPI0_0
-; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: mov.w r2, #500
-; CHECK-NEXT: vdup.32 q1, r2
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: adr r1, .LCPI0_0
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: mov.w r1, #500
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vdup.32 q1, r1
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vqadd.u32 q2, q0, r1
-; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: vqadd.u32 q2, q0, r2
+; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: vptt.u32 hi, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vaddvat.u32 r2, q2
+; CHECK-NEXT: vaddvat.u32 r12, q2
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
index ec257bcf123f3..bcedcd40ba112 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
@@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
; CHECK-NEXT: str r6, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: .LBB0_5: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r11, [r0, #16]!
-; CHECK-NEXT: ldrd r5, r7, [r0, #-12]
+; CHECK-NEXT: ldrd r5, r6, [r0, #-12]
; CHECK-NEXT: ldr r4, [r0, #-4]
; CHECK-NEXT: cmp r12, r5
; CHECK-NEXT: csel r5, r5, r12, gt
-; CHECK-NEXT: csinc r6, r10, r8, le
-; CHECK-NEXT: cmp r5, r7
+; CHECK-NEXT: csinc r7, r10, r8, le
+; CHECK-NEXT: cmp r5, r6
; CHECK-NEXT: it gt
-; CHECK-NEXT: addgt.w r6, r8, #2
-; CHECK-NEXT: csel r7, r7, r5, gt
-; CHECK-NEXT: cmp r7, r4
+; CHECK-NEXT: addgt.w r7, r8, #2
+; CHECK-NEXT: csel r6, r6, r5, gt
+; CHECK-NEXT: cmp r6, r4
; CHECK-NEXT: it gt
-; CHECK-NEXT: addgt.w r6, r8, #3
-; CHECK-NEXT: csel r7, r4, r7, gt
+; CHECK-NEXT: addgt.w r7, r8, #3
+; CHECK-NEXT: csel r6, r4, r6, gt
; CHECK-NEXT: add.w r8, r8, #4
-; CHECK-NEXT: cmp r7, r11
-; CHECK-NEXT: csel r10, r8, r6, gt
-; CHECK-NEXT: csel r12, r11, r7, gt
+; CHECK-NEXT: cmp r6, r11
+; CHECK-NEXT: csel r10, r8, r7, gt
+; CHECK-NEXT: csel r12, r11, r6, gt
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 1769c5d2fd385..98e082be4cad1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: it lt
; ENABLED-NEXT: bxlt lr
; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; ENABLED-NEXT: mov r11, r0
-; ENABLED-NEXT: ldr r0, [sp, #32]
+; ENABLED-NEXT: ldr r0, [sp, #36]
; ENABLED-NEXT: add.w r9, r2, #3
; ENABLED-NEXT: mov.w r12, #0
+; ENABLED-NEXT: mov.w r8, #1
; ENABLED-NEXT: mov r10, r11
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
@@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
-; ENABLED-NEXT: movs r7, #1
-; ENABLED-NEXT: subs r0, #4
; ENABLED-NEXT: sub.w r4, r2, r12
+; ENABLED-NEXT: subs r0, #4
; ENABLED-NEXT: vmov.i32 q1, #0x0
-; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
+; ENABLED-NEXT: mov r7, r10
+; ENABLED-NEXT: add.w r6, r8, r0, lsr #2
; ENABLED-NEXT: adds r0, r2, #3
; ENABLED-NEXT: sub.w r0, r0, r12
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
-; ENABLED-NEXT: mov r7, r10
-; ENABLED-NEXT: dls lr, r0
+; ENABLED-NEXT: add.w lr, r8, r0, lsr #2
; ENABLED-NEXT: mov r0, r11
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
; ENABLED-NEXT: .LBB0_8:
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; ENABLED-NEXT: bx lr
;
; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
@@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: it lt
; NOREDUCTIONS-NEXT: bxlt lr
; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; NOREDUCTIONS-NEXT: mov r11, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
; NOREDUCTIONS-NEXT: add.w r9, r2, #3
; NOREDUCTIONS-NEXT: mov.w r12, #0
+; NOREDUCTIONS-NEXT: mov.w r8, #1
; NOREDUCTIONS-NEXT: mov r10, r11
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
@@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
-; NOREDUCTIONS-NEXT: movs r7, #1
-; NOREDUCTIONS-NEXT: subs r0, #4
; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
+; NOREDUCTIONS-NEXT: subs r0, #4
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT: mov r7, r10
+; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2
; NOREDUCTIONS-NEXT: adds r0, r2, #3
; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT: mov r7, r10
-; NOREDUCTIONS-NEXT: dls lr, r0
+; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2
; NOREDUCTIONS-NEXT: mov r0, r11
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
; NOREDUCTIONS-NEXT: .LBB0_8:
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; NOREDUCTIONS-NEXT: bx lr
entry:
%conv = sext i16 %N to i32
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index cbcbf1f392ce8..435acc29f076e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: wls lr, r1, .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: adds r6, r3, #4
-; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: add.w r9, r3, #4
+; CHECK-NEXT: add.w r10, r0, #4
; CHECK-NEXT: mvn r8, #1
-; CHECK-NEXT: @ implicit-def: $r9
+; CHECK-NEXT: @ implicit-def: $r6
; CHECK-NEXT: @ implicit-def: $r4
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r1, [r10]
; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: muls r1, r3, r1
; CHECK-NEXT: adds r4, r4, r1
; CHECK-NEXT: adc.w r1, r2, r1, asr #31
; CHECK-NEXT: adds.w r2, r4, #-2147483648
-; CHECK-NEXT: ldrd r2, r4, [r8]
-; CHECK-NEXT: adc r5, r1, #0
-; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: smull r4, r2, r4, r9
-; CHECK-NEXT: asrs r1, r5, #31
+; CHECK-NEXT: ldrd r5, r4, [r8]
+; CHECK-NEXT: adc r2, r1, #0
; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: subs r4, r5, r4
-; CHECK-NEXT: sbcs r1, r2
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds.w r10, r4, #-2147483648
-; CHECK-NEXT: adc r1, r1, #0
-; CHECK-NEXT: ldr r4, [r2, #-4]
+; CHECK-NEXT: smull r4, r5, r4, r6
+; CHECK-NEXT: asrs r1, r2, #31
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: subs r4, r2, r4
+; CHECK-NEXT: sbcs r1, r5
+; CHECK-NEXT: adds.w r6, r4, #-2147483648
+; CHECK-NEXT: ldr r4, [r10, #-4]
+; CHECK-NEXT: adc r11, r1, #0
+; CHECK-NEXT: mov r1, r9
+; CHECK-NEXT: add.w r10, r10, #4
; CHECK-NEXT: muls r4, r3, r4
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: adds.w r12, r4, #-2147483648
; CHECK-NEXT: asr.w r5, r4, #31
-; CHECK-NEXT: ldr r4, [r6]
+; CHECK-NEXT: ldr.w r4, [r9]
; CHECK-NEXT: adc r5, r5, #0
; CHECK-NEXT: mul r2, r4, r0
-; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: add.w r2, r2, #-2147483648
; CHECK-NEXT: asrl r12, r5, r2
-; CHECK-NEXT: smull r2, r5, r4, r12
-; CHECK-NEXT: lsll r2, r5, #30
-; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: asr.w r11, r5, #31
-; CHECK-NEXT: mov r12, r5
-; CHECK-NEXT: lsll r12, r11, r4
-; CHECK-NEXT: mul r2, r2, r9
-; CHECK-NEXT: lsrl r12, r11, #2
-; CHECK-NEXT: adds r2, #2
-; CHECK-NEXT: lsll r12, r11, r2
+; CHECK-NEXT: smull r2, r9, r4, r12
+; CHECK-NEXT: mov r12, r0
+; CHECK-NEXT: lsll r2, r9, #30
+; CHECK-NEXT: asr.w r5, r9, #31
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r9, r1
+; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: lsll r2, r5, r4
+; CHECK-NEXT: lsrl r2, r5, #2
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: lsll r2, r5, r0
+; CHECK-NEXT: add.w r0, r2, #-2147483648
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r12, #-2147483648
-; CHECK-NEXT: asrl r10, r1, r5
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: lsrl r10, r1, #2
-; CHECK-NEXT: movs r1, #2
-; CHECK-NEXT: mov r9, r10
-; CHECK-NEXT: str.w r10, [r1]
-; CHECK-NEXT: ldr r1, [r8], #-4
-; CHECK-NEXT: mls r5, r1, r4, r5
-; CHECK-NEXT: adds.w r4, r5, #-2147483648
-; CHECK-NEXT: asr.w r1, r5, #31
+; CHECK-NEXT: asrl r6, r11, r0
+; CHECK-NEXT: movs r0, #2
+; CHECK-NEXT: lsrl r6, r11, #2
+; CHECK-NEXT: str r6, [r0]
+; CHECK-NEXT: ldr r0, [r8], #-4
+; CHECK-NEXT: mls r0, r0, r4, r1
+; CHECK-NEXT: adds.w r4, r0, #-2147483648
+; CHECK-NEXT: asr.w r1, r0, #31
; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: lsrl r4, r1, #2
-; CHECK-NEXT: rsbs r1, r4, #0
-; CHECK-NEXT: str r1, [r2]
-; CHECK-NEXT: str r1, [r6, #-4]
-; CHECK-NEXT: adds r6, #4
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: rsbs r0, r4, #0
+; CHECK-NEXT: str r0, [r2]
+; CHECK-NEXT: str r0, [r9, #-4]
+; CHECK-NEXT: add.w r9, r9, #4
+; CHECK-NEXT: add.w r0, r12, #4
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: .LBB2_3: @ %while.end
; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index f7b4548f127bf..b6657d607ce6d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: ldrd r7, r9, [r0]
-; CHECK-NEXT: and r6, r3, #3
-; CHECK-NEXT: ldr r0, [r0, #8]
-; CHECK-NEXT: lsrs r3, r3, #2
-; CHECK-NEXT: @ implicit-def: $r12
-; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: ldm.w r0, {r7, r9, r11}
+; CHECK-NEXT: and r0, r3, #3
+; CHECK-NEXT: @ implicit-def: $r5
+; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: lsrs r0, r3, #2
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: b .LBB19_3
; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: mov r8, r10
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r12, r10
; CHECK-NEXT: .LBB19_2: @ %if.end69
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r0, #128
-; CHECK-NEXT: strd r2, r4, [r9]
-; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: subs r7, #1
-; CHECK-NEXT: strd r3, r8, [r9, #8]
-; CHECK-NEXT: add.w r9, r9, #16
+; CHECK-NEXT: add.w r11, r11, #128
+; CHECK-NEXT: strd r8, r0, [r9]
; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: strd r3, r12, [r9, #8]
+; CHECK-NEXT: add.w r9, r9, #16
+; CHECK-NEXT: subs r7, #1
; CHECK-NEXT: beq.w .LBB19_13
; CHECK-NEXT: .LBB19_3: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT: ldrd r5, r11, [r9]
+; CHECK-NEXT: ldr.w r10, [r9, #12]
; CHECK-NEXT: mov r6, r2
-; CHECK-NEXT: ldrd r8, r10, [r9, #8]
-; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldm.w r9, {r3, r4, r12}
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: wls lr, r2, .LBB19_6
+; CHECK-NEXT: wls lr, r0, .LBB19_6
; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB19_5: @ %while.body
; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r5, [r1, #12]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT: ldm.w r1, {r2, r7, r11}
-; CHECK-NEXT: vmul.f32 q2, q2, r5
-; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT: vfma.f32 q2, q6, r11
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r8, r4
+; CHECK-NEXT: ldrd r4, r3, [r1, #8]
+; CHECK-NEXT: vldrw.u32 q2, [r11]
+; CHECK-NEXT: vldrw.u32 q6, [r11, #16]
+; CHECK-NEXT: ldrd r0, r7, [r1]
+; CHECK-NEXT: vmul.f32 q2, q2, r3
+; CHECK-NEXT: vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT: vfma.f32 q2, q6, r4
+; CHECK-NEXT: vldrw.u32 q4, [r11, #48]
; CHECK-NEXT: vfma.f32 q2, q7, r7
-; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT: vfma.f32 q2, q4, r2
-; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vfma.f32 q2, q5, r3
-; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT: vfma.f32 q2, q3, r4
-; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT: vfma.f32 q2, q1, r8
+; CHECK-NEXT: vldrw.u32 q5, [r11, #64]
+; CHECK-NEXT: vfma.f32 q2, q4, r0
+; CHECK-NEXT: vldrw.u32 q3, [r11, #80]
+; CHECK-NEXT: vfma.f32 q2, q5, r5
+; CHECK-NEXT: vldrw.u32 q1, [r11, #96]
+; CHECK-NEXT: vfma.f32 q2, q3, r8
+; CHECK-NEXT: vldrw.u32 q0, [r11, #112]
+; CHECK-NEXT: vfma.f32 q2, q1, r12
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vfma.f32 q2, q0, r10
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: vmov r10, r8, d5
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: vmov r10, r12, d5
; CHECK-NEXT: vstrb.8 q2, [r6], #16
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: mov r12, r5
; CHECK-NEXT: le lr, .LBB19_5
; CHECK-NEXT: .LBB19_6: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: beq .LBB19_1
; CHECK-NEXT: @ %bb.7: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: ldrd lr, r4, [r1]
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: ldrd r2, r1, [r1, #8]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: ldrd lr, r0, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r11]
+; CHECK-NEXT: ldrd r8, r1, [r1, #8]
+; CHECK-NEXT: vldrw.u32 q6, [r11, #16]
+; CHECK-NEXT: vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT: vldrw.u32 q4, [r11, #48]
; CHECK-NEXT: vmul.f32 q0, q0, r1
-; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT: vfma.f32 q0, q6, r2
-; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vfma.f32 q0, q7, r4
-; CHECK-NEXT: vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT: vldrw.u32 q5, [r11, #64]
+; CHECK-NEXT: vfma.f32 q0, q6, r8
+; CHECK-NEXT: vldrw.u32 q3, [r11, #80]
+; CHECK-NEXT: vfma.f32 q0, q7, r0
+; CHECK-NEXT: vldrw.u32 q2, [r11, #96]
; CHECK-NEXT: vfma.f32 q0, q4, lr
-; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
-; CHECK-NEXT: vfma.f32 q0, q5, r5
-; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: vfma.f32 q0, q3, r11
-; CHECK-NEXT: vfma.f32 q0, q2, r8
+; CHECK-NEXT: vldrw.u32 q1, [r11, #112]
+; CHECK-NEXT: vfma.f32 q0, q5, r3
+; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: vfma.f32 q0, q3, r4
+; CHECK-NEXT: vfma.f32 q0, q2, r12
; CHECK-NEXT: vfma.f32 q0, q1, r10
-; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: bne .LBB19_9
; CHECK-NEXT: @ %bb.8: @ %if.then58
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: str r5, [r6]
-; CHECK-NEXT: mov r2, lr
-; CHECK-NEXT: mov r4, r12
-; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: str r4, [r6]
+; CHECK-NEXT: mov r8, lr
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r3, r4
; CHECK-NEXT: b .LBB19_12
; CHECK-NEXT: .LBB19_9: @ %if.else
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: vmov r8, s1
-; CHECK-NEXT: cmp r3, #2
+; CHECK-NEXT: vmov r12, s1
+; CHECK-NEXT: cmp r7, #2
; CHECK-NEXT: vstr s1, [r6, #4]
-; CHECK-NEXT: str r5, [r6]
+; CHECK-NEXT: str r4, [r6]
; CHECK-NEXT: bne .LBB19_11
; CHECK-NEXT: @ %bb....
[truncated]
|

No description provided.