|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| 2 | +; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0 |
| 3 | +; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1 |
| 4 | + |
| 5 | +declare void @private_za_call() |
| 6 | +declare void @shared_za_call() "aarch64_inout_za" |
| 7 | + |
| 8 | +; This test checks that at -O0 we don't attempt to optimize lazy save state |
| 9 | +; changes in loops, and that at -O1 (and above) we attempt to push state changes
| 10 | +; out of loops. |
| 11 | + |
| 12 | +define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
; NOTE(review): -O0 expectation — the lazy-save state machine is NOT hoisted.
; Each loop iteration sets up the lazy save (msr TPIDR2_EL0, x8) before
; bl private_za_call, then re-enables ZA (smstart za) and conditionally
; restores it (__arm_tpidr2_restore when TPIDR2_EL0 reads back as zero)
; before the back-edge.  NZCV is spilled/reloaded around the restore since
; the loop-exit compare happens before the ZA restore sequence.
| 13 | +; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
| 14 | +; CHECK-O0: // %bb.0: // %entry
| 15 | +; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
| 16 | +; CHECK-O0-NEXT: mov x29, sp
| 17 | +; CHECK-O0-NEXT: sub sp, sp, #32
| 18 | +; CHECK-O0-NEXT: rdsvl x9, #1
| 19 | +; CHECK-O0-NEXT: mov x8, sp
| 20 | +; CHECK-O0-NEXT: msub x8, x9, x9, x8
| 21 | +; CHECK-O0-NEXT: mov sp, x8
| 22 | +; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
| 23 | +; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
| 24 | +; CHECK-O0-NEXT: bl shared_za_call
| 25 | +; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
| 26 | +; CHECK-O0-NEXT: mov w8, wzr
| 27 | +; CHECK-O0-NEXT: subs w9, w0, #1
| 28 | +; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
| 29 | +; CHECK-O0-NEXT: b.lt .LBB0_4
| 30 | +; CHECK-O0-NEXT: b .LBB0_1
| 31 | +; CHECK-O0-NEXT: .LBB0_1: // %loop
| 32 | +; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
| 33 | +; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
| 34 | +; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
| 35 | +; CHECK-O0-NEXT: sub x8, x29, #16
| 36 | +; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
| 37 | +; CHECK-O0-NEXT: bl private_za_call
| 38 | +; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
| 39 | +; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
| 40 | +; CHECK-O0-NEXT: add w9, w8, #1
| 41 | +; CHECK-O0-NEXT: mov w8, w9
| 42 | +; CHECK-O0-NEXT: subs w9, w9, w10
| 43 | +; CHECK-O0-NEXT: mrs x9, NZCV
| 44 | +; CHECK-O0-NEXT: smstart za
| 45 | +; CHECK-O0-NEXT: mrs x10, TPIDR2_EL0
| 46 | +; CHECK-O0-NEXT: sub x0, x29, #16
| 47 | +; CHECK-O0-NEXT: cbz x10, .LBB0_2
| 48 | +; CHECK-O0-NEXT: b .LBB0_3
| 49 | +; CHECK-O0-NEXT: .LBB0_2: // %loop
| 50 | +; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
| 51 | +; CHECK-O0-NEXT: bl __arm_tpidr2_restore
| 52 | +; CHECK-O0-NEXT: b .LBB0_3
| 53 | +; CHECK-O0-NEXT: .LBB0_3: // %loop
| 54 | +; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
| 55 | +; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
| 56 | +; CHECK-O0-NEXT: msr NZCV, x9
| 57 | +; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
| 58 | +; CHECK-O0-NEXT: b.ne .LBB0_1
| 59 | +; CHECK-O0-NEXT: b .LBB0_4
| 60 | +; CHECK-O0-NEXT: .LBB0_4: // %exit
| 61 | +; CHECK-O0-NEXT: mov sp, x29
| 62 | +; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
| 63 | +; CHECK-O0-NEXT: b shared_za_call
| 64 | +;
; NOTE(review): -O1 expectation — state changes are pushed out of the loop.
; TPIDR2_EL0 is set once before entering .LBB0_1; the loop body is just
; bl private_za_call plus the counter decrement; the smstart za +
; conditional __arm_tpidr2_restore + TPIDR2_EL0 clear happen once in %exit.
| 65 | +; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
| 66 | +; CHECK-O1: // %bb.0: // %entry
| 67 | +; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
| 68 | +; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
| 69 | +; CHECK-O1-NEXT: mov x29, sp
| 70 | +; CHECK-O1-NEXT: sub sp, sp, #16
| 71 | +; CHECK-O1-NEXT: rdsvl x8, #1
| 72 | +; CHECK-O1-NEXT: mov x9, sp
| 73 | +; CHECK-O1-NEXT: msub x9, x8, x8, x9
| 74 | +; CHECK-O1-NEXT: mov sp, x9
| 75 | +; CHECK-O1-NEXT: mov w19, w0
| 76 | +; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
| 77 | +; CHECK-O1-NEXT: bl shared_za_call
| 78 | +; CHECK-O1-NEXT: cmp w19, #1
| 79 | +; CHECK-O1-NEXT: sub x8, x29, #16
| 80 | +; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
| 81 | +; CHECK-O1-NEXT: b.lt .LBB0_2
| 82 | +; CHECK-O1-NEXT: .LBB0_1: // %loop
| 83 | +; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
| 84 | +; CHECK-O1-NEXT: bl private_za_call
| 85 | +; CHECK-O1-NEXT: subs w19, w19, #1
| 86 | +; CHECK-O1-NEXT: b.ne .LBB0_1
| 87 | +; CHECK-O1-NEXT: .LBB0_2: // %exit
| 88 | +; CHECK-O1-NEXT: smstart za
| 89 | +; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
| 90 | +; CHECK-O1-NEXT: sub x0, x29, #16
| 91 | +; CHECK-O1-NEXT: cbnz x8, .LBB0_4
| 92 | +; CHECK-O1-NEXT: // %bb.3: // %exit
| 93 | +; CHECK-O1-NEXT: bl __arm_tpidr2_restore
| 94 | +; CHECK-O1-NEXT: .LBB0_4: // %exit
| 95 | +; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
| 96 | +; CHECK-O1-NEXT: mov sp, x29
| 97 | +; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
| 98 | +; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
| 99 | +; CHECK-O1-NEXT: b shared_za_call
; NOTE(review): IR shape — ZA is "active" at entry and exit (shared_za_call
; brackets the function), while the loop makes only private-ZA calls; this is
; what forces a lazy-save state change on every path through the loop.
; %n <= 0 skips the loop entirely (entry branches straight to %exit).
| 100 | +entry:
| 101 | + %cmpgt = icmp sgt i32 %n, 0
| 102 | + tail call void @shared_za_call()
| 103 | + br i1 %cmpgt, label %loop, label %exit
| 104 | +
| 105 | +loop:
| 106 | + %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
| 107 | + tail call void @private_za_call()
| 108 | + %next_iv = add nuw nsw i32 %iv, 1
| 109 | + %cmpeq = icmp eq i32 %next_iv, %n
| 110 | + br i1 %cmpeq, label %exit, label %loop
| 111 | +
| 112 | +exit:
| 113 | + tail call void @shared_za_call()
| 114 | + ret void
| 115 | +}
0 commit comments