From 72fc8f4b154893b829aebe9f811f0dc57f65eb05 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 22 May 2025 11:23:25 +0000 Subject: [PATCH 1/4] [AArch64][SME] Simplify initialization of TPIDR2 block This patch updates the definition of `AArch64ISD::INIT_TPIDR2OBJ` to take the number of save slices (which is currently always all ZA slices). Using this, we can initialize the TPIDR2 block with a single STP of the save buffer pointer and the number of save slices. The reserved bytes (10-15) will be implicitly zeroed because the result of RDSVL always fits in 16 bits. Using an STP is also possible for big-endian targets with an additional left shift. Note: We used to write the number of save slices to the TPIDR2 block before every call with a lazy save; however, based on 6.6.9 "Changes to the TPIDR2 block" in the aapcs64 (https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#changes-to-the-tpidr2-block), it seems we can rely on callees preserving the contents of the TPIDR2 block. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 42 ++++++------ .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 6 +- .../AArch64/sme-disable-gisel-fisel.ll | 23 ++----- .../CodeGen/AArch64/sme-lazy-save-call.ll | 57 ++++++---------- .../AArch64/sme-shared-za-interface.ll | 18 ++--- .../AArch64/sme-tpidr2-init-aarch64be.ll | 66 +++++++++++++++++++ .../AArch64/sme-za-lazy-save-buffer.ll | 30 ++++----- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 9 +-- llvm/test/CodeGen/AArch64/stack-hazard.ll | 27 +++----- .../CodeGen/AArch64/sve-stack-frame-layout.ll | 9 +-- 10 files changed, 149 insertions(+), 138 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d70a46b0e8939..463466fb27d17 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2985,20 +2985,24 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); if (TPIDR2.Uses > 0) { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - // Store the buffer pointer to the TPIDR2 stack object. - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + unsigned TPIDInitSaveSlicesReg = MI.getOperand(1).getReg(); + if (!Subtarget->isLittleEndian()) { + unsigned TmpReg = + MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + // For big-endian targets move "num_za_save_slices" to the top two bytes. + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::UBFMXri), TmpReg) + .addReg(TPIDInitSaveSlicesReg) + .addImm(16) + .addImm(15); + TPIDInitSaveSlicesReg = TmpReg; + } + // Store buffer pointer and num_za_save_slices. + // Bytes 10-15 are implicitly zeroed. 
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi)) .addReg(MI.getOperand(0).getReg()) + .addReg(TPIDInitSaveSlicesReg) .addFrameIndex(TPIDR2.FrameIndex) .addImm(0); - // Set the reserved bytes (10-15) to zero - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2.FrameIndex) - .addImm(5); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2.FrameIndex) - .addImm(3); } else MFI.RemoveStackObject(TPIDR2.FrameIndex); @@ -8313,9 +8317,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); MFI.CreateVariableSizedObject(Align(16), nullptr); } + SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); Chain = DAG.getNode( AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), - {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0), + /*Num save slices*/ NumZaSaveSlices}); } else if (Attrs.hasAgnosticZAInterface()) { // Call __arm_sme_state_size(). 
SDValue BufferSize = @@ -9165,19 +9172,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave(); bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { - const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - MachinePointerInfo MPI = - MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue TPIDR2ObjAddr = DAG.getFrameIndex( TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - SDValue NumZaSaveSlicesAddr = - DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, - DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, - MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 125225df15464..0d8cb3a76d0be 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -61,10 +61,10 @@ let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)), (AllocateZABuffer $size)>; -def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1, - [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>; +def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPMayStore]>; let usesCustomInserter = 1 in { - def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {} + def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer, 
GPR64:$save_slices), [(AArch64InitTPIDR2Obj GPR64:$buffer, GPR64:$save_slices)]>, Sched<[WriteI]> {} } // Nodes to allocate a save buffer for SME. diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index e1bfdddaba923..937dd417b9ec2 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -250,10 +250,7 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 ; CHECK-COMMON-NEXT: bl normal_callee @@ -292,12 +289,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -356,12 +350,9 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; 
CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index c57cb8e0873d0..64e4a2196ebfd 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -20,12 +20,9 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -74,21 +71,17 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_2_callees: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -98,8 +91,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -110,9 +102,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: test_lazy_save_2_callees: @@ -159,12 +150,9 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: 
stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -227,12 +215,9 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: mov x20, x0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sub x9, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur wzr, [x29, #-68] -; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: tbz w20, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index b4ff8d085ff40..c8915aac56084 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -15,12 +15,9 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -73,12 +70,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; 
CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll b/llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll new file mode 100644 index 0000000000000..78823e8b4da60 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefix=CHECK-BE + +declare void @private_za_callee() +declare float @llvm.cos.f32(float) + +; Test TPIDR2_EL0 is initialized correctly for AArch64 big-endian. +define void @test_tpidr2_init() nounwind "aarch64_inout_za" { +; CHECK-LABEL: test_tpidr2_init: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-BE-LABEL: test_tpidr2_init: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-BE-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-BE-NEXT: mov x29, sp +; CHECK-BE-NEXT: sub sp, sp, #16 +; CHECK-BE-NEXT: rdsvl x8, #1 +; CHECK-BE-NEXT: mov x9, sp +; CHECK-BE-NEXT: msub x9, x8, x8, x9 +; CHECK-BE-NEXT: mov sp, x9 +; CHECK-BE-NEXT: lsl x8, x8, #48 +; CHECK-BE-NEXT: sub x10, x29, #16 +; CHECK-BE-NEXT: stp x9, x8, [x29, #-16] +; CHECK-BE-NEXT: msr TPIDR2_EL0, x10 +; CHECK-BE-NEXT: bl private_za_callee +; CHECK-BE-NEXT: smstart za +; CHECK-BE-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-BE-NEXT: sub x0, x29, #16 +; CHECK-BE-NEXT: cbnz x8, .LBB0_2 +; CHECK-BE-NEXT: // %bb.1: +; CHECK-BE-NEXT: bl __arm_tpidr2_restore +; CHECK-BE-NEXT: .LBB0_2: +; CHECK-BE-NEXT: msr TPIDR2_EL0, xzr +; CHECK-BE-NEXT: mov sp, x29 +; CHECK-BE-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-BE-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-BE-NEXT: ret + call void @private_za_callee() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll 
b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index a9ad6f695cf8f..4ab553d79405d 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -22,11 +22,9 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: cbz w0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 @@ -34,10 +32,8 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_2: // %use_c ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -115,20 +111,18 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: cmp sp, x9 ; CHECK-NEXT: b.le .LBB2_3 ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 ; CHECK-NEXT: str xzr, [sp] ; CHECK-NEXT: b .LBB2_1 ; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] 
+; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: cbz w0, .LBB2_5 ; CHECK-NEXT: // %bb.4: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 @@ -136,10 +130,8 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: b .LBB2_8 ; CHECK-NEXT: .LBB2_5: // %use_c ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 57c1ced8ab125..49eb368662b5d 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -40,13 +40,10 @@ define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index c878d888b5f03..5f52280935c73 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -2855,12 +2855,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK0-NEXT: mov w20, w0 ; CHECK0-NEXT: msub x9, x8, x8, x9 ; CHECK0-NEXT: mov sp, x9 -; CHECK0-NEXT: stur x9, [x29, #-80] -; CHECK0-NEXT: sub x9, x29, #80 -; CHECK0-NEXT: sturh wzr, [x29, #-70] -; 
CHECK0-NEXT: stur wzr, [x29, #-68] -; CHECK0-NEXT: sturh w8, [x29, #-72] -; CHECK0-NEXT: msr TPIDR2_EL0, x9 +; CHECK0-NEXT: sub x10, x29, #80 +; CHECK0-NEXT: stp x9, x8, [x29, #-80] +; CHECK0-NEXT: msr TPIDR2_EL0, x10 ; CHECK0-NEXT: smstop sm ; CHECK0-NEXT: bl other ; CHECK0-NEXT: smstart sm @@ -2930,12 +2927,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: msub x9, x8, x8, x9 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov sp, x9 -; CHECK64-NEXT: str x9, [x19] -; CHECK64-NEXT: add x9, x19, #0 -; CHECK64-NEXT: strh wzr, [x19, #10] -; CHECK64-NEXT: str wzr, [x19, #12] -; CHECK64-NEXT: strh w8, [x19, #8] -; CHECK64-NEXT: msr TPIDR2_EL0, x9 +; CHECK64-NEXT: add x10, x19, #0 +; CHECK64-NEXT: stp x9, x8, [x19] +; CHECK64-NEXT: msr TPIDR2_EL0, x10 ; CHECK64-NEXT: smstop sm ; CHECK64-NEXT: bl other ; CHECK64-NEXT: smstart sm @@ -3011,12 +3005,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: msub x9, x8, x8, x9 ; CHECK1024-NEXT: mov x19, sp ; CHECK1024-NEXT: mov sp, x9 -; CHECK1024-NEXT: str x9, [x19] -; CHECK1024-NEXT: add x9, x19, #0 -; CHECK1024-NEXT: strh wzr, [x19, #10] -; CHECK1024-NEXT: str wzr, [x19, #12] -; CHECK1024-NEXT: strh w8, [x19, #8] -; CHECK1024-NEXT: msr TPIDR2_EL0, x9 +; CHECK1024-NEXT: add x10, x19, #0 +; CHECK1024-NEXT: stp x9, x8, [x19] +; CHECK1024-NEXT: msr TPIDR2_EL0, x10 ; CHECK1024-NEXT: smstop sm ; CHECK1024-NEXT: bl other ; CHECK1024-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 1f1ca7e3b9ee9..2cbb29ebe1a1f 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -556,12 +556,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: mov w20, w0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, 
#-80] -; CHECK-NEXT: sub x9, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur wzr, [x29, #-68] -; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl other ; CHECK-NEXT: smstart sm From 913093ffc250573f4159f77a67a1fc843aa1bf7b Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 12 Aug 2025 14:29:52 +0000 Subject: [PATCH 2/4] Remove big-endian support --- .../Target/AArch64/AArch64ISelLowering.cpp | 19 ++---- .../AArch64/sme-tpidr2-init-aarch64be.ll | 66 ------------------- 2 files changed, 7 insertions(+), 78 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 463466fb27d17..35a3b6609454a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2984,23 +2984,18 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo(); TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); if (TPIDR2.Uses > 0) { + // Note: This case just needs to do `SVL << 48`. It is not implemented as we + // generally don't support big-endian SVE/SME. + assert( + Subtarget->isLittleEndian() && + "TPIDR2 block initialization is not supported on big-endian targets"); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - unsigned TPIDInitSaveSlicesReg = MI.getOperand(1).getReg(); - if (!Subtarget->isLittleEndian()) { - unsigned TmpReg = - MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - // For big-endian targets move "num_za_save_slices" to the top two bytes. 
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::UBFMXri), TmpReg) - .addReg(TPIDInitSaveSlicesReg) - .addImm(16) - .addImm(15); - TPIDInitSaveSlicesReg = TmpReg; - } // Store buffer pointer and num_za_save_slices. // Bytes 10-15 are implicitly zeroed. BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi)) .addReg(MI.getOperand(0).getReg()) - .addReg(TPIDInitSaveSlicesReg) + .addReg(MI.getOperand(1).getReg()) .addFrameIndex(TPIDR2.FrameIndex) .addImm(0); } else diff --git a/llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll b/llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll deleted file mode 100644 index 78823e8b4da60..0000000000000 --- a/llvm/test/CodeGen/AArch64/sme-tpidr2-init-aarch64be.ll +++ /dev/null @@ -1,66 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s -; RUN: llc -mtriple=aarch64_be -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefix=CHECK-BE - -declare void @private_za_callee() -declare float @llvm.cos.f32(float) - -; Test TPIDR2_EL0 is initialized correctly for AArch64 big-endian. -define void @test_tpidr2_init() nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_tpidr2_init: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-BE-LABEL: test_tpidr2_init: -; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-BE-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-BE-NEXT: mov x29, sp -; CHECK-BE-NEXT: sub sp, sp, #16 -; CHECK-BE-NEXT: rdsvl x8, #1 -; CHECK-BE-NEXT: mov x9, sp -; CHECK-BE-NEXT: msub x9, x8, x8, x9 -; CHECK-BE-NEXT: mov sp, x9 -; CHECK-BE-NEXT: lsl x8, x8, #48 -; CHECK-BE-NEXT: sub x10, x29, #16 -; CHECK-BE-NEXT: stp x9, x8, [x29, #-16] -; CHECK-BE-NEXT: msr TPIDR2_EL0, x10 -; CHECK-BE-NEXT: bl private_za_callee -; CHECK-BE-NEXT: smstart za -; CHECK-BE-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-BE-NEXT: sub x0, x29, #16 -; CHECK-BE-NEXT: cbnz x8, .LBB0_2 -; CHECK-BE-NEXT: // %bb.1: -; CHECK-BE-NEXT: bl __arm_tpidr2_restore -; CHECK-BE-NEXT: .LBB0_2: -; CHECK-BE-NEXT: msr TPIDR2_EL0, xzr -; CHECK-BE-NEXT: mov sp, x29 -; CHECK-BE-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-BE-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-BE-NEXT: ret - call void @private_za_callee() - ret void -} From b06492762ecfa624a7af288ec9323146b9153564 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 12 Aug 2025 
14:53:04 +0000 Subject: [PATCH 3/4] Use fatal error --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 35a3b6609454a..d0535b307ae42 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2986,9 +2986,9 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, if (TPIDR2.Uses > 0) { // Note: This case just needs to do `SVL << 48`. It is not implemented as we // generally don't support big-endian SVE/SME. - assert( - Subtarget->isLittleEndian() && - "TPIDR2 block initialization is not supported on big-endian targets"); + if (!Subtarget->isLittleEndian()) + reportFatalInternalError( + "TPIDR2 block initialization is not supported on big-endian targets"); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); // Store buffer pointer and num_za_save_slices. From fd34a824de81bedf178cc6a9f432977480eeae91 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 26 Aug 2025 10:12:51 +0000 Subject: [PATCH 4/4] Update tests --- .../CodeGen/AArch64/sme-lazy-save-call.ll | 432 +++++---------- .../CodeGen/AArch64/sme-za-control-flow.ll | 523 ++++++------------ 2 files changed, 320 insertions(+), 635 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 64e4a2196ebfd..67199d9c0970c 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -10,59 +10,32 @@ declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_lazy_save_1_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_1_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB0_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte 
Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_lazy_save_1_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: bl private_za_callee +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB0_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB0_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void @private_za_callee() ret void } @@ -140,59 +113,32 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; Test a call of an intrinsic that gets expanded to a library call. define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_lazy_save_expanded_intrinsic: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEXT: bl cosf -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB2_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_expanded_intrinsic: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: bl cosf -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB2_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; 
CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_lazy_save_expanded_intrinsic: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: bl cosf +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB2_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB2_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = call float @llvm.cos.f32(float %a) ret float %res } @@ -298,24 +244,20 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-LABEL: test_lazy_save_mixed_shared_and_private_callees: ; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -32 -; CHECK-NEXT: .cfi_offset w30, -40 -; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB4_2 ; CHECK-NEXT: // %bb.1: // %save.za @@ -323,11 +265,9 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: zero {za} -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -339,8 +279,7 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: bl preserves_za_callee -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -352,9 +291,8 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; 
CHECK-NEXT: smstop za ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: test_lazy_save_mixed_shared_and_private_callees: @@ -413,28 +351,23 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-LABEL: test_many_back2back_private_za_calls: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -32 -; CHECK-NEXT: .cfi_offset w30, -40 -; CHECK-NEXT: .cfi_offset w29, -48 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl 
private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -444,8 +377,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -455,8 +387,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -466,8 +397,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_6: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -477,8 +407,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_8: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -488,8 +417,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_10: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: 
mrs x8, TPIDR2_EL0 @@ -501,9 +429,8 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: test_many_back2back_private_za_calls: @@ -555,66 +482,34 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { } define void @test_shared_private_shared() nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_shared_private_shared: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB6_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_shared_private_shared: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: 
stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB6_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB6_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_shared_private_shared: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: bl shared_za_callee +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_callee +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB6_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB6_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: bl shared_za_callee +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void @shared_za_callee() call void @private_za_callee() call void @shared_za_callee() @@ -636,70 +531,36 @@ declare i64 @shared_za_callee_i64(i64) "aarch64_inout_za" declare i64 @private_za_callee_i64(i64) define i64 @test_shared_private_shared_i64(i64 %x) nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_shared_private_shared_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: bl shared_za_callee_i64 -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_callee_i64 -; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB8_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB8_2: -; CHECK-NEXT: mov x0, x1 -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: bl shared_za_callee_i64 -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_shared_private_shared_i64: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee_i64 -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee_i64 -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB8_2: -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee_i64 -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_shared_private_shared_i64: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: bl shared_za_callee_i64 +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_callee_i64 +; CHECK-COMMON-NEXT: mov x1, x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB8_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB8_2: +; CHECK-COMMON-NEXT: mov x0, x1 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: bl shared_za_callee_i64 +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %a = call i64 @shared_za_callee_i64(i64 %x) %b = call i64 @private_za_callee_i64(i64 %a) %c = call i64 @shared_za_callee_i64(i64 %b) @@ -724,12 +585,9 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: msub x8, x9, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: ldp x10, x11, [x29, #32] -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w9, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: sub x12, x29, #16 +; CHECK-NEXT: stp x8, x9, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x12 ; CHECK-NEXT: stp x10, x11, [sp, #-16]! 
; CHECK-NEXT: bl many_args_private_za_callee ; CHECK-NEXT: add sp, sp, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll index d3d7e953bedfa..18ea07e38fe89 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -8,24 +8,20 @@ declare void @shared_za_call() "aarch64_inout_za" define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-LABEL: private_za_loop: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: cmp w0, #1 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: b.lt .LBB0_5 ; CHECK-NEXT: // %bb.1: // %loop.preheader ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %loop ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 @@ -34,8 +30,7 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-NEXT: b.eq .LBB0_5 ; CHECK-NEXT: .LBB0_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -47,9 +42,8 @@ define void @private_za_loop(i32 %n) 
"aarch64_inout_za" nounwind { ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_5: // %exit ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: private_za_loop: @@ -106,25 +100,21 @@ exit: define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-LABEL: private_za_loop_active_entry_and_exit: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: cmp w19, #1 ; CHECK-NEXT: b.lt .LBB1_5 ; CHECK-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_2: // %loop ; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1 @@ -133,8 +123,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: b.eq .LBB1_5 ; CHECK-NEXT: .LBB1_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr 
TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -146,9 +135,8 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: b .LBB1_2 ; CHECK-NEXT: .LBB1_5: // %exit ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call ; ; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit: @@ -251,17 +239,13 @@ define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB3_4 ; CHECK-NEXT: // %bb.1: // %private_za_call -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -317,20 +301,17 @@ exit: define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind { ; CHECK-LABEL: mixed_shared_private_za_loop: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: b .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %loop ; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 @@ -340,8 +321,7 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind ; CHECK-NEXT: .LBB4_2: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -354,9 +334,8 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind ; CHECK-NEXT: .LBB4_4: // %exit ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: mixed_shared_private_za_loop: @@ -425,18 +404,14 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: msub x8, x8, x8, x9 -; 
CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: tbz w19, #0, .LBB5_4 ; CHECK-NEXT: // %bb.1: // %cond_clobber -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -447,10 +422,8 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw ; CHECK-NEXT: .LBB5_3: // %cond_clobber ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB5_4: // %exit -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -511,67 +484,34 @@ exit: } define void @conditionally_use_za(i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: conditionally_use_za: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: tbz w0, #0, .LBB6_4 -; CHECK-NEXT: // %bb.1: // %use_za -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB6_3 -; CHECK-NEXT: // %bb.2: // %use_za -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB6_3: // %use_za -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB6_4: // %exit -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: conditionally_use_za: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB6_4 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_za -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB6_3 -; CHECK-NEWLOWERING-NEXT: // %bb.2: // %use_za -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB6_3: // %use_za -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: .LBB6_4: // %exit -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: conditionally_use_za: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: tbz w0, #0, .LBB6_4 +; CHECK-COMMON-NEXT: // %bb.1: // %use_za +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB6_3 +; CHECK-COMMON-NEXT: // %bb.2: // %use_za +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB6_3: // %use_za +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB6_4: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret br i1 %cond, label %use_za, label %exit use_za: @@ -585,73 +525,37 @@ exit: define void @diamond_mixed_za_merge_shared(i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: diamond_mixed_za_merge_shared: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: tbz w0, #0, .LBB7_2 -; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: b .LBB7_5 -; CHECK-NEXT: .LBB7_2: // %else -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB7_4 -; CHECK-NEXT: // %bb.3: // %else -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB7_4: // %else -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB7_5: // %merge_shared -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: diamond_mixed_za_merge_shared: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB7_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %then -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: b .LBB7_5 -; CHECK-NEWLOWERING-NEXT: .LBB7_2: // %else -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB7_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: // %else -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB7_4: // %else -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: .LBB7_5: // %merge_shared -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: diamond_mixed_za_merge_shared: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: tbz w0, #0, .LBB7_2 +; CHECK-COMMON-NEXT: // %bb.1: // %then +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: b .LBB7_5 +; CHECK-COMMON-NEXT: .LBB7_2: // %else +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB7_4 +; CHECK-COMMON-NEXT: // %bb.3: // %else +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB7_4: // %else +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB7_5: // %merge_shared +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: br i1 %cond, label %then, label %else @@ -677,20 +581,16 @@ define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwin ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB8_2 ; CHECK-NEXT: // %bb.1: // %then ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: b .LBB8_5 ; CHECK-NEXT: .LBB8_2: // %else -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; 
CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -701,10 +601,8 @@ define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwin ; CHECK-NEXT: .LBB8_4: // %else ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB8_5: // %merge_private_za -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -777,20 +675,16 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w1 -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB9_5 ; CHECK-NEXT: // %bb.1: // %shared_path ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: tbz w19, #0, .LBB9_8 ; CHECK-NEXT: .LBB9_2: // %exit_private -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -802,10 +696,8 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: b .LBB9_9 ; CHECK-NEXT: .LBB9_5: // %private_path -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ 
-897,117 +789,58 @@ exit_shared: } define void @nested_cond_in_loop(i32 %n, i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: nested_cond_in_loop: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill -; CHECK-NEXT: str x23, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: cmp w0, #1 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: b.lt .LBB10_8 -; CHECK-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEXT: mov w19, w1 -; CHECK-NEXT: mov w20, w0 -; CHECK-NEXT: mov w21, wzr -; CHECK-NEXT: rdsvl x22, #1 -; CHECK-NEXT: sub x23, x29, #16 -; CHECK-NEXT: b .LBB10_4 -; CHECK-NEXT: .LBB10_2: // %use_shared -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: .LBB10_3: // %latch -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: add w21, w21, #1 -; CHECK-NEXT: cmp w21, w20 -; CHECK-NEXT: b.ge .LBB10_8 -; CHECK-NEXT: .LBB10_4: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: tbnz w19, #0, .LBB10_2 -; CHECK-NEXT: // %bb.5: // %use_private -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: sturh w22, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x23 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB10_7 -; CHECK-NEXT: // %bb.6: // %use_private -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB10_7: // %use_private -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: b 
.LBB10_3 -; CHECK-NEXT: .LBB10_8: // %exit -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x23, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: nested_cond_in_loop: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: cmp w0, #1 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: b.lt .LBB10_8 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEWLOWERING-NEXT: mov w19, w1 -; CHECK-NEWLOWERING-NEXT: mov w20, w0 -; CHECK-NEWLOWERING-NEXT: mov w21, wzr -; CHECK-NEWLOWERING-NEXT: sub x22, x29, #16 -; CHECK-NEWLOWERING-NEXT: b .LBB10_4 -; CHECK-NEWLOWERING-NEXT: .LBB10_2: // %use_shared -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: .LBB10_3: // %latch -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: add w21, w21, #1 -; CHECK-NEWLOWERING-NEXT: cmp w21, w20 -; CHECK-NEWLOWERING-NEXT: b.ge .LBB10_8 -; CHECK-NEWLOWERING-NEXT: .LBB10_4: // %loop -; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB10_2 -; CHECK-NEWLOWERING-NEXT: // %bb.5: // %use_private -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, 
x22 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB10_7 -; CHECK-NEWLOWERING-NEXT: // %bb.6: // %use_private -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB10_7: // %use_private -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: b .LBB10_3 -; CHECK-NEWLOWERING-NEXT: .LBB10_8: // %exit -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: nested_cond_in_loop: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: cmp w0, #1 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: b.lt .LBB10_8 +; CHECK-COMMON-NEXT: // %bb.1: // %loop.preheader +; CHECK-COMMON-NEXT: mov w19, w1 +; CHECK-COMMON-NEXT: mov w20, w0 +; CHECK-COMMON-NEXT: mov w21, wzr +; CHECK-COMMON-NEXT: sub x22, x29, #16 +; CHECK-COMMON-NEXT: b .LBB10_4 +; CHECK-COMMON-NEXT: .LBB10_2: // %use_shared +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: .LBB10_3: // %latch +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: add w21, w21, #1 +; CHECK-COMMON-NEXT: cmp w21, w20 +; CHECK-COMMON-NEXT: b.ge .LBB10_8 +; CHECK-COMMON-NEXT: .LBB10_4: // %loop +; CHECK-COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-COMMON-NEXT: tbnz w19, #0, .LBB10_2 +; CHECK-COMMON-NEXT: // %bb.5: // %use_private +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x22 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB10_7 +; CHECK-COMMON-NEXT: // %bb.6: // %use_private +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB10_7: // %use_private +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: b .LBB10_3 +; CHECK-COMMON-NEXT: .LBB10_8: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x20, x19, 
[sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %cmp = icmp sgt i32 %n, 0 br i1 %cmp, label %loop, label %exit @@ -1036,25 +869,21 @@ exit: define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind { ; CHECK-LABEL: loop_with_external_entry: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w1 -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB11_2 ; CHECK-NEXT: // %bb.1: // %init ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: .LBB11_2: // %loop.preheader -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: b .LBB11_4 ; CHECK-NEXT: .LBB11_3: // %loop ; CHECK-NEXT: // in Loop: Header=BB11_4 Depth=1 @@ -1062,8 +891,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEXT: tbz w19, #0, .LBB11_6 ; CHECK-NEXT: .LBB11_4: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -1075,9 +903,8 @@ define void @loop_with_external_entry(i1 
%c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEXT: b .LBB11_3 ; CHECK-NEXT: .LBB11_6: // %exit ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: loop_with_external_entry: