diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 109988246d0ab..39f13f5907190 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2529,8 +2529,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } // Check that an instruction has register operands only as expected. - if (MCOI.OperandType == MCOI::OPERAND_REGISTER && - !MO->isReg() && !MO->isFI()) + if (MCOI.OperandType == MCOI::OPERAND_REGISTER && !MO->isReg() && + !MO->isFI() && !MO->isTargetIndex()) report("Expected a register operand.", MO, MONum); if (MO->isReg()) { if (MCOI.OperandType == MCOI::OPERAND_IMMEDIATE || diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 5496ebd495a55..558cec0e373da 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -60,6 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); FunctionPass *createSMEABIPass(); FunctionPass *createSMEPeepholeOptPass(); +FunctionPass *createMachineSMEABIPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, @@ -111,8 +112,14 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); void initializeSMEABIPass(PassRegistry &); void initializeSMEPeepholeOptPass(PassRegistry &); +void initializeMachineSMEABIPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); + +namespace AArch64 { +enum TargetIndex { SAVED_VG_SLOT, SAVED_STREAMING_VG_SLOT }; +} + } // end namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a71668e71c235..ddcdb59cc8ec5 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp 
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -333,7 +333,10 @@ static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, bool HasCall = false); -static bool requiresSaveVG(const MachineFunction &MF); +static bool requiresSaveVG(const MachineFunction &MF) { + return MF.getSubtarget().getRegisterInfo()->requiresSaveVG( + MF); +} /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If possible, a frame helper call is injected. @@ -1105,8 +1108,7 @@ bool AArch64FrameLowering::canUseAsPrologue( // May need a scratch register (for return value) if require making a special // call - if (requiresSaveVG(*MF) || - windowsRequiresStackProbe(*MF, std::numeric_limits::max())) + if (windowsRequiresStackProbe(*MF, std::numeric_limits::max())) if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister) return false; @@ -1391,38 +1393,6 @@ bool requiresGetVGCall(MachineFunction &MF) { !MF.getSubtarget().hasSVE(); } -static bool requiresSaveVG(const MachineFunction &MF) { - const AArch64FunctionInfo *AFI = MF.getInfo(); - // For Darwin platforms we don't save VG for non-SVE functions, even if SME - // is enabled with streaming mode changes. 
- if (!AFI->hasStreamingModeChanges()) - return false; - auto &ST = MF.getSubtarget(); - if (ST.isTargetDarwin()) - return ST.hasSVE(); - return true; -} - -bool isVGInstruction(MachineBasicBlock::iterator MBBI) { - unsigned Opc = MBBI->getOpcode(); - if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI || - Opc == AArch64::UBFMXri) - return true; - - if (requiresGetVGCall(*MBBI->getMF())) { - if (Opc == AArch64::ORRXrr) - return true; - - if (Opc == AArch64::BL) { - auto Op1 = MBBI->getOperand(0); - return Op1.isSymbol() && - (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); - } - } - - return false; -} - // Convert callee-save register save/restore instruction to do stack pointer // decrement/increment to allocate/deallocate the callee-save stack area by // converting store/load to use pre/post increment version. @@ -1434,15 +1404,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( int CFAOffset = 0) { unsigned NewOpc; - // If the function contains streaming mode changes, we expect instructions - // to calculate the value of VG before spilling. For locally-streaming - // functions, we need to do this for both the streaming and non-streaming - // vector length. Move past these instructions if necessary. - MachineFunction &MF = *MBB.getParent(); - if (requiresSaveVG(MF)) - while (isVGInstruction(MBBI)) - ++MBBI; - switch (MBBI->getOpcode()) { default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); @@ -1979,9 +1940,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // pointer bump above. while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && !IsSVECalleeSave(MBBI)) { - if (CombineSPBump && - // Only fix-up frame-setup load/store instructions. 
- (!requiresSaveVG(MF) || !isVGInstruction(MBBI))) + if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); ++MBBI; @@ -3403,66 +3362,19 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( StrOpc = Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI; break; - case RegPairInfo::VG: - StrOpc = AArch64::STRXui; - break; - } - - unsigned X0Scratch = AArch64::NoRegister; - if (Reg1 == AArch64::VG) { - // Find an available register to store value of VG to. - Reg1 = findScratchNonCalleeSaveRegister(&MBB, true); - assert(Reg1 != AArch64::NoRegister); + case RegPairInfo::VG: { SMEAttrs Attrs = AFI->getSMEFnAttrs(); - if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() && AFI->getStreamingVGIdx() == std::numeric_limits::max()) { // For locally-streaming functions, we need to store both the streaming - // & non-streaming VG. Spill the streaming value first. - BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1) - .addImm(1) - .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1) - .addReg(Reg1) - .addImm(3) - .addImm(63) - .setMIFlag(MachineInstr::FrameSetup); - + // & non-streaming VG. 
AFI->setStreamingVGIdx(RPI.FrameIdx); - } else if (MF.getSubtarget().hasSVE()) { - BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) - .addImm(31) - .addImm(1) - .setMIFlag(MachineInstr::FrameSetup); - AFI->setVGIdx(RPI.FrameIdx); } else { - const AArch64Subtarget &STI = MF.getSubtarget(); - if (llvm::any_of( - MBB.liveins(), - [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { - return STI.getRegisterInfo()->isSuperOrSubRegisterEq( - AArch64::X0, LiveIn.PhysReg); - })) - X0Scratch = Reg1; - - if (X0Scratch != AArch64::NoRegister) - BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1) - .addReg(AArch64::XZR) - .addReg(AArch64::X0, RegState::Undef) - .addReg(AArch64::X0, RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); - - const uint32_t *RegMask = TRI->getCallPreservedMask( - MF, - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); - BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) - .addExternalSymbol("__arm_get_current_vg") - .addRegMask(RegMask) - .addReg(AArch64::X0, RegState::ImplicitDefine) - .setMIFlag(MachineInstr::FrameSetup); - Reg1 = AArch64::X0; AFI->setVGIdx(RPI.FrameIdx); } + // VG will be written to the frame indices immediately after the prologue. + continue; + } } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); @@ -3556,13 +3468,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( if (RPI.isPaired()) MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); } - - if (X0Scratch != AArch64::NoRegister) - BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0) - .addReg(AArch64::XZR) - .addReg(X0Scratch, RegState::Undef) - .addReg(X0Scratch, RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); } return true; } @@ -4092,31 +3997,19 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( MaxCSFrameIndex = FrameIdx; } - // Insert VG into the list of CSRs, immediately before LR if saved. 
if (requiresSaveVG(MF)) { - std::vector VGSaves; - SMEAttrs Attrs = AFI->getSMEFnAttrs(); - - auto VGInfo = CalleeSavedInfo(AArch64::VG); + CalleeSavedInfo VGInfo(AArch64::VG); VGInfo.setRestored(false); - VGSaves.push_back(VGInfo); + SmallVector VGSaves{VGInfo}; // Add VG again if the function is locally-streaming, as we will spill two // values. + SMEAttrs Attrs = AFI->getSMEFnAttrs(); if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) VGSaves.push_back(VGInfo); - bool InsertBeforeLR = false; - - for (unsigned I = 0; I < CSI.size(); I++) - if (CSI[I].getReg() == AArch64::LR) { - InsertBeforeLR = true; - CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end()); - break; - } - - if (!InsertBeforeLR) - llvm::append_range(CSI, VGSaves); + // Insert the VG saves at the start of the CSI (alongside GPRs). + CSI.insert(CSI.begin(), VGSaves.begin(), VGSaves.end()); } Register LastReg = 0; @@ -5135,13 +5028,40 @@ static void emitVGSaveRestore(MachineBasicBlock::iterator II, MI.eraseFromParent(); } +/// Replaces the VG save-slot target-index placeholders among the explicit +/// operands of \p II with the frame indices recorded in \p AFI +/// (getStreamingVGIdx() / getVGIdx()). +static void replaceVGTargetIndices(MachineBasicBlock::iterator II, + AArch64FunctionInfo *AFI) { + for (auto &MO : II->explicit_operands()) { + if (MO.isTargetIndex()) { + if (MO.getIndex() == AArch64::SAVED_STREAMING_VG_SLOT) + MO.ChangeToFrameIndex(AFI->getStreamingVGIdx()); + // Must be `else if`: the operand above is now a frame index, and a + // streaming-VG frame index of 0 would compare equal to SAVED_VG_SLOT + // and be rewritten a second time. + else if (MO.getIndex() == AArch64::SAVED_VG_SLOT) + MO.ChangeToFrameIndex(AFI->getVGIdx()); + } + } +} + +/// For functions that save VG, resolves the VG slot placeholders and runs +/// emitVGSaveRestore on each instruction; otherwise runs tryMergeAdjacentSTG +/// when StackTaggingMergeSetTag is set. void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( MachineFunction &MF, RegScavenger *RS = nullptr) const { + bool VGSaved = requiresSaveVG(MF); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); for (auto &BB : MF) for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) { - if (requiresSaveVG(MF)) + if (VGSaved) { + replaceVGTargetIndices(II, AFI); emitVGSaveRestore(II++, this); - else if (StackTaggingMergeSetTag) + } else if (StackTaggingMergeSetTag) II = tryMergeAdjacentSTG(II, this, RS); + else + // Keep advancing when neither transformation applies. + ++II; } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 951cb93ea8f8c..ec42718954ba6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3688,6 +3688,14 @@ static unsigned offsetExtendOpcode(unsigned Opcode) { } } +ArrayRef> +AArch64InstrInfo::getSerializableTargetIndices() const { + static constexpr std::pair TargetIndices[] = { + {AArch64::SAVED_VG_SLOT, "saved-vg-slot"}, + {AArch64::SAVED_STREAMING_VG_SLOT, "saved-streaming-vg-slot"}}; + return TargetIndices; +} + MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4b..7d12271424664 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -299,6 +299,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { MachineInstr *emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override; + ArrayRef> + getSerializableTargetIndices() const override; + bool getMemOperandsWithOffsetWidth( const MachineInstr &MI, SmallVectorImpl &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index fb472ddc719fc..9563b1eeacc1d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -673,6 +673,18 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } +bool AArch64RegisterInfo::requiresSaveVG(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo(); + // For Darwin platforms we don't save VG for non-SVE functions, even if SME + // is enabled with streaming mode changes. 
+ if (!AFI->hasStreamingModeChanges()) + return false; + auto &ST = MF.getSubtarget(); + if (ST.isTargetDarwin()) + return ST.hasSVE(); + return true; +} + bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const { CallingConv::ID CC = MF.getFunction().getCallingConv(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index cc94be611a2ea..29fa6df4c7875 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -127,6 +127,8 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { bool hasBasePointer(const MachineFunction &MF) const; unsigned getBaseRegister() const; + bool requiresSaveVG(const MachineFunction &MF) const; + bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 281a1457fe57e..4fc1f850e4e09 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -803,6 +803,9 @@ bool AArch64PassConfig::addILPOpts() { } void AArch64PassConfig::addPreRegAlloc() { + // Insert VG saves for the unwinder. + addPass(createMachineSMEABIPass()); + // Change dead register definitions to refer to the zero register. 
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableDeadRegisterElimination) diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 66136a464f05d..909f638d03beb 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -87,6 +87,7 @@ add_llvm_target(AArch64CodeGen AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp SMEABIPass.cpp + MachineSMEABIPass.cpp SMEPeepholeOpt.cpp SVEIntrinsicOpts.cpp AArch64SIMDInstrOpt.cpp diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp new file mode 100644 index 0000000000000..bbdf6cab5222e --- /dev/null +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -0,0 +1,140 @@ +//===- MachineSMEABIPass.cpp - MIR SME ABI lowerings ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass implements aspects of the SME ABI not known to be required till +// this stage in lowering. Currently this is just: +// * Saving VG for the unwinder (which depends on SMEPeepholeOpt running first).
+//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-machine-sme-abi" + +namespace { + +struct MachineSMEABI : public MachineFunctionPass { + inline static char ID = 0; + + MachineSMEABI() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "Machine SME ABI pass"; } + + bool insertVGSaveForUnwinder(MachineFunction &MF) const; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +/// Inserts, at the start of the entry block, the instructions that compute VG +/// and store it to the SAVED_VG_SLOT target index (plus the streaming VG to +/// SAVED_STREAMING_VG_SLOT for locally-streaming functions). Returns true if +/// any instruction was inserted. +bool MachineSMEABI::insertVGSaveForUnwinder(MachineFunction &MF) const { + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (!TRI.requiresSaveVG(MF)) + return false; + + SMEAttrs Attrs = AFI->getSMEFnAttrs(); + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) { + // For locally-streaming functions, we need to store both the streaming + // & non-streaming VG. Spill the streaming value first.
+ Register RDSVLReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register StreamingVGReg = + MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::RDSVLI_XI), RDSVLReg).addImm(1); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::UBFMXri), StreamingVGReg) + .addReg(RDSVLReg) + .addImm(3) + .addImm(63); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) + .addReg(StreamingVGReg) + .addTargetIndex(AArch64::SAVED_STREAMING_VG_SLOT) + .addImm(0); + } + + Register VGReg; + if (Subtarget.hasSVE()) { + VGReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::CNTD_XPiI), VGReg) + .addImm(31) + .addImm(1); + } else { + // __arm_get_current_vg returns VG in X0, which the PreserveMost_From_X1 + // mask does not preserve. X0 may still carry an incoming argument at the + // start of the entry block, so keep a copy of it across the call. + const bool X0IsLiveIn = llvm::any_of( + MBB.liveins(), + [&TRI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { + return TRI.isSuperOrSubRegisterEq(AArch64::X0, LiveIn.PhysReg); + }); + Register SavedX0; + if (X0IsLiveIn) { + SavedX0 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), SavedX0) + .addReg(AArch64::X0); + } + const uint32_t *RegMask = TRI.getCallPreservedMask( + MF, CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::BL)) + .addExternalSymbol("__arm_get_current_vg") + .addRegMask(RegMask) + .addReg(AArch64::X0, RegState::ImplicitDefine); + VGReg = AArch64::X0; + if (X0IsLiveIn) { + // Move the result out of X0 before putting the argument back. + VGReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), VGReg) + .addReg(AArch64::X0); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), AArch64::X0) + .addReg(SavedX0); + } + } + + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) + .addReg(VGReg) + .addTargetIndex(AArch64::SAVED_VG_SLOT) + .addImm(0); + return true; +} + +INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", + "Machine SME ABI pass", false, false) + +bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { + assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); + return insertVGSaveForUnwinder(MF); +} + +FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); } diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index abc67eec32391..f226a0b73d7a4 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -54,6 +54,7 @@ ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation +; CHECK-NEXT: Machine SME ABI pass ; CHECK-NEXT: Eliminate PHI 
nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index e1481667a4ab7..961ff923491b8 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -161,6 +161,7 @@ ; CHECK-NEXT: Peephole Optimizations ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: AArch64 MI Peephole Optimization pass +; CHECK-NEXT: Machine SME ABI pass ; CHECK-NEXT: AArch64 Dead register definitions ; CHECK-NEXT: Detect Dead Lanes ; CHECK-NEXT: Init Undef Pass diff --git a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll index 94fe06733347a..2466a0b3bf162 100644 --- a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll +++ b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll @@ -7,11 +7,12 @@ define void @streaming_mode_change1() #0 { ; CHECK-LABEL: streaming_mode_change1: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -21,10 +22,9 @@ define void @streaming_mode_change1() #0 { ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret -; + ; OUTLINER-LABEL: streaming_mode_change1: ; OUTLINER-NOT: OUTLINED_FUNCTION_ -; call void @callee(); ret void; } @@ -33,11 +33,12 @@ define void @streaming_mode_change2() #0 { ; CHECK-LABEL: streaming_mode_change2: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -47,10 +48,9 @@ define void @streaming_mode_change2() #0 { ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret -; + ; OUTLINER-LABEL: streaming_mode_change2: ; OUTLINER-NOT: OUTLINED_FUNCTION_ -; call void @callee(); ret void; } @@ -59,11 +59,12 @@ define void @streaming_mode_change3() #0 { ; CHECK-LABEL: streaming_mode_change3: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -73,10 +74,9 @@ define void @streaming_mode_change3() #0 { ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret -; + ; OUTLINER-LABEL: streaming_mode_change3: ; OUTLINER-NOT: OUTLINED_FUNCTION_ -; call void @callee(); ret void; } diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 1f68815411097..b75accee2899f 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -89,16 +89,14 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_get_current_vg -; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: mov x0, x9 ; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg ; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: str x0, [x29, #32] ; CHECK-NEXT: bl __arm_sme_state_size ; CHECK-NEXT: sub sp, sp, x0 ; CHECK-NEXT: mov x20, sp @@ -122,7 +120,7 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: sub sp, x29, #64 -; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -140,16 +138,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_get_current_vg -; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: mov x0, x9 ; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg ; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: str x0, [x29, #32] ; CHECK-NEXT: bl __arm_sme_state_size ; CHECK-NEXT: sub sp, sp, x0 ; CHECK-NEXT: mov x19, sp @@ -189,7 +185,7 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: sub sp, x29, #64 -; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll index 8e3866fcec89a..60be329ab8567 100644 --- a/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll +++ b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll @@ -11,28 +11,33 @@ define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind ; CHECK-COALESCER-BARRIER-NEXT: {{ $}} ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]] - ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, 
csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF ; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]] ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR ; ; CHECK-REGALLOC-LABEL: name: dont_coalesce_args ; CHECK-REGALLOC: bb.0 (%ir-block.0): ; CHECK-REGALLOC-NEXT: liveins: $q0 ; CHECK-REGALLOC-NEXT: {{ $}} + ; CHECK-REGALLOC-NEXT: renamable $x8 = RDSVLI_XI 1, implicit $vg + ; CHECK-REGALLOC-NEXT: renamable $x8 = UBFMXri killed renamable $x8, 3, 63 + ; CHECK-REGALLOC-NEXT: STRXui killed renamable $x8, target-index(saved-streaming-vg-slot), 0 + ; CHECK-REGALLOC-NEXT: BL &__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def dead $lr, implicit $sp, implicit-def $x0 ; CHECK-REGALLOC-NEXT: STRQui $q0, %stack.0, 0 :: (store (s128) into %stack.0) - ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: STRXui killed $x0, 
target-index(saved-vg-slot), 0 + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0 ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-REGALLOC-NEXT: RET_ReallyLR %sa = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %a, i64 0) call void @scalable_args( %sa) @@ -42,26 +47,31 @@ define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind define <2 x i64> @dont_coalesce_res() "aarch64_pstate_sm_body" nounwind { ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_res ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0): - ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; 
CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0 ; CHECK-COALESCER-BARRIER-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY [[COPY]].zsub ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY1]] - ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_]] ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0 ; ; CHECK-REGALLOC-LABEL: name: dont_coalesce_res ; CHECK-REGALLOC: bb.0 (%ir-block.0): - ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: renamable $x8 = RDSVLI_XI 1, implicit $vg + ; CHECK-REGALLOC-NEXT: renamable $x8 = UBFMXri killed renamable $x8, 3, 63 + ; CHECK-REGALLOC-NEXT: STRXui killed renamable $x8, target-index(saved-streaming-vg-slot), 0 + ; CHECK-REGALLOC-NEXT: BL &__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def dead $lr, implicit $sp, implicit-def $x0 + ; CHECK-REGALLOC-NEXT: STRXui killed $x0, target-index(saved-vg-slot), 0 + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-REGALLOC-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL renamable $q0, implicit killed $z0 ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, 
%stack.0, 0 :: (store (s128) into %stack.0) - ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) ; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0 %sa = call @scalable_res() @@ -76,7 +86,7 @@ define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_psta ; CHECK-COALESCER-BARRIER-NEXT: {{ $}} ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]] - ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF ; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp @@ -84,7 +94,7 @@ define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_psta ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_1:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COALESCER_BARRIER_FPR128_]] - ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead 
$nzcv, implicit-def $q0, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_1]] ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0 ; @@ -92,14 +102,19 @@ define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_psta ; CHECK-REGALLOC: bb.0 (%ir-block.0): ; CHECK-REGALLOC-NEXT: liveins: $q0 ; CHECK-REGALLOC-NEXT: {{ $}} + ; CHECK-REGALLOC-NEXT: renamable $x8 = RDSVLI_XI 1, implicit $vg + ; CHECK-REGALLOC-NEXT: renamable $x8 = UBFMXri killed renamable $x8, 3, 63 + ; CHECK-REGALLOC-NEXT: STRXui killed renamable $x8, target-index(saved-streaming-vg-slot), 0 + ; CHECK-REGALLOC-NEXT: BL &__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def dead $lr, implicit $sp, implicit-def $x0 ; CHECK-REGALLOC-NEXT: STRQui $q0, %stack.0, 0 :: (store (s128) into %stack.0) - ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: STRXui killed $x0, target-index(saved-vg-slot), 0 + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0 ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, 
implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr ; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) ; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0 %sa = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %a, i64 0) diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll index c4440e7bcc3ff..0545b3110e461 100644 --- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll +++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll @@ -14,9 +14,9 @@ define void @streaming_compatible() #0 { ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_get_current_vg -; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill +; CHECK-NEXT: str x0, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB0_2 @@ -28,10 +28,9 @@ define void @streaming_compatible() #0 { ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: 
ret @@ -52,10 +51,10 @@ define void @streaming_compatible_arg(float %f) #0 { ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_get_current_vg -; CHECK-NEXT: stp x0, x19, [sp, #88] // 16-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: str x0, [sp, #96] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB1_2 @@ -68,10 +67,9 @@ define void @streaming_compatible_arg(float %f) #0 { ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll index 980144d6ca584..a5f51926899dd 100644 --- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll @@ -9,9 +9,8 @@ declare void @my_func2( %v) define void @fbyte( %v) #0{ ; NOPAIR-LABEL: fbyte: ; NOPAIR: // %bb.0: -; NOPAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; NOPAIR-NEXT: cntd x9 -; NOPAIR-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill +; NOPAIR-NEXT: str x29, [sp, #-32]! 
// 8-byte Folded Spill +; NOPAIR-NEXT: stp x30, x19, [sp, #8] // 16-byte Folded Spill ; NOPAIR-NEXT: addvl sp, sp, #-18 ; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -42,7 +41,10 @@ define void @fbyte( %v) #0{ ; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: addvl sp, sp, #-1 +; NOPAIR-NEXT: cntd x8 +; NOPAIR-NEXT: addvl x9, sp, #19 ; NOPAIR-NEXT: str z0, [sp] // 16-byte Folded Spill +; NOPAIR-NEXT: str x8, [x9, #24] ; NOPAIR-NEXT: bl __arm_sme_state ; NOPAIR-NEXT: and x19, x0, #0x1 ; NOPAIR-NEXT: tbz w19, #0, .LBB0_2 @@ -85,15 +87,14 @@ define void @fbyte( %v) #0{ ; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; NOPAIR-NEXT: addvl sp, sp, #18 -; NOPAIR-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload -; NOPAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; NOPAIR-NEXT: ldp x30, x19, [sp, #8] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload ; NOPAIR-NEXT: ret ; ; PAIR-LABEL: fbyte: ; PAIR: // %bb.0: -; PAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; PAIR-NEXT: cntd x9 -; PAIR-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill +; PAIR-NEXT: str x29, [sp, #-32]! 
// 8-byte Folded Spill +; PAIR-NEXT: stp x30, x19, [sp, #8] // 16-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-18 ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -124,7 +125,10 @@ define void @fbyte( %v) #0{ ; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-1 +; PAIR-NEXT: cntd x8 +; PAIR-NEXT: addvl x9, sp, #19 ; PAIR-NEXT: str z0, [sp] // 16-byte Folded Spill +; PAIR-NEXT: str x8, [x9, #24] ; PAIR-NEXT: bl __arm_sme_state ; PAIR-NEXT: and x19, x0, #0x1 ; PAIR-NEXT: tbz w19, #0, .LBB0_2 @@ -167,8 +171,8 @@ define void @fbyte( %v) #0{ ; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #18 -; PAIR-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload -; PAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; PAIR-NEXT: ldp x30, x19, [sp, #8] // 16-byte Folded Reload +; PAIR-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload ; PAIR-NEXT: ret call void @my_func2( %v) ret void @@ -178,9 +182,9 @@ define void @fhalf( %v) #1{ ; NOPAIR-LABEL: fhalf: ; NOPAIR: // %bb.0: ; NOPAIR-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; NOPAIR-NEXT: cntd x9 -; NOPAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; NOPAIR-NEXT: addvl sp, sp, #-18 +; NOPAIR-NEXT: cntd x8 +; NOPAIR-NEXT: addvl x9, sp, #18 ; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -209,6 +213,7 @@ define void @fhalf( %v) #1{ ; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str x8, [x9, #16] ; NOPAIR-NEXT: smstop sm ; NOPAIR-NEXT: bl my_func ; NOPAIR-NEXT: smstart sm @@ -247,21 +252,21 @@ define void @fhalf( %v) #1{ ; PAIR-LABEL: fhalf: ; PAIR: // %bb.0: ; PAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; PAIR-NEXT: cntd x9 -; PAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-18 ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: cntd x8 +; PAIR-NEXT: addvl x9, sp, #18 ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; 
PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill @@ -272,6 +277,7 @@ define void @fhalf( %v) #1{ ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str x8, [x9, #16] ; PAIR-NEXT: smstop sm ; PAIR-NEXT: bl my_func ; PAIR-NEXT: smstart sm @@ -308,14 +314,13 @@ define void @ffloat( %v) #2 { ; NOPAIR-LABEL: ffloat: ; NOPAIR: // %bb.0: ; NOPAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; NOPAIR-NEXT: rdsvl x9, #1 -; NOPAIR-NEXT: lsr x9, x9, #3 -; NOPAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; NOPAIR-NEXT: cntd x9 -; NOPAIR-NEXT: str x9, [sp, #24] // 8-byte Folded Spill ; NOPAIR-NEXT: addsvl sp, sp, #-18 +; NOPAIR-NEXT: rdsvl x8, #1 +; NOPAIR-NEXT: addsvl x10, sp, #18 ; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: lsr x8, x8, #3 ; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: cntd x9 ; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill ; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill ; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill @@ -342,6 +347,9 @@ define void @ffloat( %v) #2 { ; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str x8, [x10, #16] +; NOPAIR-NEXT: addsvl x8, sp, #18 +; NOPAIR-NEXT: str x9, [x8, #24] ; NOPAIR-NEXT: smstart sm ; NOPAIR-NEXT: smstop sm ; NOPAIR-NEXT: bl my_func @@ -380,14 +388,13 @@ define void @ffloat( %v) 
#2 { ; PAIR-LABEL: ffloat: ; PAIR: // %bb.0: ; PAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; PAIR-NEXT: rdsvl x9, #1 -; PAIR-NEXT: lsr x9, x9, #3 -; PAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; PAIR-NEXT: cntd x9 -; PAIR-NEXT: str x9, [sp, #24] // 8-byte Folded Spill ; PAIR-NEXT: addsvl sp, sp, #-18 +; PAIR-NEXT: rdsvl x8, #1 +; PAIR-NEXT: addsvl x10, sp, #18 ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: lsr x8, x8, #3 ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: cntd x9 ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill @@ -414,6 +421,9 @@ define void @ffloat( %v) #2 { ; PAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str x8, [x10, #16] +; PAIR-NEXT: addsvl x8, sp, #18 +; PAIR-NEXT: str x9, [x8, #24] ; PAIR-NEXT: smstart sm ; PAIR-NEXT: smstop sm ; PAIR-NEXT: bl my_func diff --git a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll index a08e4896f5ee9..17a85f580bf40 100644 --- a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll +++ b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll @@ -1,20 +1,17 @@ -; RUN: llc -mtriple=aarch64-darwin -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); define void @locally_streaming_fn() #0 { ; CHECK-LABEL: locally_streaming_fn: -; 
CHECK: ; %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-96]! ; 16-byte Folded Spill +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: stp d13, d12, [sp, #16] ; 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 -; CHECK-NEXT: stp d11, d10, [sp, #32] ; 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] ; 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] ; 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] ; 8-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 @@ -25,18 +22,23 @@ define void @locally_streaming_fn() #0 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_offset vg, -24 ; CHECK-NEXT: smstop sm -; CHECK-NEXT: bl _normal_callee +; CHECK-NEXT: bl normal_callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_restore vg ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] ; 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] ; 8-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] ; 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] ; 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 ; 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] 
// 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 ; CHECK-NEXT: .cfi_restore b8 diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 4a52bf27a7591..ce73f895738e8 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -17,8 +17,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: cntd x9 -; CHECK-FISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: cntd x8 +; CHECK-FISEL-NEXT: str x8, [sp, #88] ; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstart sm ; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -45,8 +45,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: cntd x9 -; CHECK-GISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: cntd x8 +; CHECK-GISEL-NEXT: str x8, [sp, #88] ; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstart sm ; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -80,8 +80,8 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: cntd x9 -; 
CHECK-COMMON-NEXT: str x9, [sp, #88] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: cntd x8 +; CHECK-COMMON-NEXT: str x8, [sp, #88] ; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -114,11 +114,11 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: rdsvl x9, #1 -; CHECK-COMMON-NEXT: lsr x9, x9, #3 -; CHECK-COMMON-NEXT: str x9, [sp, #104] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: cntd x9 -; CHECK-COMMON-NEXT: str x9, [sp, #112] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: lsr x8, x8, #3 +; CHECK-COMMON-NEXT: str x8, [sp, #104] +; CHECK-COMMON-NEXT: cntd x8 +; CHECK-COMMON-NEXT: str x8, [sp, #112] ; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload @@ -182,11 +182,11 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: rdsvl x9, #1 -; CHECK-COMMON-NEXT: lsr x9, x9, #3 -; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: cntd x9 -; CHECK-COMMON-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: lsr x8, x8, #3 +; CHECK-COMMON-NEXT: str x8, [sp, #72] +; CHECK-COMMON-NEXT: cntd x8 +; CHECK-COMMON-NEXT: str x8, [sp, #80] ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: blr x0 ; CHECK-COMMON-NEXT: smstop sm @@ -208,8 +208,8 @@ define void 
@normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: cntd x9 -; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: cntd x8 +; CHECK-COMMON-NEXT: str x8, [sp, #72] ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: blr x0 ; CHECK-COMMON-NEXT: smstop sm @@ -339,13 +339,14 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-LABEL: f128_call_sm: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: sub sp, sp, #112 -; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: cntd x8 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-COMMON-NEXT: str x8, [sp, #104] ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload ; CHECK-COMMON-NEXT: bl __addtf3 @@ -403,13 +404,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-LABEL: frem_call_sm: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: sub sp, sp, #96 -; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: cntd x8 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp x30, x9, [sp, 
#80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str x8, [sp, #88] ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf @@ -432,14 +434,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-LABEL: frem_call_sm_compat: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: sub sp, sp, #112 -; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: cntd x8 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str x8, [sp, #96] ; CHECK-COMMON-NEXT: bl __arm_sme_state ; CHECK-COMMON-NEXT: and x19, x0, #0x1 ; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 @@ -454,10 +456,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: .LBB12_4: ; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded 
Reload ; CHECK-COMMON-NEXT: add sp, sp, #112 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index e463e833bdbde..018f5f801d09c 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -128,18 +128,18 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-LABEL: test_lazy_save_and_conditional_smstart: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: cntd x10 ; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: str x10, [x29, #32] ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-80] ; CHECK-NEXT: sub x9, x29, #80 @@ -167,7 +167,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: .LBB3_6: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: sub sp, x29, #64 -; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll new file mode 100644 index 0000000000000..2d49ea542b96e --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -O0 < %s | FileCheck %s + +; Example of locally streaming function that (at -O0) must preserve the LR (X30) +; before calling __arm_get_current_vg. +define void @foo() "aarch64_pstate_sm_body" { +; CHECK-LABEL: foo: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: 
.cfi_restore b15 +; CHECK-NEXT: ret + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 130a316bcc2ba..d6096ce86233e 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -11,11 +11,12 @@ define void @test0(ptr %callee) nounwind { ; CHECK-LABEL: test0: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl callee_sm ; CHECK-NEXT: bl callee_sm @@ -36,11 +37,12 @@ define void @test1() nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test1: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: bl callee @@ -62,12 +64,12 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB2_2 @@ -90,10 +92,9 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: // %bb.7: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB2_8: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -107,12 +108,12 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB3_2 @@ -146,10 +147,9 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: // %bb.11: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB3_12: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -164,11 +164,12 @@ define void @test4() nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test4: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: fmov s0, wzr ; CHECK-NEXT: bl callee_farg @@ -191,13 +192,14 @@ define void @test5(float %f) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #88] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl callee_farg @@ -220,13 +222,14 @@ define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test6: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #88] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; 
CHECK-NEXT: bl callee_farg_fret @@ -279,11 +282,12 @@ define void @test8() nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -322,14 +326,10 @@ define void @test10() "aarch64_pstate_sm_body" { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 @@ -340,6 +340,11 @@ define void @test10() "aarch64_pstate_sm_body" { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_restore vg @@ -375,13 +380,13 @@ define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -390,10 +395,9 @@ define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -411,14 +415,10 @@ define void @test12() "aarch64_pstate_sm_body" { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 @@ -429,6 +429,11 @@ define void @test12() "aarch64_pstate_sm_body" { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop za ; CHECK-NEXT: .cfi_offset vg, -24 @@ -467,15 +472,17 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: test13: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [x9, #88] ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload @@ -490,8 +497,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index 5ea5e3e7766e8..622877d48fb35 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -16,15 +16,17 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str x8, [x9, #88] ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 @@ -32,8 +34,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -49,15 +51,17 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str x8, [x9, #88] ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 @@ -65,8 +69,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -82,15 +86,17 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str x8, [x9, #88] ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i32 @@ -98,8 +104,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -115,15 +121,17 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str x8, [x9, #88] ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 @@ -131,8 +139,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -148,19 +156,21 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload @@ -171,8 +181,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -188,19 +198,21 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload @@ -211,8 +223,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -228,19 +240,21 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -251,8 +265,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -273,19 +287,21 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -296,8 +312,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -314,19 +330,21 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -337,8 +355,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -355,19 +373,21 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -378,8 +398,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -396,19 +416,21 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -419,8 +441,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -437,19 +459,21 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload @@ -460,8 +484,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -478,19 +502,21 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -501,8 +527,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -519,19 +545,21 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -542,8 +570,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -564,19 +592,21 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -587,8 +617,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -604,19 +634,21 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -627,8 +659,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -644,19 +676,21 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -667,8 +701,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -684,19 +718,21 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -707,8 +743,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -724,19 +760,21 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -747,8 +785,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -764,19 +802,21 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8bf16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -787,8 +827,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -804,19 +844,21 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -827,8 +869,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -844,19 +886,21 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -867,8 +911,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str z0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -887,23 +931,25 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8i1: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: and z1.b, z1.b, #0x1 ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 ; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload @@ -914,8 +960,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -935,22 +981,21 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -964,22 +1009,21 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -993,22 +1037,21 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -1022,22 +1065,21 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm ; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -1051,25 +1093,24 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_f16 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte 
Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1084,24 +1125,23 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_f32 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 
16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1116,24 +1156,23 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_f64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1152,24 +1191,23 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded 
Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i8 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1185,24 +1223,23 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i16 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 
16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1218,24 +1255,23 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i32 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1251,24 +1287,23 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: 
cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1284,25 +1319,24 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1f16 ; CHECK-NEXT: str 
h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1318,24 +1352,23 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1f32 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload 
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1351,24 +1384,23 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1f64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1388,25 +1420,24 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; 
CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v16i8 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1421,25 +1452,24 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v8i16 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 
def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1454,25 +1484,24 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v4i32 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte 
Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1487,25 +1516,24 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v2i64 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1520,25 +1548,24 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: 
stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v8f16 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1553,25 +1580,24 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v4f32 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; 
CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 @@ -1586,25 +1612,24 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x8, [sp, #96] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v2f64 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 diff --git 
a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll index 1a49da84c00ce..036baff0b6e5a 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -9,14 +9,15 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar ; CHECK-LABEL: sm_body_sm_compatible_simple: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: lsr x8, x8, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x8, x0, #0x1 ; CHECK-NEXT: tbnz w8, #0, .LBB0_2 @@ -41,14 +42,15 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate ; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: lsr x8, x8, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] +; CHECK-NEXT: str x9, [sp, #88] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB1_2 @@ -62,10 +64,9 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -79,14 +80,15 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: cntd x10 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] +; CHECK-NEXT: str x10, [sp, #88] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB2_2 @@ -100,10 +102,9 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block ; CHECK-NEXT: // %bb.4: // %if.else ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_5: // %if.else +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -115,10 +116,9 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block ; CHECK-NEXT: // %bb.7: // %if.then ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_8: // %if.then +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; 
CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index dd336e0f2e686..5fe9066192084 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -9,14 +9,15 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body ; CHECK-LABEL: locally_streaming_caller_streaming_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: lsr x8, x8, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_compatible_callee ; CHECK-NEXT: bl streaming_compatible_callee @@ -51,33 +52,31 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_ define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_multiple_exit: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: lsr x9, x9, #3 -; CHECK-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #64] +; CHECK-NEXT: str x9, [sp, #72] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: cmp x0, #1 ; CHECK-NEXT: b.ne .LBB2_2 ; CHECK-NEXT: // %bb.1: // %if.else ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_2: // %if.end ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -99,16 +98,16 @@ 
define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta ; CHECK-LABEL: locally_streaming_caller_no_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 -; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] +; CHECK-NEXT: str x9, [sp, #88] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: index z0.d, #0, #1 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload @@ -118,11 +117,11 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret @@ 
-156,16 +155,17 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i ; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: lsr x8, x8, #3 ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #88] +; CHECK-NEXT: str x9, [sp, #96] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl streaming_compatible_callee_vec_args_ret @@ -189,16 +189,17 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: lsr x8, x8, #3 ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #112] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #104] +; CHECK-NEXT: str x9, [sp, #112] ; 
CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret @@ -225,16 +226,18 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" ; CHECK-LABEL: locally_streaming_caller_alloca: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill ; CHECK-NEXT: addsvl sp, sp, #-1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: addsvl x10, sp, #1 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: str x8, [x10, #80] +; CHECK-NEXT: addsvl x8, sp, #1 +; CHECK-NEXT: str x9, [x8, #88] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: bl use_ptr @@ -272,16 +275,16 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat ; CHECK-LABEL: test_arg_survives_loop: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 -; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, 
[sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] +; CHECK-NEXT: str x9, [sp, #88] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB9_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -293,11 +296,11 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat ; CHECK-NEXT: fadd s0, s1, s0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: @@ -319,14 +322,15 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: lsr x8, x8, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_compatible_callee ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index e967f3b7be5e8..17f08ddfa04ef 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -37,12 +37,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp ; CHECK-LABEL: streaming_compatible_caller_normal_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB1_2 @@ -54,10 +54,9 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -76,12 +75,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c ; CHECK-LABEL: streaming_compatible_caller_streaming_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB2_2 @@ -93,10 +92,9 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -130,17 +128,19 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-LABEL: streaming_compatible_with_neon_vectors: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload @@ -167,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -183,9 +183,8 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>) define @streaming_compatible_with_scalable_vectors( %arg) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: streaming_compatible_with_scalable_vectors: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -216,7 +215,10 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_scalable_vectors( @normal_callee_scalable_vec_arg( %arg) %fadd = fadd %res, %arg @@ -276,9 +278,8 @@ declare @normal_callee_scalable_vec_arg( @streaming_compatible_with_predicate_vectors( %arg) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: streaming_compatible_with_predicate_vectors: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -309,7 +310,10 @@ define @streaming_compatible_with_predicate_vectors( @streaming_compatible_with_predicate_vectors( @normal_callee_predicate_vec_arg( %arg) %and = and %res, %arg @@ -370,12 +374,12 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl ; CHECK-LABEL: conditional_smstart_unreachable_block: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB7_2 @@ -394,15 +398,15 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: conditional_smstart_no_successor_block: ; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: tbz w0, #0, .LBB8_6 ; CHECK-NEXT: // %bb.1: // %if.then ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB8_3 @@ -414,10 +418,9 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co ; CHECK-NEXT: // %bb.4: // %if.then ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB8_5: // %if.then +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: .LBB8_6: // %exit @@ -436,12 +439,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB9_2 @@ -453,10 +456,9 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB9_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -470,14 +472,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -24 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: 
.cfi_offset b9, -48 @@ -487,13 +487,15 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: cntd x10 ; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: str x10, [sp, #112] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm @@ -508,10 +510,9 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB10_4: // %entry ; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #128 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 438b941198449..3b03dab5fbbe6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -22,11 +22,12 @@ define void @normal_caller_streaming_callee() nounwind { ; CHECK-LABEL: normal_caller_streaming_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm @@ -48,11 +49,12 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable ; CHECK-LABEL: streaming_caller_normal_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl normal_callee ; CHECK-NEXT: smstart sm @@ -105,11 +107,12 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind { ; CHECK-LABEL: call_to_function_pointer_streaming_enabled: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstop sm @@ -128,13 +131,14 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind { ; CHECK-LABEL: smstart_clobber_simdfp: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #88] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm @@ -155,8 +159,6 @@ define @smstart_clobber_sve( %x) nounwind { ; CHECK-LABEL: smstart_clobber_sve: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -187,7 +189,10 @@ define @smstart_clobber_sve( %x) nounwind { ; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #19 ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [x9, #16] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm @@ -234,8 +239,6 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-LABEL: smstart_clobber_sve_duplicate: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -266,7 +269,10 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #19 ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [x9, #16] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: bl streaming_callee @@ -314,13 +320,15 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, 
d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #88] +; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-NEXT: bl cos @@ -349,11 +357,12 @@ define void @disable_tailcallopt() nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm @@ -371,14 +380,15 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-LABEL: call_to_non_streaming_pass_sve_objects: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #3 ; CHECK-NEXT: rdsvl x3, #1 +; CHECK-NEXT: str x8, [x9, #80] ; CHECK-NEXT: addvl x0, sp, #2 ; CHECK-NEXT: addvl x1, sp, #1 ; CHECK-NEXT: mov x2, sp @@ -409,14 +419,15 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-LABEL: call_to_non_streaming_pass_args: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #104] ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index fe3f493353b50..d087b23a64921 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -15,15 +15,17 @@ define void 
@test_no_stackslot_scavenging(float %f) #0 { ; CHECK-LABEL: test_no_stackslot_scavenging: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x24, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #1 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: str x8, [x9, #104] ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: smstop sm @@ -32,8 +34,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: smstart sm ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x24, [sp, #72] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -49,20 +51,20 @@ define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-po ; CHECK-LABEL: test_no_stackslot_scavenging_with_fp: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-128]! 
// 16-byte Folded Spill -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: stp x28, x25, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: stp x28, x25, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: lsl x9, x0, #3 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str s0, [x29, #60] // 4-byte Folded Spill +; CHECK-NEXT: str x8, [x29, #48] ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: str s0, [x29, #28] // 4-byte Folded Spill ; CHECK-NEXT: add x9, x9, #15 ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ; CHECK-NEXT: sub x8, x8, x9 @@ -70,12 +72,12 @@ define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-po ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr s0, [x29, #28] // 4-byte Folded Reload +; CHECK-NEXT: ldr s0, [x29, #60] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f ; CHECK-NEXT: smstart sm ; CHECK-NEXT: sub sp, x29, #64 -; CHECK-NEXT: ldp x24, x19, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: ldp x28, x25, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x25, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index 0853325e449af..91f7ce05b80bc 100644 --- 
a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -17,11 +17,10 @@ define void @vg_unwind_simple() #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: .cfi_offset b8, -24 ; CHECK-NEXT: .cfi_offset b9, -32 @@ -31,6 +30,8 @@ define void @vg_unwind_simple() #0 { ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: .cfi_offset vg, -8 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee @@ -57,12 +58,10 @@ define void @vg_unwind_simple() #0 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 ; FP-CHECK-NEXT: .cfi_offset w30, -24 @@ -75,6 +74,8 @@ define void @vg_unwind_simple() #0 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: cntd x8 +; FP-CHECK-NEXT: str x8, [x29, #16] ; FP-CHECK-NEXT: .cfi_offset vg, -16 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: bl callee @@ -101,7 +102,6 @@ define void @vg_unwind_simple() #0 { ; ; OUTLINER-CHECK-LABEL: vg_unwind_simple: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void @callee(); ret void; } @@ -114,13 +114,11 @@ define void @vg_unwind_needs_gap() #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: stp x30, x20, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w20, -24 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 @@ -130,17 +128,18 @@ define void @vg_unwind_needs_gap() #0 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP -; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp x30, x20, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -160,15 +159,14 @@ define void @vg_unwind_needs_gap() #0 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: stp x9, x20, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 -; FP-CHECK-NEXT: .cfi_offset w20, -8 +; FP-CHECK-NEXT: .cfi_offset w20, -16 ; FP-CHECK-NEXT: .cfi_offset w30, -24 ; FP-CHECK-NEXT: .cfi_offset w29, -32 ; FP-CHECK-NEXT: .cfi_offset b8, -40 @@ -179,16 +177,18 @@ define void @vg_unwind_needs_gap() #0 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: cntd x8 +; FP-CHECK-NEXT: str x8, [x29, #24] ; FP-CHECK-NEXT: //APP ; FP-CHECK-NEXT: //NO_APP -; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: .cfi_offset vg, -8 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: bl callee ; FP-CHECK-NEXT: smstart sm ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 ; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; FP-CHECK-NEXT: ldr x20, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload ; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -209,7 +209,6 @@ define void @vg_unwind_needs_gap() #0 { ; ; OUTLINER-CHECK-LABEL: vg_unwind_needs_gap: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void asm sideeffect "", "~{x20}"() call void @callee(); ret void; @@ -220,12 +219,11 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: 
.cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: .cfi_offset b8, -24 ; CHECK-NEXT: .cfi_offset b9, -32 @@ -235,7 +233,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [sp, #88] ; CHECK-NEXT: .cfi_offset vg, -8 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -264,13 +264,11 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: sub sp, sp, #112 ; FP-CHECK-NEXT: .cfi_def_cfa_offset 112 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill -; FP-CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #80 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 ; FP-CHECK-NEXT: .cfi_offset w30, -24 @@ -283,7 +281,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: cntd x8 ; FP-CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x8, [x29, #16] ; FP-CHECK-NEXT: .cfi_offset vg, -16 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: 
ldr q0, [sp] // 16-byte Folded Reload @@ -312,7 +312,6 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { ; ; OUTLINER-CHECK-LABEL: vg_unwind_with_fixed_args: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void @fixed_callee(<4 x i32> %x); ret void; } @@ -320,11 +319,10 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-LABEL: vg_unwind_with_sve_args: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: stp x30, x28, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w28, -16 ; CHECK-NEXT: .cfi_offset w30, -24 ; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: addvl sp, sp, #-18 @@ -361,10 +359,13 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: addvl x9, sp, #19 ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str x8, [x9, #24] ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP -; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset vg, -8 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl scalable_callee @@ -404,8 +405,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: .cfi_restore z13 ; CHECK-NEXT: .cfi_restore z14 ; CHECK-NEXT: .cfi_restore z15 -; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x28, [sp, #8] // 16-byte 
Folded Reload +; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w28 ; CHECK-NEXT: .cfi_restore w30 @@ -416,13 +417,11 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 48 -; FP-CHECK-NEXT: cntd x9 -; FP-CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; FP-CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: mov x29, sp ; FP-CHECK-NEXT: .cfi_def_cfa w29, 48 -; FP-CHECK-NEXT: .cfi_offset w27, -8 -; FP-CHECK-NEXT: .cfi_offset w28, -16 +; FP-CHECK-NEXT: .cfi_offset w27, -24 +; FP-CHECK-NEXT: .cfi_offset w28, -32 ; FP-CHECK-NEXT: .cfi_offset w30, -40 ; FP-CHECK-NEXT: .cfi_offset w29, -48 ; FP-CHECK-NEXT: addvl sp, sp, #-18 @@ -457,10 +456,12 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG ; FP-CHECK-NEXT: addvl sp, sp, #-1 +; FP-CHECK-NEXT: cntd x8 ; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x8, [x29, #32] ; FP-CHECK-NEXT: //APP ; FP-CHECK-NEXT: //NO_APP -; FP-CHECK-NEXT: .cfi_offset vg, -32 +; FP-CHECK-NEXT: .cfi_offset vg, -16 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; FP-CHECK-NEXT: bl scalable_callee @@ -499,7 +500,7 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: .cfi_restore z14 ; FP-CHECK-NEXT: .cfi_restore z15 ; FP-CHECK-NEXT: .cfi_def_cfa wsp, 48 -; FP-CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload ; 
FP-CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload ; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 ; FP-CHECK-NEXT: .cfi_restore w27 @@ -510,7 +511,6 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; ; OUTLINER-CHECK-LABEL: vg_unwind_with_sve_args: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void asm sideeffect "", "~{x28}"() call void @scalable_callee( %x); ret void; @@ -524,12 +524,10 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset w30, -24 ; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: .cfi_offset b8, -40 @@ -550,8 +548,12 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { ; CHECK-NEXT: b.ne .LBB4_1 ; CHECK-NEXT: // %bb.2: // %entry ; CHECK-NEXT: .cfi_def_cfa_register wsp -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add x10, sp, #72, lsl #12 // =294912 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: add x10, x10, #88 +; CHECK-NEXT: str x8, [x10, #32760] +; CHECK-NEXT: str x9, [x0] ; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee @@ -581,15 +583,14 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { ; FP-CHECK: // %bb.0: // %entry ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x28, [sp, #80] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 -; FP-CHECK-NEXT: .cfi_offset w28, -8 +; FP-CHECK-NEXT: .cfi_offset w28, -16 ; FP-CHECK-NEXT: .cfi_offset w30, -24 ; FP-CHECK-NEXT: .cfi_offset w29, -32 ; FP-CHECK-NEXT: .cfi_offset b8, -40 @@ -608,9 +609,11 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { ; FP-CHECK-NEXT: str xzr, [sp] ; FP-CHECK-NEXT: b.ne .LBB4_1 ; FP-CHECK-NEXT: // %bb.2: // %entry -; FP-CHECK-NEXT: mov x8, sp -; FP-CHECK-NEXT: str x8, [x0] -; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: cntd x8 +; FP-CHECK-NEXT: mov x9, sp +; FP-CHECK-NEXT: str x8, [x29, #24] +; FP-CHECK-NEXT: str x9, [x0] +; FP-CHECK-NEXT: .cfi_offset vg, -8 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: bl callee ; FP-CHECK-NEXT: smstart sm @@ -618,7 +621,7 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { ; FP-CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 ; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 ; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; FP-CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldr x28, [sp, #80] // 8-byte Folded Reload ; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -639,7 +642,6 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { ; ; OUTLINER-CHECK-LABEL: vg_unwind_multiple_scratch_regs: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; entry: 
%v = alloca i8, i64 327680, align 1 store ptr %v, ptr %out, align 8 @@ -655,14 +657,10 @@ define void @vg_locally_streaming_fn() #3 { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 @@ -673,6 +671,11 @@ define void @vg_locally_streaming_fn() #3 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: lsr x8, x8, #3 +; CHECK-NEXT: str x8, [sp, #72] +; CHECK-NEXT: str x9, [sp, #80] ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_restore vg @@ -701,15 +704,10 @@ define void @vg_locally_streaming_fn() #3 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: rdsvl x9, #1 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; FP-CHECK-NEXT: lsr x9, x9, #3 ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 ; FP-CHECK-NEXT: .cfi_offset vg, -8 @@ -723,6 +721,11 @@ define void @vg_locally_streaming_fn() #3 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: rdsvl x8, #1 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: lsr x8, x8, #3 +; FP-CHECK-NEXT: str x8, [x29, #16] +; FP-CHECK-NEXT: str x9, [x29, #24] ; FP-CHECK-NEXT: bl callee ; FP-CHECK-NEXT: smstart sm ; FP-CHECK-NEXT: .cfi_restore vg @@ -751,7 +754,6 @@ define void @vg_locally_streaming_fn() #3 { ; ; OUTLINER-CHECK-LABEL: vg_locally_streaming_fn: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void @callee() call void @streaming_callee() call void @callee() @@ -763,13 +765,11 @@ define void @streaming_compatible_to_streaming() #4 { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -24 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 @@ -779,9 +779,11 @@ define void @streaming_compatible_to_streaming() #4 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: tbnz w19, #0, .LBB6_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm @@ -792,10 +794,9 @@ define void @streaming_compatible_to_streaming() #4 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB6_4: ; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -815,15 +816,14 @@ define void @streaming_compatible_to_streaming() #4 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 -; FP-CHECK-NEXT: .cfi_offset w19, -8 +; FP-CHECK-NEXT: .cfi_offset w19, -16 ; FP-CHECK-NEXT: .cfi_offset w30, -24 ; FP-CHECK-NEXT: .cfi_offset w29, -32 ; FP-CHECK-NEXT: .cfi_offset b8, -40 @@ -834,9 +834,11 @@ define void @streaming_compatible_to_streaming() #4 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: cntd x8 +; FP-CHECK-NEXT: str x8, [x29, #24] ; FP-CHECK-NEXT: bl __arm_sme_state ; FP-CHECK-NEXT: and x19, x0, #0x1 -; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: .cfi_offset vg, -8 ; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2 ; FP-CHECK-NEXT: // %bb.1: ; FP-CHECK-NEXT: smstart sm @@ -849,7 +851,7 @@ define void @streaming_compatible_to_streaming() #4 { ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 ; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -870,7 +872,6 @@ define void @streaming_compatible_to_streaming() #4 { ; ; OUTLINER-CHECK-LABEL: streaming_compatible_to_streaming: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void @streaming_callee() ret void } @@ -880,13 +881,11 @@ 
define void @streaming_compatible_to_non_streaming() #4 { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -24 ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 @@ -896,9 +895,11 @@ define void @streaming_compatible_to_non_streaming() #4 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [sp, #80] ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: tbz w19, #0, .LBB7_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -909,10 +910,9 @@ define void @streaming_compatible_to_non_streaming() #4 { ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -932,15 +932,14 @@ define void @streaming_compatible_to_non_streaming() #4 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 -; FP-CHECK-NEXT: .cfi_offset w19, -8 +; FP-CHECK-NEXT: .cfi_offset w19, -16 ; FP-CHECK-NEXT: .cfi_offset w30, -24 ; FP-CHECK-NEXT: .cfi_offset w29, -32 ; FP-CHECK-NEXT: .cfi_offset b8, -40 @@ -951,9 +950,11 @@ define void @streaming_compatible_to_non_streaming() #4 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: cntd x8 +; FP-CHECK-NEXT: str x8, [x29, #24] ; FP-CHECK-NEXT: bl __arm_sme_state ; FP-CHECK-NEXT: and x19, x0, #0x1 -; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: .cfi_offset vg, -8 ; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2 ; FP-CHECK-NEXT: // %bb.1: ; FP-CHECK-NEXT: smstop sm @@ -966,7 +967,7 @@ define void @streaming_compatible_to_non_streaming() #4 { ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 ; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -987,7 +988,6 @@ define void @streaming_compatible_to_non_streaming() #4 { ; ; OUTLINER-CHECK-LABEL: streaming_compatible_to_non_streaming: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void @callee() ret void } @@ -1003,16 
+1003,13 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 { ; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 96 ; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: mov x9, x0 ; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg -; NO-SVE-CHECK-NEXT: stp x0, x19, [sp, #80] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: mov x0, x9 +; NO-SVE-CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; NO-SVE-CHECK-NEXT: add x29, sp, #64 ; NO-SVE-CHECK-NEXT: .cfi_def_cfa w29, 32 -; NO-SVE-CHECK-NEXT: .cfi_offset w19, -8 +; NO-SVE-CHECK-NEXT: .cfi_offset w19, -16 ; NO-SVE-CHECK-NEXT: .cfi_offset w30, -24 ; NO-SVE-CHECK-NEXT: .cfi_offset w29, -32 ; NO-SVE-CHECK-NEXT: .cfi_offset b8, -40 @@ -1023,10 +1020,12 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 { ; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80 ; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88 ; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96 +; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg ; NO-SVE-CHECK-NEXT: mov w8, w0 +; NO-SVE-CHECK-NEXT: str x0, [x29, #24] ; NO-SVE-CHECK-NEXT: bl __arm_sme_state ; NO-SVE-CHECK-NEXT: and x19, x0, #0x1 -; NO-SVE-CHECK-NEXT: .cfi_offset vg, -16 +; NO-SVE-CHECK-NEXT: .cfi_offset vg, -8 ; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2 ; NO-SVE-CHECK-NEXT: // %bb.1: ; NO-SVE-CHECK-NEXT: smstart sm @@ -1040,7 +1039,7 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 { ; NO-SVE-CHECK-NEXT: .cfi_restore vg ; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96 ; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; NO-SVE-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, 
#48] // 16-byte Folded Reload ; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -1058,10 +1057,8 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 { ; NO-SVE-CHECK-NEXT: .cfi_restore b14 ; NO-SVE-CHECK-NEXT: .cfi_restore b15 ; NO-SVE-CHECK-NEXT: ret -; ; OUTLINER-CHECK-LABEL: streaming_compatible_no_sve: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ -; call void @streaming_callee_with_arg(i32 %x) ret void } @@ -1072,30 +1069,28 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 { ; user-code as if it is part of the frame-setup when doing so. define void @test_rdsvl_right_after_prologue(i64 %x0) nounwind { ; NO-SVE-CHECK-LABEL: test_rdsvl_right_after_prologue: -; NO-SVE-CHECK: // %bb.0: -; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: mov x9, x0 -; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg -; NO-SVE-CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill -; NO-SVE-CHECK-NEXT: mov x0, x9 -; NO-SVE-CHECK-NEXT: rdsvl x8, #1 -; NO-SVE-CHECK-NEXT: add x29, sp, #64 -; NO-SVE-CHECK-NEXT: lsr x8, x8, #3 -; NO-SVE-CHECK-NEXT: mov x1, x0 -; NO-SVE-CHECK-NEXT: smstart sm -; NO-SVE-CHECK-NEXT: mov x0, x8 -; NO-SVE-CHECK-NEXT: bl bar -; NO-SVE-CHECK-NEXT: smstop sm -; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; NO-SVE-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; NO-SVE-CHECK-NEXT: ret +; NO-SVE-CHECK: 
// %bb.0: +; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: add x29, sp, #64 +; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg +; NO-SVE-CHECK-NEXT: rdsvl x8, #1 +; NO-SVE-CHECK-NEXT: mov x1, x0 +; NO-SVE-CHECK-NEXT: lsr x8, x8, #3 +; NO-SVE-CHECK-NEXT: str x0, [x29, #16] +; NO-SVE-CHECK-NEXT: smstart sm +; NO-SVE-CHECK-NEXT: mov x0, x8 +; NO-SVE-CHECK-NEXT: bl bar +; NO-SVE-CHECK-NEXT: smstop sm +; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ret %some_alloc = alloca i64, align 8 %rdsvl = tail call i64 @llvm.aarch64.sme.cntsd() call void @bar(i64 %rdsvl, i64 %x0) @@ -1112,11 +1107,10 @@ define void @vg_unwind_noasync() #5 { ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: .cfi_offset b8, -24 ; CHECK-NEXT: .cfi_offset b9, -32 @@ -1126,6 +1120,8 @@ define void @vg_unwind_noasync() #5 { ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: str x8, [sp, #72] ; CHECK-NEXT: .cfi_offset vg, -8 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee @@ -1152,12 +1148,10 @@ define void @vg_unwind_noasync() #5 { ; FP-CHECK: // %bb.0: ; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 -; FP-CHECK-NEXT: cntd x9 ; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; FP-CHECK-NEXT: add x29, sp, #64 ; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 ; FP-CHECK-NEXT: .cfi_offset w30, -24 @@ -1170,6 +1164,8 @@ define void @vg_unwind_noasync() #5 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: cntd x8 +; FP-CHECK-NEXT: str x8, [x29, #16] ; FP-CHECK-NEXT: .cfi_offset vg, -16 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: bl callee @@ -1193,6 +1189,7 @@ define void @vg_unwind_noasync() #5 { ; FP-CHECK-NEXT: .cfi_restore b14 ; FP-CHECK-NEXT: .cfi_restore b15 ; FP-CHECK-NEXT: ret +; ; OUTLINER-CHECK-LABEL: vg_unwind_noasync: ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ 
; diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index 3a33405200132..52854b8b01aa4 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -616,36 +616,36 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc ; CHECK0: // %bb.0: // %entry ; CHECK0-NEXT: sub sp, sp, #176 ; CHECK0-NEXT: .cfi_def_cfa_offset 176 -; CHECK0-NEXT: rdsvl x9, #1 -; CHECK0-NEXT: stp d15, d14, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: lsr x9, x9, #3 -; CHECK0-NEXT: stp d13, d12, [sp, #64] // 16-byte Folded Spill -; CHECK0-NEXT: stp d11, d10, [sp, #80] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; CHECK0-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK0-NEXT: str x25, [sp, #96] // 8-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #104] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #120] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #136] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset vg, -16 +; CHECK0-NEXT: .cfi_offset w19, -32 +; CHECK0-NEXT: .cfi_offset w20, -40 +; CHECK0-NEXT: .cfi_offset w21, -48 +; CHECK0-NEXT: .cfi_offset w22, -56 +; CHECK0-NEXT: .cfi_offset w23, -64 +; CHECK0-NEXT: .cfi_offset w24, -72 +; CHECK0-NEXT: .cfi_offset w25, -80 +; CHECK0-NEXT: .cfi_offset b8, -88 +; CHECK0-NEXT: .cfi_offset b9, -96 +; CHECK0-NEXT: .cfi_offset b10, -104 +; CHECK0-NEXT: .cfi_offset b11, -112 +; CHECK0-NEXT: .cfi_offset b12, -120 +; CHECK0-NEXT: .cfi_offset b13, -128 +; CHECK0-NEXT: .cfi_offset b14, -136 +; CHECK0-NEXT: .cfi_offset b15, -144 +; CHECK0-NEXT: rdsvl x8, #1 ; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; CHECK0-NEXT: stp d9, d8, [sp, 
#96] // 16-byte Folded Spill -; CHECK0-NEXT: str x25, [sp, #112] // 8-byte Folded Spill -; CHECK0-NEXT: stp x24, x23, [sp, #128] // 16-byte Folded Spill -; CHECK0-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill -; CHECK0-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w20, -16 -; CHECK0-NEXT: .cfi_offset w21, -24 -; CHECK0-NEXT: .cfi_offset w22, -32 -; CHECK0-NEXT: .cfi_offset w23, -40 -; CHECK0-NEXT: .cfi_offset w24, -48 -; CHECK0-NEXT: .cfi_offset w25, -64 -; CHECK0-NEXT: .cfi_offset b8, -72 -; CHECK0-NEXT: .cfi_offset b9, -80 -; CHECK0-NEXT: .cfi_offset b10, -88 -; CHECK0-NEXT: .cfi_offset b11, -96 -; CHECK0-NEXT: .cfi_offset b12, -104 -; CHECK0-NEXT: .cfi_offset b13, -112 -; CHECK0-NEXT: .cfi_offset b14, -120 -; CHECK0-NEXT: .cfi_offset b15, -128 -; CHECK0-NEXT: .cfi_offset vg, -136 ; CHECK0-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: lsr x8, x8, #3 +; CHECK0-NEXT: str x8, [sp, #152] +; CHECK0-NEXT: str x9, [sp, #160] ; CHECK0-NEXT: smstart sm ; CHECK0-NEXT: //APP ; CHECK0-NEXT: //NO_APP @@ -655,15 +655,15 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc ; CHECK0-NEXT: str x0, [sp, #24] ; CHECK0-NEXT: str d0, [sp, #16] ; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x20, x19, [sp, #136] // 16-byte Folded Reload ; CHECK0-NEXT: mov w0, wzr -; CHECK0-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload -; CHECK0-NEXT: ldr x25, [sp, #112] // 8-byte Folded Reload -; CHECK0-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload -; CHECK0-NEXT: ldp d9, d8, [sp, #96] // 16-byte Folded Reload -; CHECK0-NEXT: ldp d11, d10, [sp, #80] // 16-byte Folded Reload -; CHECK0-NEXT: ldp d13, d12, [sp, #64] // 16-byte Folded Reload -; CHECK0-NEXT: ldp d15, d14, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x22, x21, [sp, #120] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x25, 
[sp, #96] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x24, x23, [sp, #104] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload ; CHECK0-NEXT: add sp, sp, #176 ; CHECK0-NEXT: .cfi_def_cfa_offset 0 ; CHECK0-NEXT: .cfi_restore w19 @@ -687,37 +687,37 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc ; CHECK64: // %bb.0: // %entry ; CHECK64-NEXT: sub sp, sp, #304 ; CHECK64-NEXT: .cfi_def_cfa_offset 304 -; CHECK64-NEXT: rdsvl x9, #1 -; CHECK64-NEXT: stp d15, d14, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: lsr x9, x9, #3 -; CHECK64-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill -; CHECK64-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill -; CHECK64-NEXT: str x9, [sp, #96] // 8-byte Folded Spill +; CHECK64-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #224] // 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #240] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset vg, -8 +; CHECK64-NEXT: .cfi_offset w19, -24 +; CHECK64-NEXT: .cfi_offset w20, -32 +; CHECK64-NEXT: .cfi_offset w21, -40 +; CHECK64-NEXT: .cfi_offset w22, -48 +; CHECK64-NEXT: .cfi_offset w23, -56 +; CHECK64-NEXT: .cfi_offset w24, -64 +; CHECK64-NEXT: .cfi_offset w25, -72 +; CHECK64-NEXT: .cfi_offset w29, -80 +; CHECK64-NEXT: .cfi_offset b8, -152 +; CHECK64-NEXT: .cfi_offset b9, -160 +; CHECK64-NEXT: .cfi_offset b10, -168 +; CHECK64-NEXT: .cfi_offset b11, 
-176 +; CHECK64-NEXT: .cfi_offset b12, -184 +; CHECK64-NEXT: .cfi_offset b13, -192 +; CHECK64-NEXT: .cfi_offset b14, -200 +; CHECK64-NEXT: .cfi_offset b15, -208 +; CHECK64-NEXT: rdsvl x8, #1 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: str x9, [sp, #104] // 8-byte Folded Spill -; CHECK64-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill -; CHECK64-NEXT: stp x29, x25, [sp, #240] // 16-byte Folded Spill -; CHECK64-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill -; CHECK64-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_offset w19, -8 -; CHECK64-NEXT: .cfi_offset w20, -16 -; CHECK64-NEXT: .cfi_offset w21, -24 -; CHECK64-NEXT: .cfi_offset w22, -32 -; CHECK64-NEXT: .cfi_offset w23, -40 -; CHECK64-NEXT: .cfi_offset w24, -48 -; CHECK64-NEXT: .cfi_offset w25, -56 -; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: .cfi_offset b8, -136 -; CHECK64-NEXT: .cfi_offset b9, -144 -; CHECK64-NEXT: .cfi_offset b10, -152 -; CHECK64-NEXT: .cfi_offset b11, -160 -; CHECK64-NEXT: .cfi_offset b12, -168 -; CHECK64-NEXT: .cfi_offset b13, -176 -; CHECK64-NEXT: .cfi_offset b14, -184 -; CHECK64-NEXT: .cfi_offset b15, -192 -; CHECK64-NEXT: .cfi_offset vg, -200 ; CHECK64-NEXT: str d0, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: lsr x8, x8, #3 +; CHECK64-NEXT: str x8, [sp, #288] +; CHECK64-NEXT: str x9, [sp, #296] ; CHECK64-NEXT: smstart sm ; CHECK64-NEXT: //APP ; CHECK64-NEXT: //NO_APP @@ -727,15 +727,15 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc ; CHECK64-NEXT: str x0, [sp, #8] ; CHECK64-NEXT: str d0, [sp, #88] ; CHECK64-NEXT: smstop sm -; CHECK64-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x20, x19, [sp, #272] // 16-byte Folded Reload ; CHECK64-NEXT: mov w0, wzr -; CHECK64-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload -; CHECK64-NEXT: 
ldp x29, x25, [sp, #240] // 16-byte Folded Reload -; CHECK64-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload -; CHECK64-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload -; CHECK64-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload -; CHECK64-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #240] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp, #224] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK64-NEXT: add sp, sp, #304 ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 @@ -758,45 +758,45 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc ; ; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: ; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: rdsvl x9, #1 -; CHECK1024-NEXT: lsr x9, x9, #3 ; CHECK1024-NEXT: sub sp, sp, #1168 ; CHECK1024-NEXT: .cfi_def_cfa_offset 1168 -; CHECK1024-NEXT: str x9, [sp] // 8-byte Folded Spill -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; CHECK1024-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK1024-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK1024-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK1024-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK1024-NEXT: str x29, [sp, #1104] // 8-byte Folded Spill -; CHECK1024-NEXT: str x25, [sp, #1112] // 8-byte Folded Spill -; CHECK1024-NEXT: str x24, [sp, #1120] // 8-byte Folded Spill -; CHECK1024-NEXT: str x23, [sp, #1128] // 8-byte Folded Spill -; CHECK1024-NEXT: str x22, [sp, #1136] // 8-byte Folded Spill -; CHECK1024-NEXT: str x21, [sp, #1144] // 
8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1152] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1160] // 8-byte Folded Spill -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w21, -24 -; CHECK1024-NEXT: .cfi_offset w22, -32 -; CHECK1024-NEXT: .cfi_offset w23, -40 -; CHECK1024-NEXT: .cfi_offset w24, -48 -; CHECK1024-NEXT: .cfi_offset w25, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_offset b8, -1096 -; CHECK1024-NEXT: .cfi_offset b9, -1104 -; CHECK1024-NEXT: .cfi_offset b10, -1112 -; CHECK1024-NEXT: .cfi_offset b11, -1120 -; CHECK1024-NEXT: .cfi_offset b12, -1128 -; CHECK1024-NEXT: .cfi_offset b13, -1136 -; CHECK1024-NEXT: .cfi_offset b14, -1144 -; CHECK1024-NEXT: .cfi_offset b15, -1152 -; CHECK1024-NEXT: .cfi_offset vg, -1160 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1144] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset vg, -8 +; CHECK1024-NEXT: .cfi_offset w19, -24 +; CHECK1024-NEXT: .cfi_offset w20, -32 +; CHECK1024-NEXT: .cfi_offset w21, -40 +; CHECK1024-NEXT: .cfi_offset w22, -48 +; CHECK1024-NEXT: .cfi_offset w23, -56 +; CHECK1024-NEXT: .cfi_offset w24, -64 +; CHECK1024-NEXT: .cfi_offset w25, -72 +; CHECK1024-NEXT: .cfi_offset w29, -80 +; CHECK1024-NEXT: .cfi_offset b8, 
-1112 +; CHECK1024-NEXT: .cfi_offset b9, -1120 +; CHECK1024-NEXT: .cfi_offset b10, -1128 +; CHECK1024-NEXT: .cfi_offset b11, -1136 +; CHECK1024-NEXT: .cfi_offset b12, -1144 +; CHECK1024-NEXT: .cfi_offset b13, -1152 +; CHECK1024-NEXT: .cfi_offset b14, -1160 +; CHECK1024-NEXT: .cfi_offset b15, -1168 ; CHECK1024-NEXT: sub sp, sp, #1056 ; CHECK1024-NEXT: .cfi_def_cfa_offset 2224 +; CHECK1024-NEXT: rdsvl x8, #1 +; CHECK1024-NEXT: cntd x9 ; CHECK1024-NEXT: str d0, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: lsr x8, x8, #3 +; CHECK1024-NEXT: str x8, [sp, #2208] +; CHECK1024-NEXT: str x9, [sp, #2216] ; CHECK1024-NEXT: smstart sm ; CHECK1024-NEXT: //APP ; CHECK1024-NEXT: //NO_APP @@ -809,18 +809,18 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc ; CHECK1024-NEXT: mov w0, wzr ; CHECK1024-NEXT: add sp, sp, #1056 ; CHECK1024-NEXT: .cfi_def_cfa_offset 1168 -; CHECK1024-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr x19, [sp, #1160] // 8-byte Folded Reload -; CHECK1024-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1152] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x21, [sp, #1144] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x22, [sp, #1136] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x23, [sp, #1128] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x24, [sp, #1120] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x25, [sp, #1112] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1104] // 8-byte Folded Reload -; CHECK1024-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK1024-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1144] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1128] // 8-byte 
Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x23, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x24, [sp, #1104] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x25, [sp, #1096] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload ; CHECK1024-NEXT: add sp, sp, #1168 ; CHECK1024-NEXT: .cfi_def_cfa_offset 0 ; CHECK1024-NEXT: .cfi_restore w19 @@ -1572,18 +1572,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK0: // %bb.0: ; CHECK0-NEXT: sub sp, sp, #176 ; CHECK0-NEXT: .cfi_def_cfa_offset 176 -; CHECK0-NEXT: cntd x9 ; CHECK0-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill ; CHECK0-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill ; CHECK0-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill ; CHECK0-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill -; CHECK0-NEXT: stp x30, x9, [sp, #128] // 16-byte Folded Spill -; CHECK0-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill -; CHECK0-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w20, -16 -; CHECK0-NEXT: .cfi_offset w21, -24 -; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: str x30, [sp, #128] // 8-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #136] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #152] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -16 +; CHECK0-NEXT: .cfi_offset w20, -24 +; CHECK0-NEXT: .cfi_offset w21, -32 +; CHECK0-NEXT: .cfi_offset w22, -40 ; CHECK0-NEXT: .cfi_offset w30, -48 ; CHECK0-NEXT: .cfi_offset b8, -56 ; CHECK0-NEXT: .cfi_offset b9, -64 @@ -1593,13 +1592,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK0-NEXT: .cfi_offset b13, -96 ; CHECK0-NEXT: .cfi_offset 
b14, -104 ; CHECK0-NEXT: .cfi_offset b15, -112 +; CHECK0-NEXT: cntd x8 ; CHECK0-NEXT: mov w19, w1 ; CHECK0-NEXT: mov w20, w0 ; CHECK0-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill ; CHECK0-NEXT: stp q2, q3, [sp, #32] // 32-byte Folded Spill +; CHECK0-NEXT: str x8, [sp, #168] ; CHECK0-NEXT: bl __arm_sme_state ; CHECK0-NEXT: and x21, x0, #0x1 -; CHECK0-NEXT: .cfi_offset vg, -40 +; CHECK0-NEXT: .cfi_offset vg, -8 ; CHECK0-NEXT: tbz w21, #0, .LBB27_2 ; CHECK0-NEXT: // %bb.1: ; CHECK0-NEXT: smstop sm @@ -1615,7 +1616,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK0-NEXT: cset w21, lt ; CHECK0-NEXT: bl __arm_sme_state ; CHECK0-NEXT: and x22, x0, #0x1 -; CHECK0-NEXT: .cfi_offset vg, -40 +; CHECK0-NEXT: .cfi_offset vg, -8 ; CHECK0-NEXT: tbz w22, #0, .LBB27_6 ; CHECK0-NEXT: // %bb.5: ; CHECK0-NEXT: smstop sm @@ -1631,9 +1632,9 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK0-NEXT: tst w8, w21 ; CHECK0-NEXT: csel w0, w20, w19, ne ; CHECK0-NEXT: .cfi_restore vg -; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload ; CHECK0-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload -; CHECK0-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x22, x21, [sp, #136] // 16-byte Folded Reload ; CHECK0-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload ; CHECK0-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload ; CHECK0-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload @@ -1659,19 +1660,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK64: // %bb.0: ; CHECK64-NEXT: sub sp, sp, #320 ; CHECK64-NEXT: .cfi_def_cfa_offset 320 -; CHECK64-NEXT: cntd x9 ; CHECK64-NEXT: stp d15, d14, [sp, #128] // 16-byte Folded Spill ; CHECK64-NEXT: stp d13, d12, [sp, #144] // 16-byte Folded Spill ; CHECK64-NEXT: stp d11, d10, [sp, #160] // 16-byte Folded Spill ; 
CHECK64-NEXT: stp d9, d8, [sp, #176] // 16-byte Folded Spill ; CHECK64-NEXT: stp x29, x30, [sp, #256] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x22, [sp, #272] // 16-byte Folded Spill -; CHECK64-NEXT: stp x21, x20, [sp, #288] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #304] // 8-byte Folded Spill -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w20, -24 -; CHECK64-NEXT: .cfi_offset w21, -32 -; CHECK64-NEXT: .cfi_offset w22, -40 +; CHECK64-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -24 +; CHECK64-NEXT: .cfi_offset w20, -32 +; CHECK64-NEXT: .cfi_offset w21, -40 +; CHECK64-NEXT: .cfi_offset w22, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 ; CHECK64-NEXT: .cfi_offset b8, -136 @@ -1682,13 +1681,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK64-NEXT: .cfi_offset b13, -176 ; CHECK64-NEXT: .cfi_offset b14, -184 ; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: cntd x8 ; CHECK64-NEXT: mov w19, w1 ; CHECK64-NEXT: mov w20, w0 ; CHECK64-NEXT: stp q0, q1, [sp, #64] // 32-byte Folded Spill ; CHECK64-NEXT: stp q2, q3, [sp, #96] // 32-byte Folded Spill +; CHECK64-NEXT: str x8, [sp, #304] ; CHECK64-NEXT: bl __arm_sme_state ; CHECK64-NEXT: and x21, x0, #0x1 -; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: .cfi_offset vg, -16 ; CHECK64-NEXT: tbz w21, #0, .LBB27_2 ; CHECK64-NEXT: // %bb.1: ; CHECK64-NEXT: smstop sm @@ -1704,7 +1705,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK64-NEXT: cset w21, lt ; CHECK64-NEXT: bl __arm_sme_state ; CHECK64-NEXT: and x22, x0, #0x1 -; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: .cfi_offset vg, -16 ; CHECK64-NEXT: tbz w22, #0, .LBB27_6 ; CHECK64-NEXT: // %bb.5: ; CHECK64-NEXT: smstop sm @@ -1720,8 +1721,8 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 
%v2, fp128 %v3, i32 %a, i32 ; CHECK64-NEXT: tst w8, w21 ; CHECK64-NEXT: csel w0, w20, w19, ne ; CHECK64-NEXT: .cfi_restore vg -; CHECK64-NEXT: ldp x20, x19, [sp, #296] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x22, x21, [sp, #280] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload ; CHECK64-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload ; CHECK64-NEXT: ldp d9, d8, [sp, #176] // 16-byte Folded Reload ; CHECK64-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload @@ -1749,22 +1750,20 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK1024: // %bb.0: ; CHECK1024-NEXT: sub sp, sp, #1152 ; CHECK1024-NEXT: .cfi_def_cfa_offset 1152 -; CHECK1024-NEXT: cntd x9 ; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill ; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill ; CHECK1024-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill -; CHECK1024-NEXT: str x22, [sp, #1112] // 8-byte Folded Spill -; CHECK1024-NEXT: str x21, [sp, #1120] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1128] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1136] // 8-byte Folded Spill -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w21, -32 -; CHECK1024-NEXT: .cfi_offset w22, -40 +; CHECK1024-NEXT: str x22, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -24 +; CHECK1024-NEXT: 
.cfi_offset w20, -32 +; CHECK1024-NEXT: .cfi_offset w21, -40 +; CHECK1024-NEXT: .cfi_offset w22, -48 ; CHECK1024-NEXT: .cfi_offset w30, -56 ; CHECK1024-NEXT: .cfi_offset w29, -64 ; CHECK1024-NEXT: .cfi_offset b8, -1096 @@ -1777,15 +1776,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK1024-NEXT: .cfi_offset b15, -1152 ; CHECK1024-NEXT: sub sp, sp, #1088 ; CHECK1024-NEXT: .cfi_def_cfa_offset 2240 +; CHECK1024-NEXT: cntd x8 ; CHECK1024-NEXT: mov w19, w1 ; CHECK1024-NEXT: mov w20, w0 ; CHECK1024-NEXT: str q3, [sp, #1072] // 16-byte Folded Spill ; CHECK1024-NEXT: str q2, [sp, #1056] // 16-byte Folded Spill ; CHECK1024-NEXT: str q1, [sp, #1040] // 16-byte Folded Spill ; CHECK1024-NEXT: str q0, [sp, #1024] // 16-byte Folded Spill +; CHECK1024-NEXT: str x8, [sp, #2224] ; CHECK1024-NEXT: bl __arm_sme_state ; CHECK1024-NEXT: and x21, x0, #0x1 -; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: .cfi_offset vg, -16 ; CHECK1024-NEXT: tbz w21, #0, .LBB27_2 ; CHECK1024-NEXT: // %bb.1: ; CHECK1024-NEXT: smstop sm @@ -1802,7 +1803,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK1024-NEXT: cset w21, lt ; CHECK1024-NEXT: bl __arm_sme_state ; CHECK1024-NEXT: and x22, x0, #0x1 -; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: .cfi_offset vg, -16 ; CHECK1024-NEXT: tbz w22, #0, .LBB27_6 ; CHECK1024-NEXT: // %bb.5: ; CHECK1024-NEXT: smstop sm @@ -1822,11 +1823,11 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 ; CHECK1024-NEXT: add sp, sp, #1088 ; CHECK1024-NEXT: .cfi_def_cfa_offset 1152 ; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr x19, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload ; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1128] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x21, [sp, #1120] // 
8-byte Folded Reload -; CHECK1024-NEXT: ldr x22, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1104] // 8-byte Folded Reload ; CHECK1024-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload ; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload ; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -1858,14 +1859,13 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { ; CHECK0-LABEL: svecc_call: ; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; CHECK0-NEXT: str x29, [sp, #-48]! // 8-byte Folded Spill ; CHECK0-NEXT: .cfi_def_cfa_offset 48 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill -; CHECK0-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w27, -16 -; CHECK0-NEXT: .cfi_offset w28, -24 +; CHECK0-NEXT: stp x30, x28, [sp, #8] // 16-byte Folded Spill +; CHECK0-NEXT: stp x27, x19, [sp, #24] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 ; CHECK0-NEXT: .cfi_offset w30, -40 ; CHECK0-NEXT: .cfi_offset w29, -48 ; CHECK0-NEXT: addvl sp, sp, #-18 @@ -1906,12 +1906,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG ; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG ; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; 
CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: addvl x10, sp, #18 ; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: str x9, [x10, #40] ; CHECK0-NEXT: //APP ; CHECK0-NEXT: //NO_APP ; CHECK0-NEXT: bl __arm_sme_state ; CHECK0-NEXT: and x19, x0, #0x1 -; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: .cfi_offset vg, -8 ; CHECK0-NEXT: tbz w19, #0, .LBB28_2 ; CHECK0-NEXT: // %bb.1: // %entry ; CHECK0-NEXT: smstop sm @@ -1965,9 +1968,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK0-NEXT: .cfi_restore z13 ; CHECK0-NEXT: .cfi_restore z14 ; CHECK0-NEXT: .cfi_restore z15 -; CHECK0-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload -; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload -; CHECK0-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK0-NEXT: ldp x27, x19, [sp, #24] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x30, x28, [sp, #8] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload ; CHECK0-NEXT: .cfi_def_cfa_offset 0 ; CHECK0-NEXT: .cfi_restore w19 ; CHECK0-NEXT: .cfi_restore w27 @@ -1980,13 +1983,12 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK64: // %bb.0: // %entry ; CHECK64-NEXT: sub sp, sp, #112 ; CHECK64-NEXT: .cfi_def_cfa_offset 112 -; CHECK64-NEXT: cntd x9 ; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x19, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_offset w19, -8 -; CHECK64-NEXT: .cfi_offset w27, -16 -; CHECK64-NEXT: .cfi_offset w28, -24 +; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset w30, -40 ; CHECK64-NEXT: .cfi_offset w29, -48 ; CHECK64-NEXT: addvl sp, sp, #-18 @@ -2029,12 +2031,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 
%P2, %P3, ; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG ; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x01, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 176 + 144 * VG +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: addvl x10, sp, #18 ; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: str x9, [x10, #168] ; CHECK64-NEXT: //APP ; CHECK64-NEXT: //NO_APP ; CHECK64-NEXT: bl __arm_sme_state ; CHECK64-NEXT: and x19, x0, #0x1 -; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: .cfi_offset vg, -8 ; CHECK64-NEXT: tbz w19, #0, .LBB28_2 ; CHECK64-NEXT: // %bb.1: // %entry ; CHECK64-NEXT: smstop sm @@ -2090,9 +2095,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK64-NEXT: .cfi_restore z13 ; CHECK64-NEXT: .cfi_restore z14 ; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: ldp x27, x19, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x27, x19, [sp, #88] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload ; CHECK64-NEXT: add sp, sp, #112 ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 @@ -2106,16 +2111,14 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK1024: // %bb.0: // %entry ; CHECK1024-NEXT: sub sp, sp, #1072 ; CHECK1024-NEXT: .cfi_def_cfa_offset 1072 -; CHECK1024-NEXT: cntd x9 ; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill ; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1064] // 8-byte 
Folded Spill -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w27, -16 -; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -16 +; CHECK1024-NEXT: .cfi_offset w27, -24 +; CHECK1024-NEXT: .cfi_offset w28, -32 ; CHECK1024-NEXT: .cfi_offset w30, -40 ; CHECK1024-NEXT: .cfi_offset w29, -48 ; CHECK1024-NEXT: addvl sp, sp, #-18 @@ -2158,12 +2161,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG ; CHECK1024-NEXT: sub sp, sp, #1024 ; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: addvl x10, sp, #18 ; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: str x9, [x10, #2088] ; CHECK1024-NEXT: //APP ; CHECK1024-NEXT: //NO_APP ; CHECK1024-NEXT: bl __arm_sme_state ; CHECK1024-NEXT: and x19, x0, #0x1 -; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: .cfi_offset vg, -8 ; CHECK1024-NEXT: tbz w19, #0, .LBB28_2 ; CHECK1024-NEXT: // %bb.1: // %entry ; CHECK1024-NEXT: smstop sm @@ -2219,9 +2225,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK1024-NEXT: .cfi_restore z13 ; CHECK1024-NEXT: .cfi_restore z14 ; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload ; 
CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload ; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload ; CHECK1024-NEXT: add sp, sp, #1072 @@ -2241,14 +2247,13 @@ entry: define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { ; CHECK0-LABEL: svecc_alloca_call: ; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; CHECK0-NEXT: str x29, [sp, #-48]! // 8-byte Folded Spill ; CHECK0-NEXT: .cfi_def_cfa_offset 48 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill -; CHECK0-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w27, -16 -; CHECK0-NEXT: .cfi_offset w28, -24 +; CHECK0-NEXT: stp x30, x28, [sp, #8] // 16-byte Folded Spill +; CHECK0-NEXT: stp x27, x19, [sp, #24] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 ; CHECK0-NEXT: .cfi_offset w30, -40 ; CHECK0-NEXT: .cfi_offset w29, -48 ; CHECK0-NEXT: addvl sp, sp, #-18 @@ -2291,11 +2296,14 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, i32 %P1, i32 %P2, %P0, ptr %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: svecc_call: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #-48]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w27, -16 -; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: stp x30, x28, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: stp x27, x19, [sp, #24] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w27, -24 +; CHECK-NEXT: .cfi_offset w28, -32 ; CHECK-NEXT: .cfi_offset w30, -40 ; CHECK-NEXT: .cfi_offset w29, -48 ; CHECK-NEXT: addvl sp, sp, #-18 @@ -420,12 +419,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: addvl x10, sp, #18 ; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: str x9, [x10, #40] ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset vg, -8 ; CHECK-NEXT: tbz w19, #0, .LBB7_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm @@ -479,9 +481,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK-NEXT: .cfi_restore z13 ; CHECK-NEXT: .cfi_restore z14 ; CHECK-NEXT: .cfi_restore z15 -; CHECK-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x27, x19, [sp, #24] // 16-byte Folded Reload 
+; CHECK-NEXT: ldp x30, x28, [sp, #8] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 ; CHECK-NEXT: .cfi_restore w27 @@ -500,9 +502,9 @@ declare ptr @memset(ptr, i32, i32) ; objects, we emit correct offsets for all objects except for these VLA objects. ; CHECK-FRAMELAYOUT-LABEL: Function: vastate -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8 @@ -521,17 +523,15 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: .cfi_def_cfa w29, 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w19, -24 +; CHECK-NEXT: .cfi_offset w20, -32 ; CHECK-NEXT: .cfi_offset w30, -40 ; CHECK-NEXT: .cfi_offset w29, -48 ; CHECK-NEXT: .cfi_offset b8, -56 @@ -547,6 +547,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w20, w0 ; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: cntd x10 +; CHECK-NEXT: str x10, [x29, #32] ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-80] ; CHECK-NEXT: sub x9, x29, #80 @@ -554,7 +556,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: stur wzr, [x29, #-68] ; CHECK-NEXT: sturh w8, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset vg, -16 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl other ; CHECK-NEXT: smstart sm @@ -570,7 +572,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: sub sp, x29, #64 ; CHECK-NEXT: .cfi_def_cfa wsp, 112 -; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, 
#32] // 16-byte Folded Reload