[AArch64][SME] Support aarch64-split-sve-objects with VLAs/realignment
#163816
Conversation
This was left out of the original patch (llvm#142392) to simplify the initial implementation. However, after refactoring the SVE prologue/epilogue code in llvm#162253, it's not much of an extension to support this case.

The main change here is that when restoring the SP from the FP for the SVE restores, we may need an additional frame offset to move from the start of the ZPR callee-saves to the start of the PPR callee-saves.

This patch also fixes a previously latent bug where we'd add the `RealignmentPadding` when allocating the PPR locals, then again for the ZPR locals. This was unnecessary, as the stack only needs to be realigned after all SVE allocations.
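For illustration, here is a small standalone sketch of the new FP-based offset arithmetic. The `Offset` struct is a hypothetical stand-in for LLVM's `StackOffset`, and all sizes are made up; it only shows how the extra hop between the ZPR and PPR callee-save restores falls out of the split layout (PPR locals sitting above the ZPR callee-saves).

```cpp
#include <cstdint>
#include <iostream>

// Simplified stand-in for StackOffset: a fixed byte part plus a scalable part
// (bytes that get multiplied by the runtime SVE scale).
struct Offset {
  int64_t Fixed = 0;
  int64_t Scalable = 0;
};
Offset operator+(Offset A, Offset B) { return {A.Fixed + B.Fixed, A.Scalable + B.Scalable}; }
Offset operator-(Offset A, Offset B) { return {A.Fixed - B.Fixed, A.Scalable - B.Scalable}; }
Offset operator-(Offset A) { return {-A.Fixed, -A.Scalable}; }

int main() {
  // Illustrative sizes only.
  Offset ZPRCalleeSaves{0, 48};      // e.g. z8-z10 spilled
  Offset PPRCalleeSaves{0, 16};      // predicate spill area, rounded up
  Offset PPRLocals{0, 16};           // PPR locals live above the ZPR callee-saves
  Offset CSBaseToFrameRecord{16, 0}; // fixed gap from the FP to the CS base

  // FP-relative offsets to the start of the ZPR and PPR callee-save areas.
  Offset FPToZPRCS = -(ZPRCalleeSaves + PPRCalleeSaves) - CSBaseToFrameRecord;
  Offset FPToPPRCS = FPToZPRCS + ZPRCalleeSaves;
  // Split layout: the PPR locals sit between the two callee-save areas, so
  // the ZPR callee-saves start that much further below the FP.
  FPToZPRCS = FPToZPRCS - PPRLocals;

  // The additional frame offset the epilogue now emits between the ZPR
  // restores and the PPR restores.
  Offset Hop = FPToPPRCS - FPToZPRCS; // == ZPRCalleeSaves + PPRLocals
  std::cout << "hop: fixed=" << Hop.Fixed
            << " scalable=" << Hop.Scalable << "\n"; // hop: fixed=0 scalable=64
}
```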
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This was left out of the original patch (#142392) to simplify the initial implementation. However, after refactoring the SVE prologue/epilogue code in #162253, it's not much of an extension to support this case. The main change here is that when restoring the SP from the FP for the SVE restores, we may need an additional frame offset to move from the start of the ZPR callee-saves to the start of the PPR callee-saves. This patch also fixes a previously latent bug where we'd add the `RealignmentPadding` when allocating the PPR locals, then again for the ZPR locals. This was unnecessary, as the stack only needs to be realigned after all SVE allocations.

Patch is 158.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163816.diff

6 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c76689f47d91c..115d7b1b6de8f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2373,13 +2373,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
return;
}
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
- LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
- "sized objects or realignment\n");
- return;
- }
-
// If another calling convention is explicitly set FPRs can't be promoted to
// ZPR callee-saves.
if (!is_contained({CallingConv::C, CallingConv::Fast,
@@ -2395,6 +2388,7 @@ void AArch64FrameLowering::determineStackHazardSlot(
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Expected SVE to be available for PPRs");
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
// With SplitSVEObjects the CS hazard padding is placed between the
// PPRs and ZPRs. If there are any FPR CS there would be a hazard between
// them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 7e03b97584fe1..9f6c22b24f2a9 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -793,10 +793,9 @@ void AArch64PrologueEmitter::emitPrologue() {
CFAOffset += AllocateBeforePPRs;
assert(PPRRange.End == ZPRRange.Begin &&
"Expected ZPR callee saves after PPR locals");
- allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPR.LocalsSize ||
- NonSVELocalsSize);
+ allocateStackSpace(
+ PPRRange.End, 0, AllocateAfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset,
+ MFI.hasVarSizedObjects() || ZPR.LocalsSize || NonSVELocalsSize);
CFAOffset += AllocateAfterPPRs;
} else {
assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord);
@@ -1308,6 +1307,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF,
SEHEpilogueStartI = MBB.end();
}
+void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI,
+ StackOffset Offset) {
+ // Other combinations could be supported, but are not currently needed.
+ assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 &&
+ "expected negative offset (with optional fixed portion)");
+ Register Base = AArch64::FP;
+ if (int64_t FixedOffset = Offset.getFixed()) {
+ // If we have a negative fixed offset, we need to subtract it in a
+ // temporary register first (to avoid briefly deallocating the scalable
+ // portion of the offset).
+ Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP,
+ StackOffset::getFixed(FixedOffset), TII,
+ MachineInstr::FrameDestroy);
+ }
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base,
+ StackOffset::getScalable(Offset.getScalable()), TII,
+ MachineInstr::FrameDestroy);
+}
+
void AArch64EpilogueEmitter::emitEpilogue() {
MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr();
if (MBB.end() != EpilogueEndI) {
@@ -1408,6 +1427,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
AfterCSRPopSize += ProloguePopSize;
}
}
+
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1474,8 +1494,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
StackOffset SVEStackSize =
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
- MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
- MachineBasicBlock::iterator RestoreEnd = PPRRange.End;
// Deallocate the SVE area.
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
@@ -1490,7 +1508,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
}
// Deallocate callee-save SVE registers.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
NeedsWinCFI, &HasWinCFI);
} else if (AFI->hasSVEStackSize()) {
@@ -1501,28 +1519,26 @@ void AArch64EpilogueEmitter::emitEpilogue() {
(AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
: AArch64::SP;
if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
- // TODO: Support stack realigment and variable-sized objects.
- assert(
- SVELayout != SVEStackLayout::Split &&
- "unexpected stack realignment or variable sized objects with split "
- "SVE stack objects");
-
- Register CalleeSaveBase = AArch64::FP;
- if (int64_t CalleeSaveBaseOffset =
- AFI->getCalleeSaveBaseToFrameRecordOffset()) {
- // If we have have an non-zero offset to the non-SVE CS base we need to
- // compute the base address by subtracting the offest in a temporary
- // register first (to avoid briefly deallocating the SVE CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
- &AArch64::GPR64RegClass);
- emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
- StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
+ // The offset from the frame-pointer to the start of the ZPR/PPR CSRs.
+ StackOffset FPOffsetZPRCSRs =
+ -SVECalleeSavesSize -
+ StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset());
+ StackOffset FPOffsetPPRCSRs = FPOffsetZPRCSRs + ZPR.CalleeSavesSize;
+
+ // With split SVE, the PPR locals are above the ZPR callee-saves.
+ if (ZPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split)
+ FPOffsetZPRCSRs -= PPR.LocalsSize;
+
+ // The code below will deallocate the stack space by moving the SP
+ // to the start of the ZPR/PPR callee-save area.
+ moveSPBelowFP(ZPRRange.Begin, FPOffsetZPRCSRs);
+
+ if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) {
+ // Move to the start of the PPR area (this offset may be zero).
+ emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::SP,
+ FPOffsetPPRCSRs - FPOffsetZPRCSRs, TII,
MachineInstr::FrameDestroy);
}
- // The code below will deallocate the stack space space by moving the SP
- // to the start of the SVE callee-save area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
- -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
} else if (BaseForSVEDealloc == AArch64::SP) {
auto CFAOffset =
SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
index bccaddaad9eec..029e607253b7f 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -174,6 +174,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon {
private:
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
+ /// A helper for moving the SP to a negative offset from the FP, without
+ /// deallocating any stack in the range FP to FP + Offset.
+ void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset);
+
void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI,
const DebugLoc &DL) const;
diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
index 35eafe8b7d99c..318710ccc80a0 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
@@ -182,63 +182,56 @@ body: |
RET_ReallyLR
# CHECK-LABEL: name: test_allocate_split_sve_realigned
-# CHECK: stackSize: 2080
+# CHECK: stackSize: 1056
# CHECK: bb.0.entry:
# CHECK: liveins: $z0, $p0, $lr
-# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
-# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
-# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
-# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
+# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
-# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
-# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930
#
# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
-# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
-# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
-# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1)
#
-# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
-# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
-# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4)
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
# CHECK-NEXT: RET_ReallyLR
# ASM-LABEL: test_allocate_split_sve_realigned
-# ASM: sub sp, sp, #1040
-# ASM-NEXT: .cfi_def_cfa_offset 1040
-# ASM-NEXT: str x29, [sp, #1024]
-# ASM-NEXT: str x30, [sp, #1032]
-# ASM-NEXT: add x29, sp, #1024
+# ASM: stp x29, x30, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: mov x29, sp
# ASM-NEXT: .cfi_def_cfa w29, 16
# ASM-NEXT: .cfi_offset w30, -8
# ASM-NEXT: .cfi_offset w29, -16
#
-# ASM: sub sp, x29, #1024
-# ASM-NEXT: .cfi_def_cfa wsp, 1040
-# ASM-NEXT: ldr x30, [sp, #1032]
-# ASM-NEXT: ldr x29, [sp, #1024]
-# ASM-NEXT: add sp, sp, #1040
+# ASM: mov sp, x29
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldp x29, x30, [sp], #16
# ASM-NEXT: .cfi_def_cfa_offset 0
# ASM-NEXT: .cfi_restore w30
# ASM-NEXT: .cfi_restore w29
-# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
#
-# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
# UNWINDINFO: DW_CFA_def_cfa_offset: +0
# UNWINDINFO-NEXT: DW_CFA_restore: reg30
# UNWINDINFO-NEXT: DW_CFA_restore: reg29
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
index 690a39d12e6f1..d9474c6f00dce 100644
--- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -749,36 +749,25 @@ entry:
}
declare ptr @memset(ptr, i32, i32)
-; FIXME: aarch64-split-sve-objects is currently not supported in this function
-; as it requires stack reealignment (for the 32-byte aligned alloca).
-; GPR CSRs
-; <hazard padding>
-; FPR CSRs
-; <hazrd padding>
-; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here!
-; <realignment padding>
-; -> sp
define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: zpr_and_ppr_local_realignment:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: sub x9, sp, #1040
-; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK-NEXT: add x29, sp, #1024
-; CHECK-NEXT: addvl x9, x9, #-2
-; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK-NEXT: addvl x9, x9, #-1
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: sub x8, x29, #1024
-; CHECK-NEXT: str p0, [x8, #-1, mul vl]
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
; CHECK-NEXT: str z0, [x8, #-2, mul vl]
; CHECK-NEXT: str x0, [sp]
-; CHECK-NEXT: sub sp, x29, #1024
-; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%ppr_local = alloca <vscale x 16 x i1>
%zpr_local = alloca <vscale x 16 x i8>
@@ -822,3 +811,303 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x
store volatile i64 %gpr, ptr %gpr_local
ret void
}
+
+; Only PPR callee-saves + a VLA
+define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) {
+; CHECK-LABEL: only_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"()
+ ret void
+}
+
+; Only ZPR callee-saves + a VLA
+define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) {
+; CHECK-LABEL: only_zpr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1024
+; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x8, #-3
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: sub sp, x29, #1024
+; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+; PPR+ZPR callee-saves + a VLA
+define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) {
+; CHECK-LABEL: zpr_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK...
[truncated]
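As a side note on the `RealignmentPadding` change in the first `AArch64PrologueEpilogue.cpp` hunk above: the following toy model (hypothetical `realignDown` helper, made-up sizes, and a deliberately simplified view of the padding accounting) illustrates why reserving the realignment padding once, below the final SVE allocation, is sufficient.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helper: round an address down to an Align-byte boundary, like
// the `and sp, x9, #0xffffffffffffffe0` emitted for a 32-byte realignment.
uint64_t realignDown(uint64_t Addr, uint64_t Align) { return Addr & ~(Align - 1); }

int main() {
  uint64_t SP = 0x7ffffff000; // pretend incoming (16-byte aligned) SP
  uint64_t PPRLocals = 16, ZPRLocals = 32, Align = 32;

  // Old accounting: padding reserved for the PPR locals and again for the
  // ZPR locals (the latent bug described above).
  uint64_t OldReserve = PPRLocals + (Align - 1) + ZPRLocals + (Align - 1);
  // Fixed accounting: one reservation after all SVE allocations.
  uint64_t NewReserve = PPRLocals + ZPRLocals + (Align - 1);

  uint64_t NewSP = realignDown(SP - NewReserve, Align);
  assert(NewSP % Align == 0);                  // SP ends up realigned
  assert(NewSP + PPRLocals + ZPRLocals <= SP); // all SVE locals still fit
  assert(NewReserve < OldReserve);             // the second pad was never needed
  return 0;
}
```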
@@ -3512,14 +3512,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
;
; CHECK64-LABEL: svecc_call_dynamic_alloca:
Quite a lot of churn here, not really sure how helpful these tests are... I've created smaller test cases in split-sve-stack-frame-layout.ll that should be easier to follow.