Skip to content

Commit 12f392c

Browse files
authored
[AArch64][SME] Support aarch64-split-sve-objects with VLAs/realignment (#163816)
This was left out of the original patch (#142392) to simplify the initial implementation. However, after refactoring the SVE prologue/epilogue code in #162253, it's not much of an extension to support this case. The main change here is when restoring the SP from the FP for the SVE restores, we may need an additional frame offset to move from the start of the ZPR callee-saves to the start of the PPR callee-saves. This patch also fixes a previously latent bug where we'd add the `RealignmentPadding` when allocating the PPR locals, then again for the ZPR locals. This was unnecessary as the stack only needs to be realigned after all SVE allocations.
1 parent 89c2617 commit 12f392c

File tree

6 files changed

+1491
-799
lines changed

6 files changed

+1491
-799
lines changed

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
23802380
return;
23812381
}
23822382

2383-
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2384-
if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
2385-
LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
2386-
"sized objects or realignment\n");
2387-
return;
2388-
}
2389-
23902383
// If another calling convention is explicitly set FPRs can't be promoted to
23912384
// ZPR callee-saves.
23922385
if (!is_contained({CallingConv::C, CallingConv::Fast,
@@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot(
24022395
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
24032396
"Expected SVE to be available for PPRs");
24042397

2398+
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
24052399
// With SplitSVEObjects the CS hazard padding is placed between the
24062400
// PPRs and ZPRs. If there are any FPR CS there would be a hazard between
24072401
// them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp

Lines changed: 40 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() {
805805
CFAOffset += SVEAllocs.BeforePPRs;
806806
assert(PPRRange.End == ZPRRange.Begin &&
807807
"Expected ZPR callee saves after PPR locals");
808-
allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs,
808+
allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs,
809809
EmitAsyncCFI && !HasFP, CFAOffset,
810810
MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs);
811811
CFAOffset += SVEAllocs.AfterPPRs;
@@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF,
13181318
SEHEpilogueStartI = MBB.end();
13191319
}
13201320

1321+
void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI,
1322+
StackOffset Offset) {
1323+
// Other combinations could be supported, but are not currently needed.
1324+
assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 &&
1325+
"expected negative offset (with optional fixed portion)");
1326+
Register Base = AArch64::FP;
1327+
if (int64_t FixedOffset = Offset.getFixed()) {
1328+
// If we have a negative fixed offset, we need to subtract it in a
1329+
// temporary register first (to avoid briefly deallocating the scalable
1330+
// portion of the offset).
1331+
Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
1332+
emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP,
1333+
StackOffset::getFixed(FixedOffset), TII,
1334+
MachineInstr::FrameDestroy);
1335+
}
1336+
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base,
1337+
StackOffset::getScalable(Offset.getScalable()), TII,
1338+
MachineInstr::FrameDestroy);
1339+
}
1340+
13211341
void AArch64EpilogueEmitter::emitEpilogue() {
13221342
MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr();
13231343
if (MBB.end() != EpilogueEndI) {
@@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
14181438
AfterCSRPopSize += ProloguePopSize;
14191439
}
14201440
}
1441+
14211442
// Move past the restores of the callee-saved registers.
14221443
// If we plan on combining the sp bump of the local stack size and the callee
14231444
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
14831504

14841505
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
14851506
SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR});
1486-
MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
14871507

14881508
// Deallocate the SVE area.
14891509
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
@@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() {
15101530
(AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
15111531
: AArch64::SP;
15121532
if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
1513-
// TODO: Support stack realigment and variable-sized objects.
1514-
assert(
1515-
SVELayout != SVEStackLayout::Split &&
1516-
"unexpected stack realignment or variable sized objects with split "
1517-
"SVE stack objects");
1518-
1519-
Register CalleeSaveBase = AArch64::FP;
1520-
if (int64_t CalleeSaveBaseOffset =
1521-
AFI->getCalleeSaveBaseToFrameRecordOffset()) {
1522-
// If we have have an non-zero offset to the non-SVE CS base we need to
1523-
// compute the base address by subtracting the offest in a temporary
1524-
// register first (to avoid briefly deallocating the SVE CS).
1525-
CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
1526-
&AArch64::GPR64RegClass);
1527-
emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
1528-
StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
1529-
MachineInstr::FrameDestroy);
1533+
if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) {
1534+
// The offset from the frame-pointer to the start of the ZPR saves.
1535+
StackOffset FPOffsetZPR =
1536+
-SVECalleeSavesSize - PPR.LocalsSize -
1537+
StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset());
1538+
// Deallocate the stack space by moving the SP to the start of the
1539+
// ZPR/PPR callee-save area.
1540+
moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR);
1541+
}
1542+
// With split SVE, the predicates are stored in a separate area above the
1543+
// ZPR saves, so we must adjust the stack to the start of the PPRs.
1544+
if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) {
1545+
// The offset from the frame-pointer to the start of the PPR saves.
1546+
StackOffset FPOffsetPPR = -PPR.CalleeSavesSize;
1547+
// Move to the start of the PPR area.
1548+
assert(!FPOffsetPPR.getFixed() && "expected only scalable offset");
1549+
emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP,
1550+
FPOffsetPPR, TII, MachineInstr::FrameDestroy);
15301551
}
1531-
// The code below will deallocate the stack space space by moving the SP
1532-
// to the start of the SVE callee-save area.
1533-
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
1534-
-SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
15351552
} else if (BaseForSVEDealloc == AArch64::SP) {
15361553
auto NonSVELocals = StackOffset::getFixed(NumBytes);
15371554
auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) +

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon {
180180
private:
181181
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
182182

183+
/// A helper for moving the SP to a negative offset from the FP, without
184+
/// deallocating any stack in the range FP to FP + Offset.
185+
void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset);
186+
183187
void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI,
184188
const DebugLoc &DL) const;
185189

llvm/test/CodeGen/AArch64/framelayout-split-sve.mir

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -162,63 +162,54 @@ body: |
162162
RET_ReallyLR
163163
164164
# CHECK-LABEL: name: test_allocate_split_sve_realigned
165-
# CHECK: stackSize: 2080
165+
# CHECK: stackSize: 1056
166166

167167
# CHECK: bb.0.entry:
168168
# CHECK: liveins: $z0, $p0, $lr
169-
# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
170-
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
171-
# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
172-
# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
173-
# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
169+
# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4)
170+
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
171+
# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
174172
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
175173
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
176174
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
177-
# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
178-
# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
179-
# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
175+
# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0
176+
# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg
177+
# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930
180178
#
181179
# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
182180
# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
183-
# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
184-
# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
185-
# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
181+
# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0)
182+
# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1)
186183
#
187-
# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
188-
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
189-
# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
190-
# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
191-
# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
184+
# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
185+
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
186+
# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4)
192187
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
193188
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
194189
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
195190
# CHECK-NEXT: RET_ReallyLR
196191

197192
# ASM-LABEL: test_allocate_split_sve_realigned
198-
# ASM: sub sp, sp, #1040
199-
# ASM-NEXT: .cfi_def_cfa_offset 1040
200-
# ASM-NEXT: str x29, [sp, #1024]
201-
# ASM-NEXT: str x30, [sp, #1032]
202-
# ASM-NEXT: add x29, sp, #1024
193+
# ASM: stp x29, x30, [sp, #-16]!
194+
# ASM-NEXT: .cfi_def_cfa_offset 16
195+
# ASM-NEXT: mov x29, sp
203196
# ASM-NEXT: .cfi_def_cfa w29, 16
204197
# ASM-NEXT: .cfi_offset w30, -8
205198
# ASM-NEXT: .cfi_offset w29, -16
206199
#
207-
# ASM: sub sp, x29, #1024
208-
# ASM-NEXT: .cfi_def_cfa wsp, 1040
209-
# ASM-NEXT: ldr x30, [sp, #1032]
210-
# ASM-NEXT: ldr x29, [sp, #1024]
211-
# ASM-NEXT: add sp, sp, #1040
200+
# ASM: mov sp, x29
201+
# ASM-NEXT: .cfi_def_cfa wsp, 16
202+
# ASM-NEXT: ldp x29, x30, [sp], #16
212203
# ASM-NEXT: .cfi_def_cfa_offset 0
213204
# ASM-NEXT: .cfi_restore w30
214205
# ASM-NEXT: .cfi_restore w29
215206

216-
# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
207+
# UNWINDINFO: DW_CFA_def_cfa_offset: +16
217208
# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
218209
# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
219210
# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
220211
#
221-
# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
212+
# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
222213
# UNWINDINFO: DW_CFA_def_cfa_offset: +0
223214
# UNWINDINFO-NEXT: DW_CFA_restore: reg30
224215
# UNWINDINFO-NEXT: DW_CFA_restore: reg29

0 commit comments

Comments
 (0)