Skip to content

Commit 33609bd

Browse files
authored
[AArch64][SVE] Coalesce SVE prologue/epilogue stack adjustments (#163956)
With split SVE, it is possible to have multiple stack adjustments at the same location. Previously, these were all handled separately, which could result in more stack adjustments than necessary. This patch reworks the prologue/epilogue to group stack adjustments when possible. A nice side effect is that the prologue and epilogue code now follow a much more similar structure.
1 parent 5f3f175 commit 33609bd

File tree

4 files changed

+138
-200
lines changed

4 files changed

+138
-200
lines changed

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp

Lines changed: 73 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,22 @@ SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const {
370370
{ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}};
371371
}
372372

373+
/// Regroups the per-register-class SVE frame sizes in \p SVE into the three
/// stack adjustments (before the PPR callee saves, after the PPR callee saves,
/// and after the ZPR callee saves) shared by the prologue and the epilogue.
///
/// Outside of the split layout, all SVE callee saves are covered by the first
/// adjustment and nothing is allocated between the PPR and ZPR saves. With the
/// split layout the PPR locals and the ZPR callee saves share the middle
/// adjustment, unless there are no ZPR callee saves, in which case the PPR
/// locals are folded into the final adjustment.
SVEStackAllocations AArch64PrologueEpilogueCommon::getSVEStackAllocations(
    SVEFrameSizes const &SVE) {
  if (SVELayout != SVEStackLayout::Split)
    return {SVE.ZPR.CalleeSavesSize + SVE.PPR.CalleeSavesSize, StackOffset{},
            SVE.ZPR.LocalsSize};

  SVEStackAllocations Allocs{};
  Allocs.BeforePPRs = SVE.PPR.CalleeSavesSize;
  Allocs.AfterZPRs = SVE.ZPR.LocalsSize;
  if (SVE.ZPR.CalleeSavesSize) {
    // ZPR callee saves exist: they are allocated together with the PPR locals
    // in the adjustment that follows the PPR callee saves.
    Allocs.AfterPPRs = SVE.PPR.LocalsSize + SVE.ZPR.CalleeSavesSize;
  } else {
    // No ZPR callee saves: group every local allocation after the ZPRs.
    Allocs.AfterZPRs += SVE.PPR.LocalsSize;
  }
  return Allocs;
}
388+
373389
struct SVEPartitions {
374390
struct {
375391
MachineBasicBlock::iterator Begin, End;
@@ -687,16 +703,19 @@ void AArch64PrologueEmitter::emitPrologue() {
687703
// All of the remaining stack allocations are for locals.
688704
determineLocalsStackSize(NumBytes, PrologueSaveSize);
689705

706+
auto [PPR, ZPR] = getSVEStackFrameSizes();
707+
SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR});
708+
690709
MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI;
691710
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
711+
assert(!SVEAllocs.AfterPPRs &&
712+
"unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord");
692713
// If we're doing SVE saves first, we need to immediately allocate space
693714
// for fixed objects, then space for the SVE callee saves.
694715
//
695716
// Windows unwind requires that the scalable size is a multiple of 16;
696717
// that's handled when the callee-saved size is computed.
697-
auto SaveSize =
698-
StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) +
699-
StackOffset::getFixed(FixedObject);
718+
auto SaveSize = SVEAllocs.BeforePPRs + StackOffset::getFixed(FixedObject);
700719
allocateStackSpace(PrologueBeginI, 0, SaveSize, false, StackOffset{},
701720
/*FollowupAllocs=*/true);
702721
NumBytes -= FixedObject;
@@ -764,12 +783,11 @@ void AArch64PrologueEmitter::emitPrologue() {
764783
if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
765784
emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
766785

767-
auto [PPR, ZPR] = getSVEStackFrameSizes();
768-
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
769786
StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes);
787+
SVEAllocs.AfterZPRs += NonSVELocalsSize;
788+
770789
StackOffset CFAOffset =
771790
StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize;
772-
773791
MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
774792
// Allocate space for the callee saves and PPR locals (if any).
775793
if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) {
@@ -780,31 +798,23 @@ void AArch64PrologueEmitter::emitPrologue() {
780798
if (EmitAsyncCFI)
781799
emitCalleeSavedSVELocations(AfterSVESavesI);
782800

783-
StackOffset AllocateBeforePPRs = SVECalleeSavesSize;
784-
StackOffset AllocateAfterPPRs = PPR.LocalsSize;
785-
if (SVELayout == SVEStackLayout::Split) {
786-
AllocateBeforePPRs = PPR.CalleeSavesSize;
787-
AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize;
788-
}
789-
allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs,
801+
allocateStackSpace(PPRRange.Begin, 0, SVEAllocs.BeforePPRs,
790802
EmitAsyncCFI && !HasFP, CFAOffset,
791-
MFI.hasVarSizedObjects() || AllocateAfterPPRs ||
792-
ZPR.LocalsSize || NonSVELocalsSize);
793-
CFAOffset += AllocateBeforePPRs;
803+
MFI.hasVarSizedObjects() || SVEAllocs.AfterPPRs ||
804+
SVEAllocs.AfterZPRs);
805+
CFAOffset += SVEAllocs.BeforePPRs;
794806
assert(PPRRange.End == ZPRRange.Begin &&
795807
"Expected ZPR callee saves after PPR locals");
796-
allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs,
808+
allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs,
797809
EmitAsyncCFI && !HasFP, CFAOffset,
798-
MFI.hasVarSizedObjects() || ZPR.LocalsSize ||
799-
NonSVELocalsSize);
800-
CFAOffset += AllocateAfterPPRs;
810+
MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs);
811+
CFAOffset += SVEAllocs.AfterPPRs;
801812
} else {
802813
assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord);
803-
// Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been
804-
// allocated (and separate PPR locals are not supported, all SVE locals,
805-
// both PPR and ZPR, are within the ZPR locals area).
806-
assert(!PPR.LocalsSize && "Unexpected PPR locals!");
807-
CFAOffset += SVECalleeSavesSize;
814+
// Note: With CalleeSavesAboveFrameRecord, the SVE CS (BeforePPRs) have
815+
// already been allocated. PPR locals (included in AfterPPRs) are not
816+
// supported (note: this is asserted above).
817+
CFAOffset += SVEAllocs.BeforePPRs;
808818
}
809819

810820
// Allocate space for the rest of the frame including ZPR locals. Align the
@@ -815,9 +825,9 @@ void AArch64PrologueEmitter::emitPrologue() {
815825
// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the
816826
// correct value here, as NumBytes also includes padding bytes, which
817827
// shouldn't be counted here.
818-
allocateStackSpace(
819-
AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize,
820-
EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects());
828+
allocateStackSpace(AfterSVESavesI, RealignmentPadding, SVEAllocs.AfterZPRs,
829+
EmitAsyncCFI && !HasFP, CFAOffset,
830+
MFI.hasVarSizedObjects());
821831
}
822832

823833
// If we need a base pointer, set it up here. It's whatever the value of the
@@ -1472,27 +1482,26 @@ void AArch64EpilogueEmitter::emitEpilogue() {
14721482
assert(NumBytes >= 0 && "Negative stack allocation size!?");
14731483

14741484
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
1475-
StackOffset SVEStackSize =
1476-
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
1485+
SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR});
14771486
MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
1478-
MachineBasicBlock::iterator RestoreEnd = PPRRange.End;
14791487

14801488
// Deallocate the SVE area.
14811489
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
1482-
StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize;
1490+
assert(!SVEAllocs.AfterPPRs &&
1491+
"unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord");
14831492
// If the callee-save area is before FP, restoring the FP implicitly
1484-
// deallocates non-callee-save SVE allocations. Otherwise, deallocate them
1493+
// deallocates non-callee-save SVE allocations. Otherwise, deallocate them
14851494
// explicitly.
14861495
if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
14871496
emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
1488-
SVELocalsSize, TII, MachineInstr::FrameDestroy, false,
1489-
NeedsWinCFI, &HasWinCFI);
1497+
SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy,
1498+
false, NeedsWinCFI, &HasWinCFI);
14901499
}
14911500

14921501
// Deallocate callee-save SVE registers.
1493-
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1494-
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
1495-
NeedsWinCFI, &HasWinCFI);
1502+
emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
1503+
SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy,
1504+
false, NeedsWinCFI, &HasWinCFI);
14961505
} else if (AFI->hasSVEStackSize()) {
14971506
// If we have stack realignment or variable-sized objects we must use the FP
14981507
// to restore SVE callee saves (as there is an unknown amount of
@@ -1524,46 +1533,33 @@ void AArch64EpilogueEmitter::emitEpilogue() {
15241533
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
15251534
-SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
15261535
} else if (BaseForSVEDealloc == AArch64::SP) {
1527-
auto CFAOffset =
1528-
SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
1529-
1530-
if (SVECalleeSavesSize) {
1531-
// Deallocate the non-SVE locals first before we can deallocate (and
1532-
// restore callee saves) from the SVE area.
1533-
auto NonSVELocals = StackOffset::getFixed(NumBytes);
1534-
emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
1535-
NonSVELocals, TII, MachineInstr::FrameDestroy, false,
1536-
NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
1537-
CFAOffset -= NonSVELocals;
1538-
NumBytes = 0;
1539-
}
1540-
1541-
if (ZPR.LocalsSize) {
1542-
emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
1543-
ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false,
1544-
NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
1545-
CFAOffset -= ZPR.LocalsSize;
1536+
auto NonSVELocals = StackOffset::getFixed(NumBytes);
1537+
auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) +
1538+
SVEAllocs.totalSize();
1539+
1540+
if (SVECalleeSavesSize || SVELayout == SVEStackLayout::Split) {
1541+
// Deallocate non-SVE locals now. This is needed to reach the SVE callee
1542+
// saves, but may also allow combining stack hazard bumps for split SVE.
1543+
SVEAllocs.AfterZPRs += NonSVELocals;
1544+
NumBytes -= NonSVELocals.getFixed();
15461545
}
1547-
1548-
StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize;
1549-
if (SVELayout == SVEStackLayout::Split &&
1550-
(PPR.LocalsSize || ZPR.CalleeSavesSize)) {
1551-
assert(PPRRange.Begin == ZPRRange.End &&
1552-
"Expected PPR restores after ZPR");
1553-
emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP,
1554-
PPR.LocalsSize + ZPR.CalleeSavesSize, TII,
1555-
MachineInstr::FrameDestroy, false, NeedsWinCFI,
1556-
&HasWinCFI, EmitCFI && !HasFP, CFAOffset);
1557-
CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize;
1558-
SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize;
1559-
}
1560-
1561-
// If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs:
1562-
if (SVECalleeSavesToDealloc)
1563-
emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
1564-
SVECalleeSavesToDealloc, TII,
1565-
MachineInstr::FrameDestroy, false, NeedsWinCFI,
1566-
&HasWinCFI, EmitCFI && !HasFP, CFAOffset);
1546+
// To deallocate the SVE stack adjust by the allocations in reverse.
1547+
emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
1548+
SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy,
1549+
false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
1550+
CFAOffset);
1551+
CFAOffset -= SVEAllocs.AfterZPRs;
1552+
assert(PPRRange.Begin == ZPRRange.End &&
1553+
"Expected PPR restores after ZPR");
1554+
emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP,
1555+
SVEAllocs.AfterPPRs, TII, MachineInstr::FrameDestroy,
1556+
false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
1557+
CFAOffset);
1558+
CFAOffset -= SVEAllocs.AfterPPRs;
1559+
emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
1560+
SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy,
1561+
false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
1562+
CFAOffset);
15671563
}
15681564

15691565
if (EmitCFI)

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ struct SVEFrameSizes {
3333
} PPR, ZPR;
3434
};
3535

36+
struct SVEStackAllocations {
37+
StackOffset BeforePPRs, AfterPPRs, AfterZPRs;
38+
StackOffset totalSize() const { return BeforePPRs + AfterPPRs + AfterZPRs; }
39+
};
40+
3641
class AArch64PrologueEpilogueCommon {
3742
public:
3843
AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -66,6 +71,7 @@ class AArch64PrologueEpilogueCommon {
6671
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
6772

6873
SVEFrameSizes getSVEStackFrameSizes() const;
74+
SVEStackAllocations getSVEStackAllocations(SVEFrameSizes const &);
6975

7076
MachineFunction &MF;
7177
MachineBasicBlock &MBB;

0 commit comments

Comments
 (0)