Skip to content

Commit e6e556b

Browse files
committed
[AArch64][SME] Support split ZPR and PPR area allocation
For a while we have supported the `-aarch64-stack-hazard-size=<size>` option, which adds "hazard padding" between GPRs and FPR/ZPRs. However, there is currently a hole in this mitigation as PPR and FPR/ZPR accesses to the same area also cause streaming memory hazards (this is noted by `-pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=<val>`), and the current stack layout places PPRs and ZPRs within the same area.

Which looks like:

    ------------------------------------  Higher address
    | callee-saved gpr registers        |
    |---------------------------------- |
    | lr,fp  (a.k.a. "frame record")    |
    |-----------------------------------| <- fp(=x29)
    |   <hazard padding>                |
    |-----------------------------------|
    | callee-saved fp/simd/SVE regs     |
    |-----------------------------------|
    |        SVE stack objects          |
    |-----------------------------------|
    | local variables of fixed size     |
    |   <FPR>                           |
    |   <hazard padding>                |
    |   <GPR>                           |
    ------------------------------------| <- sp
                                        | Lower address

With this patch the stack (and hazard padding) is rearranged so that hazard padding is placed between the PPRs and ZPRs rather than within the (fixed size) callee-save region.

Which looks something like this:

    ------------------------------------  Higher address
    | callee-saved gpr registers        |
    |---------------------------------- |
    | lr,fp  (a.k.a. "frame record")    |
    |-----------------------------------| <- fp(=x29)
    |        callee-saved PPRs          |
    |        PPR stack objects          | (These are SVE predicates)
    |-----------------------------------|
    |   <hazard padding>                |
    |-----------------------------------|
    |       callee-saved ZPR regs       | (These are SVE vectors)
    |        ZPR stack objects          | Note: FPRs are promoted to ZPRs
    |-----------------------------------|
    | local variables of fixed size     |
    |   <FPR>                           |
    |   <hazard padding>                |
    |   <GPR>                           |
    ------------------------------------| <- sp
                                        | Lower address

This layout is only enabled if:

* SplitSVEObjects are enabled (`-aarch64-split-sve-objects`)
  - (This may be enabled by default in a later patch)
* Streaming memory hazards are present
  - (`-aarch64-stack-hazard-size=<val>` != 0)
* PPRs and FPRs/ZPRs are on the stack
* There's no stack realignment or variable-sized objects
  - This is left as a TODO for now

Additionally, any FPR callee-saves that are present will be promoted to ZPRs. This is to prevent stack hazards between FPRs and GPRs in the fixed size callee-save area (which would otherwise require more hazard padding, or moving the FPR callee-saves).

This layout should resolve the hole in the hazard padding mitigation, and is not intended to change codegen for non-SME code.

Change-Id: I2e1906577c2ac79c40bc69e7c15e3ef09857445f
1 parent 271ed51 commit e6e556b

10 files changed

+2138
-403
lines changed

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1612,7 +1612,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
16121612
StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
16131613
MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
16141614
/*PreferFP=*/false,
1615-
/*ForSimm=*/true);
1615+
/*ForSimm=*/true,
1616+
/*FI=*/-1);
16161617
Register SrcReg = FrameReg;
16171618
if (FrameRegOffset) {
16181619
// Use output register as temporary.

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 193 additions & 50 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AArch64/AArch64FrameLowering.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
7070
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
7171
int64_t ObjectOffset, bool isFixed,
7272
bool isSVE, Register &FrameReg,
73-
bool PreferFP, bool ForSimm) const;
73+
bool PreferFP, bool ForSimm,
74+
int64_t FI) const;
7475
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
7576
MachineBasicBlock::iterator MI,
7677
ArrayRef<CalleeSavedInfo> CSI,
@@ -155,7 +156,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
155156
/// Returns the size of the entire ZPR stackframe (calleesaves + spills).
156157
StackOffset getZPRStackSize(const MachineFunction &MF) const;
157158

158-
/// Returns the size of the entire PPR stackframe (calleesaves + spills).
159+
/// Returns the size of the entire PPR stackframe (calleesaves + spills +
160+
/// hazard padding).
159161
StackOffset getPPRStackSize(const MachineFunction &MF) const;
160162

161163
/// Returns the size of the entire SVE stackframe (PPRs + ZPRs).

llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
137137
uint64_t StackSizeZPR = 0;
138138
uint64_t StackSizePPR = 0;
139139

140+
/// Are SVE objects (vectors and predicates) split into separate regions on
141+
/// the stack.
142+
bool SplitSVEObjects = false;
143+
140144
/// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid.
141145
bool HasCalculatedStackSizeSVE = false;
142146

@@ -336,7 +340,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
336340

337341
bool isStackRealigned() const { return StackRealigned; }
338342
void setStackRealigned(bool s) { StackRealigned = s; }
339-
340343
bool hasCalleeSaveStackFreeSpace() const {
341344
return CalleeSaveStackHasFreeSpace;
342345
}
@@ -481,7 +484,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
481484
StackHazardCSRSlotIndex = Index;
482485
}
483486

484-
bool hasSplitSVEObjects() const { return false; }
487+
bool hasSplitSVEObjects() const { return SplitSVEObjects; }
488+
void setSplitSVEObjects(bool s) { SplitSVEObjects = s; }
485489

486490
SMEAttrs getSMEFnAttrs() const { return SMEFnAttrs; }
487491

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp

Lines changed: 105 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,36 @@ void AArch64PrologueEmitter::emitPrologue() {
747747
emitCalleeSavedSVELocations(AfterSVESavesI);
748748

749749
if (AFI->hasSplitSVEObjects()) {
750-
reportFatalInternalError("not implemented yet");
750+
assert(!FPAfterSVECalleeSaves &&
751+
"Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects");
752+
assert(!AFL.canUseRedZone(MF) &&
753+
"Cannot use redzone with aarch64-split-sve-objects");
754+
// TODO: Handle HasWinCFI/NeedsWinCFI?
755+
assert(!NeedsWinCFI &&
756+
"WinCFI with aarch64-split-sve-objects is not supported");
757+
758+
// Split ZPR and PPR allocation.
759+
// Allocate PPR callee saves
760+
allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize,
761+
EmitAsyncCFI && !HasFP, CFAOffset,
762+
MFI.hasVarSizedObjects() || ZPRCalleeSavesSize ||
763+
ZPRLocalsSize || PPRLocalsSize);
764+
CFAOffset += PPRCalleeSavesSize;
765+
766+
// Allocate PPR locals + ZPR callee saves
767+
assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin &&
768+
"Expected ZPR callee saves after PPR locals");
769+
allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding,
770+
PPRLocalsSize + ZPRCalleeSavesSize,
771+
EmitAsyncCFI && !HasFP, CFAOffset,
772+
MFI.hasVarSizedObjects() || ZPRLocalsSize);
773+
CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize;
774+
775+
// Allocate ZPR locals
776+
allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding,
777+
ZPRLocalsSize + StackOffset::getFixed(NumBytes),
778+
EmitAsyncCFI && !HasFP, CFAOffset,
779+
MFI.hasVarSizedObjects());
751780
} else {
752781
// Allocate space for the callee saves (if any).
753782
StackOffset LocalsSize =
@@ -1214,8 +1243,10 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
12141243
AFL.getOffsetOfLocalArea();
12151244
}
12161245

1246+
StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
12171247
for (const auto &Info : CSI) {
1218-
if (!MFI.isScalableStackID(Info.getFrameIdx()))
1248+
int FI = Info.getFrameIdx();
1249+
if (!MFI.isScalableStackID(FI))
12191250
continue;
12201251

12211252
// Not all unwinders may know about SVE registers, so assume the lowest
@@ -1226,9 +1257,13 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
12261257
continue;
12271258

12281259
StackOffset Offset =
1229-
StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
1260+
StackOffset::getScalable(MFI.getObjectOffset(FI)) -
12301261
StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
12311262

1263+
if (AFI->hasSplitSVEObjects() &&
1264+
MFI.getStackID(FI) == TargetStackID::ScalableVector)
1265+
Offset -= PPRStackSize;
1266+
12321267
CFIBuilder.insertCFIInst(
12331268
createCFAOffset(RegInfo, Reg, Offset, IncomingVGOffsetFromDefCFA));
12341269
}
@@ -1505,7 +1540,73 @@ void AArch64EpilogueEmitter::emitEpilogue() {
15051540
emitCalleeSavedSVERestores(RestoreEnd);
15061541
}
15071542
} else if (AFI->hasSplitSVEObjects() && SVEStackSize) {
1508-
reportFatalInternalError("not implemented yet");
1543+
assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() &&
1544+
"TODO: Support stack realigment / variable-sized objects");
1545+
// SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR
1546+
// areas.
1547+
auto ZPRCalleeSavedSize =
1548+
StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
1549+
auto PPRCalleeSavedSize =
1550+
StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
1551+
StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize;
1552+
StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize;
1553+
1554+
MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI,
1555+
PPRRestoreEnd = FirstGPRRestoreI;
1556+
if (PPRCalleeSavedSize) {
1557+
PPRRestoreBegin = std::prev(PPRRestoreEnd);
1558+
while (PPRRestoreBegin != MBB.begin() &&
1559+
isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin)))
1560+
--PPRRestoreBegin;
1561+
}
1562+
1563+
MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin,
1564+
ZPRRestoreEnd = PPRRestoreBegin;
1565+
if (ZPRCalleeSavedSize) {
1566+
ZPRRestoreBegin = std::prev(ZPRRestoreEnd);
1567+
while (ZPRRestoreBegin != MBB.begin() &&
1568+
isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin)))
1569+
--ZPRRestoreBegin;
1570+
}
1571+
1572+
auto CFAOffset =
1573+
SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
1574+
if (PPRCalleeSavedSize || ZPRCalleeSavedSize) {
1575+
// Deallocate the non-SVE locals first before we can deallocate (and
1576+
// restore callee saves) from the SVE area.
1577+
auto NonSVELocals = StackOffset::getFixed(NumBytes);
1578+
emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
1579+
NonSVELocals, TII, MachineInstr::FrameDestroy, false,
1580+
false, nullptr, EmitCFI && !HasFP, CFAOffset);
1581+
NumBytes = 0;
1582+
CFAOffset -= NonSVELocals;
1583+
}
1584+
1585+
if (ZPRLocalsSize) {
1586+
emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
1587+
ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false,
1588+
false, nullptr, EmitCFI && !HasFP, CFAOffset);
1589+
CFAOffset -= ZPRLocalsSize;
1590+
}
1591+
1592+
if (PPRLocalsSize || ZPRCalleeSavedSize) {
1593+
assert(PPRRestoreBegin == ZPRRestoreEnd &&
1594+
"Expected PPR restores after ZPR");
1595+
emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
1596+
PPRLocalsSize + ZPRCalleeSavedSize, TII,
1597+
MachineInstr::FrameDestroy, false, false, nullptr,
1598+
EmitCFI && !HasFP, CFAOffset);
1599+
CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize;
1600+
}
1601+
if (PPRCalleeSavedSize) {
1602+
emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP,
1603+
PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy,
1604+
false, false, nullptr, EmitCFI && !HasFP, CFAOffset);
1605+
}
1606+
1607+
// We only emit CFI information for ZPRs so emit CFI after the ZPR restores.
1608+
if (EmitCFI)
1609+
emitCalleeSavedSVERestores(ZPRRestoreEnd);
15091610
}
15101611

15111612
if (!HasFP) {

0 commit comments

Comments
 (0)