-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[Draft] Support save/restore point splitting in shrink-wrap #119359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[Draft] Support save/restore point splitting in shrink-wrap #119359
Conversation
|
@llvm/pr-subscribers-backend-webassembly @llvm/pr-subscribers-backend-arm Author: Elizaveta Noskova (enoskova-sc) ChangesThis patch introduces "-enable-shrink-wrap-into-multiple-points" Current algorithm disables Save / Restore point splitting for This patch also add support for multiple Save / Restore points only for RISCV. Now ShrinkWrap produces:
Shrink-Wrap points split Part 5. Part 1: #117862 Patch is 393.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119359.diff 358 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 09a6ca936fe1f4..d4dc53fc0ed32c 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -610,6 +610,24 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineJumpTable::Entry)
namespace llvm {
namespace yaml {
+struct SRPEntry {
+ StringValue Point;
+ std::vector<StringValue> Registers;
+
+ bool operator==(const SRPEntry &Other) const {
+ return Point == Other.Point && Registers == Other.Registers;
+ }
+};
+
+using SaveRestorePoints = std::vector<SRPEntry>;
+
+template <> struct MappingTraits<SRPEntry> {
+ static void mapping(IO &YamlIO, SRPEntry &Entry) {
+ YamlIO.mapRequired("point", Entry.Point);
+ YamlIO.mapRequired("registers", Entry.Registers);
+ }
+};
+
template <> struct MappingTraits<MachineJumpTable> {
static void mapping(IO &YamlIO, MachineJumpTable &JT) {
YamlIO.mapRequired("kind", JT.Kind);
@@ -618,6 +636,14 @@ template <> struct MappingTraits<MachineJumpTable> {
}
};
+} // namespace yaml
+} // namespace llvm
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::SRPEntry)
+
+namespace llvm {
+namespace yaml {
+
/// Serializable representation of MachineFrameInfo.
///
/// Doesn't serialize attributes like 'StackAlignment', 'IsStackRealignable' and
@@ -645,8 +671,8 @@ struct MachineFrameInfo {
bool HasTailCall = false;
bool IsCalleeSavedInfoValid = false;
unsigned LocalFrameSize = 0;
- StringValue SavePoint;
- StringValue RestorePoint;
+ SaveRestorePoints SavePoints;
+ SaveRestorePoints RestorePoints;
bool operator==(const MachineFrameInfo &Other) const {
return IsFrameAddressTaken == Other.IsFrameAddressTaken &&
@@ -667,7 +693,8 @@ struct MachineFrameInfo {
HasMustTailInVarArgFunc == Other.HasMustTailInVarArgFunc &&
HasTailCall == Other.HasTailCall &&
LocalFrameSize == Other.LocalFrameSize &&
- SavePoint == Other.SavePoint && RestorePoint == Other.RestorePoint &&
+ SavePoints == Other.SavePoints &&
+ RestorePoints == Other.RestorePoints &&
IsCalleeSavedInfoValid == Other.IsCalleeSavedInfoValid;
}
};
@@ -699,10 +726,12 @@ template <> struct MappingTraits<MachineFrameInfo> {
YamlIO.mapOptional("isCalleeSavedInfoValid", MFI.IsCalleeSavedInfoValid,
false);
YamlIO.mapOptional("localFrameSize", MFI.LocalFrameSize, (unsigned)0);
- YamlIO.mapOptional("savePoint", MFI.SavePoint,
- StringValue()); // Don't print it out when it's empty.
- YamlIO.mapOptional("restorePoint", MFI.RestorePoint,
- StringValue()); // Don't print it out when it's empty.
+ YamlIO.mapOptional(
+ "savePoints", MFI.SavePoints,
+ SaveRestorePoints()); // Don't print it out when it's empty.
+ YamlIO.mapOptional(
+ "restorePoints", MFI.RestorePoints,
+ SaveRestorePoints()); // Don't print it out when it's empty.
}
};
diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h
index 74cf94398736dd..88800d91ef51a9 100644
--- a/llvm/include/llvm/CodeGen/MachineDominators.h
+++ b/llvm/include/llvm/CodeGen/MachineDominators.h
@@ -185,6 +185,11 @@ class MachineDominatorTree : public DomTreeBase<MachineBasicBlock> {
return Base::findNearestCommonDominator(A, B);
}
+ /// Returns the nearest common dominator of the given blocks.
+ /// If that tree node is a virtual root, a nullptr will be returned.
+ MachineBasicBlock *
+ findNearestCommonDominator(ArrayRef<MachineBasicBlock *> Blocks) const;
+
MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
applySplitCriticalEdges();
return Base::getNode(BB);
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 213b7ec6b3fbfb..d746466d41c3e2 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -27,6 +27,21 @@ class MachineBasicBlock;
class BitVector;
class AllocaInst;
+using SaveRestorePoints = DenseMap<MachineBasicBlock *, std::vector<Register>>;
+
+class CalleeSavedInfoPerBB {
+ DenseMap<MachineBasicBlock *, std::vector<CalleeSavedInfo>> Map;
+
+public:
+ std::vector<CalleeSavedInfo> get(MachineBasicBlock *MBB) const {
+ return Map.lookup(MBB);
+ }
+
+ void set(DenseMap<MachineBasicBlock *, std::vector<CalleeSavedInfo>> CSI) {
+ Map = std::move(CSI);
+ }
+};
+
/// The CalleeSavedInfo class tracks the information need to locate where a
/// callee saved register is in the current frame.
/// Callee saved reg can also be saved to a different register rather than
@@ -37,6 +52,8 @@ class CalleeSavedInfo {
int FrameIdx;
unsigned DstReg;
};
+ std::vector<MachineBasicBlock *> SpilledIn;
+ std::vector<MachineBasicBlock *> RestoredIn;
/// Flag indicating whether the register is actually restored in the epilog.
/// In most cases, if a register is saved, it is also restored. There are
/// some situations, though, when this is not the case. For example, the
@@ -58,9 +75,9 @@ class CalleeSavedInfo {
explicit CalleeSavedInfo(unsigned R, int FI = 0) : Reg(R), FrameIdx(FI) {}
// Accessors.
- Register getReg() const { return Reg; }
- int getFrameIdx() const { return FrameIdx; }
- unsigned getDstReg() const { return DstReg; }
+ Register getReg() const { return Reg; }
+ int getFrameIdx() const { return FrameIdx; }
+ unsigned getDstReg() const { return DstReg; }
void setFrameIdx(int FI) {
FrameIdx = FI;
SpilledToReg = false;
@@ -72,6 +89,16 @@ class CalleeSavedInfo {
bool isRestored() const { return Restored; }
void setRestored(bool R) { Restored = R; }
bool isSpilledToReg() const { return SpilledToReg; }
+ ArrayRef<MachineBasicBlock *> spilledIn() const { return SpilledIn; }
+ ArrayRef<MachineBasicBlock *> restoredIn() const { return RestoredIn; }
+ void addSpilledIn(MachineBasicBlock *MBB) { SpilledIn.push_back(MBB); }
+ void addRestoredIn(MachineBasicBlock *MBB) { RestoredIn.push_back(MBB); }
+ void setSpilledIn(std::vector<MachineBasicBlock *> BBV) {
+ SpilledIn = std::move(BBV);
+ }
+ void setRestoredIn(std::vector<MachineBasicBlock *> BBV) {
+ RestoredIn = std::move(BBV);
+ }
};
/// The MachineFrameInfo class represents an abstract stack frame until
@@ -295,6 +322,10 @@ class MachineFrameInfo {
/// Has CSInfo been set yet?
bool CSIValid = false;
+ CalleeSavedInfoPerBB CSInfoPerSave;
+
+ CalleeSavedInfoPerBB CSInfoPerRestore;
+
/// References to frame indices which are mapped
/// into the local frame allocation block. <FrameIdx, LocalOffset>
SmallVector<std::pair<int, int64_t>, 32> LocalFrameObjects;
@@ -331,9 +362,16 @@ class MachineFrameInfo {
bool HasTailCall = false;
/// Not null, if shrink-wrapping found a better place for the prologue.
- MachineBasicBlock *Save = nullptr;
+ MachineBasicBlock *Prolog = nullptr;
/// Not null, if shrink-wrapping found a better place for the epilogue.
- MachineBasicBlock *Restore = nullptr;
+ MachineBasicBlock *Epilog = nullptr;
+
+ /// Not empty, if shrink-wrapping found a better place for saving callee
+ /// saves.
+ SaveRestorePoints SavePoints;
+ /// Not empty, if shrink-wrapping found a better place for restoring callee
+ /// saves.
+ SaveRestorePoints RestorePoints;
/// Size of the UnsafeStack Frame
uint64_t UnsafeStackSize = 0;
@@ -809,21 +847,105 @@ class MachineFrameInfo {
/// \copydoc getCalleeSavedInfo()
std::vector<CalleeSavedInfo> &getCalleeSavedInfo() { return CSInfo; }
+ /// Returns callee saved info vector for provided save point in
+ /// the current function.
+ std::vector<CalleeSavedInfo> getCSInfoPerSave(MachineBasicBlock *MBB) const {
+ return CSInfoPerSave.get(MBB);
+ }
+
+ /// Returns callee saved info vector for provided restore point
+ /// in the current function.
+ std::vector<CalleeSavedInfo>
+ getCSInfoPerRestore(MachineBasicBlock *MBB) const {
+ return CSInfoPerRestore.get(MBB);
+ }
+
/// Used by prolog/epilog inserter to set the function's callee saved
/// information.
void setCalleeSavedInfo(std::vector<CalleeSavedInfo> CSI) {
CSInfo = std::move(CSI);
}
+ /// Used by prolog/epilog inserter to set the function's callee saved
+ /// information for particular save point.
+ void setCSInfoPerSave(
+ DenseMap<MachineBasicBlock *, std::vector<CalleeSavedInfo>> CSI) {
+ CSInfoPerSave.set(CSI);
+ }
+
+ /// Used by prolog/epilog inserter to set the function's callee saved
+ /// information for particular restore point.
+ void setCSInfoPerRestore(
+ DenseMap<MachineBasicBlock *, std::vector<CalleeSavedInfo>> CSI) {
+ CSInfoPerRestore.set(CSI);
+ }
+
/// Has the callee saved info been calculated yet?
bool isCalleeSavedInfoValid() const { return CSIValid; }
void setCalleeSavedInfoValid(bool v) { CSIValid = v; }
- MachineBasicBlock *getSavePoint() const { return Save; }
- void setSavePoint(MachineBasicBlock *NewSave) { Save = NewSave; }
- MachineBasicBlock *getRestorePoint() const { return Restore; }
- void setRestorePoint(MachineBasicBlock *NewRestore) { Restore = NewRestore; }
+ const SaveRestorePoints &getRestorePoints() const { return RestorePoints; }
+
+ const SaveRestorePoints &getSavePoints() const { return SavePoints; }
+
+ std::pair<MachineBasicBlock *, std::vector<Register>>
+ getRestorePoint(MachineBasicBlock *MBB) const {
+ if (auto It = RestorePoints.find(MBB); It != RestorePoints.end())
+ return *It;
+
+ std::vector<Register> Regs = {};
+ return std::make_pair(nullptr, Regs);
+ }
+
+ std::pair<MachineBasicBlock *, std::vector<Register>>
+ getSavePoint(MachineBasicBlock *MBB) const {
+ if (auto It = SavePoints.find(MBB); It != SavePoints.end())
+ return *It;
+
+ std::vector<Register> Regs = {};
+ return std::make_pair(nullptr, Regs);
+ }
+
+ void setSavePoints(SaveRestorePoints NewSavePoints) {
+ SavePoints = std::move(NewSavePoints);
+ }
+
+ void setRestorePoints(SaveRestorePoints NewRestorePoints) {
+ RestorePoints = std::move(NewRestorePoints);
+ }
+
+ void setSavePoint(MachineBasicBlock *MBB, std::vector<Register> &Regs) {
+ if (SavePoints.contains(MBB))
+ SavePoints[MBB] = Regs;
+ else
+ SavePoints.insert(std::make_pair(MBB, Regs));
+ }
+
+ static const SaveRestorePoints constructSaveRestorePoints(
+ const SaveRestorePoints &SRP,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &BBMap) {
+ SaveRestorePoints Pts{};
+ for (auto &Src : SRP) {
+ Pts.insert(std::make_pair(BBMap.find(Src.first)->second, Src.second));
+ }
+ return Pts;
+ }
+
+ void setRestorePoint(MachineBasicBlock *MBB, std::vector<Register> &Regs) {
+ if (RestorePoints.contains(MBB))
+ RestorePoints[MBB] = Regs;
+ else
+ RestorePoints.insert(std::make_pair(MBB, Regs));
+ }
+
+ MachineBasicBlock *getProlog() const { return Prolog; }
+ void setProlog(MachineBasicBlock *BB) { Prolog = BB; }
+ MachineBasicBlock *getEpilog() const { return Epilog; }
+ void setEpilog(MachineBasicBlock *BB) { Epilog = BB; }
+
+ void clearSavePoints() { SavePoints.clear(); }
+ void clearRestorePoints() { RestorePoints.clear(); }
uint64_t getUnsafeStackSize() const { return UnsafeStackSize; }
void setUnsafeStackSize(uint64_t Size) { UnsafeStackSize = Size; }
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 97de0197da9b40..373455a630a993 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -199,6 +199,10 @@ class TargetFrameLowering {
return false;
}
+ /// enableCSRSaveRestorePointsSplit - Returns true if the target support
+ /// multiple save/restore points in shrink wrapping.
+ virtual bool enableCSRSaveRestorePointsSplit() const { return false; }
+
/// Returns true if the stack slot holes in the fixed and callee-save stack
/// area should be used when allocating other stack locations to reduce stack
/// size.
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index e2543f883f91ce..835eeb2362ceb1 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -124,6 +124,10 @@ class MIRParserImpl {
bool initializeFrameInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF);
+ bool initializeSaveRestorePoints(PerFunctionMIParsingState &PFS,
+ const yaml::SaveRestorePoints &YamlSRP,
+ bool IsSavePoints);
+
bool initializeCallSiteInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF);
@@ -832,18 +836,9 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
MFI.setHasTailCall(YamlMFI.HasTailCall);
MFI.setCalleeSavedInfoValid(YamlMFI.IsCalleeSavedInfoValid);
MFI.setLocalFrameSize(YamlMFI.LocalFrameSize);
- if (!YamlMFI.SavePoint.Value.empty()) {
- MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint))
- return true;
- MFI.setSavePoint(MBB);
- }
- if (!YamlMFI.RestorePoint.Value.empty()) {
- MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint))
- return true;
- MFI.setRestorePoint(MBB);
- }
+ initializeSaveRestorePoints(PFS, YamlMFI.SavePoints, true /*IsSavePoints*/);
+ initializeSaveRestorePoints(PFS, YamlMFI.RestorePoints,
+ false /*IsSavePoints*/);
std::vector<CalleeSavedInfo> CSIInfo;
// Initialize the fixed frame objects.
@@ -1058,8 +1053,40 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS,
return false;
}
-bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
- const yaml::MachineJumpTable &YamlJTI) {
+bool MIRParserImpl::initializeSaveRestorePoints(
+ PerFunctionMIParsingState &PFS, const yaml::SaveRestorePoints &YamlSRP,
+ bool IsSavePoints) {
+ SMDiagnostic Error;
+ MachineFunction &MF = PFS.MF;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ llvm::SaveRestorePoints SRPoints;
+
+ for (const auto &Entry : YamlSRP) {
+ const auto &MBBSource = Entry.Point;
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, MBBSource.Value))
+ return true;
+
+ std::vector<Register> Registers{};
+ for (auto &RegStr : Entry.Registers) {
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, RegStr.Value, Error))
+ return error(Error, RegStr.SourceRange);
+
+ Registers.push_back(Reg);
+ }
+ SRPoints.insert(std::make_pair(MBB, Registers));
+ }
+
+ if (IsSavePoints)
+ MFI.setSavePoints(SRPoints);
+ else
+ MFI.setRestorePoints(SRPoints);
+ return false;
+}
+
+bool MIRParserImpl::initializeJumpTableInfo(
+ PerFunctionMIParsingState &PFS, const yaml::MachineJumpTable &YamlJTI) {
MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind);
for (const auto &Entry : YamlJTI.Entries) {
std::vector<MachineBasicBlock *> Blocks;
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index c8f6341c1224d2..ea7c504d355c19 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -117,7 +117,10 @@ class MIRPrinter {
const MachineRegisterInfo &RegInfo,
const TargetRegisterInfo *TRI);
void convert(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI,
- const MachineFrameInfo &MFI);
+ const MachineFrameInfo &MFI, const TargetRegisterInfo *TRI);
+ void convert(ModuleSlotTracker &MST, yaml::SaveRestorePoints &YamlSRP,
+ const DenseMap<MachineBasicBlock *, std::vector<Register>> &SRP,
+ const TargetRegisterInfo *TRI);
void convert(yaml::MachineFunction &MF,
const MachineConstantPool &ConstantPool);
void convert(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI,
@@ -235,7 +238,8 @@ void MIRPrinter::print(const MachineFunction &MF) {
convert(YamlMF, MF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
MachineModuleSlotTracker MST(MMI, &MF);
MST.incorporateFunction(MF.getFunction());
- convert(MST, YamlMF.FrameInfo, MF.getFrameInfo());
+ convert(MST, YamlMF.FrameInfo, MF.getFrameInfo(),
+ MF.getSubtarget().getRegisterInfo());
convertStackObjects(YamlMF, MF, MST);
convertEntryValueObjects(YamlMF, MF, MST);
convertCallSiteObjects(YamlMF, MF, MST);
@@ -372,7 +376,8 @@ void MIRPrinter::convert(yaml::MachineFunction &YamlMF,
void MIRPrinter::convert(ModuleSlotTracker &MST,
yaml::MachineFrameInfo &YamlMFI,
- const MachineFrameInfo &MFI) {
+ const MachineFrameInfo &MFI,
+ const TargetRegisterInfo *TRI) {
YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken();
YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken();
YamlMFI.HasStackMap = MFI.hasStackMap();
@@ -392,14 +397,10 @@ void MIRPrinter::convert(ModuleSlotTracker &MST,
YamlMFI.HasTailCall = MFI.hasTailCall();
YamlMFI.IsCalleeSavedInfoValid = MFI.isCalleeSavedInfoValid();
YamlMFI.LocalFrameSize = MFI.getLocalFrameSize();
- if (MFI.getSavePoint()) {
- raw_string_ostream StrOS(YamlMFI.SavePoint.Value);
- StrOS << printMBBReference(*MFI.getSavePoint());
- }
- if (MFI.getRestorePoint()) {
- raw_string_ostream StrOS(YamlMFI.RestorePoint.Value);
- StrOS << printMBBReference(*MFI.getRestorePoint());
- }
+ if (!MFI.getSavePoints().empty())
+ convert(MST, YamlMFI.SavePoints, MFI.getSavePoints(), TRI);
+ if (!MFI.getRestorePoints().empty())
+ convert(MST, YamlMFI.RestorePoints, MFI.getRestorePoints(), TRI);
}
void MIRPrinter::convertEntryValueObjects(yaml::MachineFunction &YMF,
@@ -618,6 +619,28 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
}
}
+void MIRPrinter::convert(ModuleSlotTracker &MST,
+ yaml::SaveRestorePoints &YamlSRP,
+ const SaveRestorePoints &SRP,
+ const TargetRegisterInfo *TRI) {
+ for (const auto &MBBEntry : SRP) {
+ std::string Str;
+ yaml::SRPEntry Entry;
+ raw_string_ostream StrOS(Str);
+ StrOS << printMBBReference(*MBBEntry.first);
+ Entry.Point = StrOS.str();
+ Str.clear();
+ for (auto &Reg : MBBEntry.second) {
+ if (Reg != MCRegister::NoRegister) {
+ StrOS << printReg(Reg, TRI);
+ Entry.Registers.push_back(StrOS.str());
+ Str.clear();
+ }
+ }
+ YamlSRP.push_back(Entry);
+ }
+}
+
void MIRPrinter::convert(ModuleSlotTracker &MST,
yaml::MachineJumpTable &YamlJTI,
const MachineJumpTableInfo &JTI) {
diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp
index a2cc8fdfa7c9f9..384f90c6da66c0 100644
--- a/llvm/lib/CodeGen/MachineDominators.cpp
+++ b/llvm/lib/CodeGen/MachineDominators.cpp
@@ -189,3 +189,19 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
NewBBs.clear();
CriticalEdgesToSplit.clear();
}
+
+MachineBasicBlock *MachineDominatorTree::findNearestCommonDominator(
+ ArrayRef<MachineBasicBlock *> Blocks) const {
+ assert(!Blocks.empty());
+
+ MachineBasicBlock *NCD = Blocks.front();
+ for (MachineBasicBlock *BB : Blocks.drop_front()) {
+ NCD = Base::findNearestCommonDominator(NCD, BB);
+
+ // Stop when the root is reached.
+ if (Base::isVirtualRoot(Base::getNode(NCD)))
+ return nullptr;
+ }
+
+ return NCD;
+}
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index e4b993850f73dc..c6658d2e9eba88 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -244,6 +244,23 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
}
OS << "\n";
}
+
+ OS << "save/restore points:\n";
+
+ if (!SavePoints.empty()...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
9f67919 to
ea5e86f
Compare
|
@qcolombet, could you take a look, please? |
| static int64_t calculateCSRSpillOffsets(MachineFrameInfo &MFI, | ||
| const TargetFrameLowering *TFI, | ||
| int MinCSFI, int FrameIdx) { | ||
| int LocalAreaOffset = -TFI->getOffsetOfLocalArea(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this negated instead of using the raw value? Is this assuming the stack grows down?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, here I assume, that for RISC-V stack grows down.
| // does the same, not via new instructions but via save/restore libcalls. | ||
| if (!STI.hasStdExtZcmp() && !STI.enableSaveRestore()) | ||
| return true; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does not return in all paths.
Shrink-Wrap points split Part 2. RFC: https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581 Part 1: #117862 Part 3: #119357 Part 4: #119358 Part 5: #119359
…355) Shrink-Wrap points split Part 2. RFC: https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581 Part 1: llvm/llvm-project#117862 Part 3: llvm/llvm-project#119357 Part 4: llvm/llvm-project#119358 Part 5: llvm/llvm-project#119359
Currently mir supports only one save and one restore point
specification:
```
savePoint: '%bb.1'
restorePoint: '%bb.2'
```
This patch provide possibility to have multiple save and multiple
restore points in mir:
```
savePoints:
- point: '%bb.1'
restorePoints:
- point: '%bb.2'
```
Shrink-Wrap points split Part 3.
RFC:
https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581
Part 1: #117862
Part 2: #119355
Part 4: #119358
Part 5: #119359
Currently mir supports only one save and one restore point
specification:
```
savePoint: '%bb.1'
restorePoint: '%bb.2'
```
This patch provide possibility to have multiple save and multiple
restore points in mir:
```
savePoints:
- point: '%bb.1'
restorePoints:
- point: '%bb.2'
```
Shrink-Wrap points split Part 3.
RFC:
https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581
Part 1: llvm/llvm-project#117862
Part 2: llvm/llvm-project#119355
Part 4: llvm/llvm-project#119358
Part 5: llvm/llvm-project#119359
dongjianqiang2
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In practice this causes shrink wrapping to degenerate toward the traditional “save all at entry / restore all at exit” approach, losing the benefit of minimizing prologue/epilogue scope.
llvm/lib/CodeGen/ShrinkWrap.cpp
Outdated
| void ShrinkWrap::performSimpleShrinkWrap(RegScavenger *RS, | ||
| MachineBasicBlock &SavePoint) { | ||
| auto MF = SavePoint.getParent(); | ||
| auto CSRs = getTargetCSRList(*MF); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whenever a basic block touches any callee-saved register (CSR), the pass updates save/restore points for all CSRs. This can make the result overly conservative: saves are moved closer to the function entry and restores closer to the exit, even for registers that are not actually used in the current block.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@dongjianqiang2, my series of patches doesn't cause the original shrink wrapping to degenerate.
Currently, shrink wrap stops searching for a prolog whenever a basic block touches CSR or a frame index instruction is encountered (https://github.com/llvm/llvm-project/blob/915e9adbe5d1c577a21ac8b495b7c54c465460fd/llvm/lib/CodeGen/ShrinkWrap.cpp#L859 ).
My approach is based on the current shrink wrap algorithm in simple cases (https://github.com/llvm/llvm-project/blob/ea5e86fe0905c79c339f4bf9788a38429c935488/llvm/lib/CodeGen/ShrinkWrap.cpp#L1034 ) and in more advanced cases (if splitting is enabled and possible), it tries not to perform all the saves and restores in prolog/epilog if possible.
I agree that my approach is currently overconservative, but it is planned to be relaxed in the follow-up patches.
Please clarify your opinion if you still think that my approach worsens the current shrink wrap.
…119358) This patch adds the MIR parsing and serialization support for save and restore points with subsets of callee saved registers. That is, it syntactically allows a function to contain two or more distinct sub-regions in which distinct subsets of registers are spilled/filled as callee save. This is useful if e.g. one of the CSRs isn't modified in one of the sub-regions, but is in the other(s). Support for actually using this capability in code generation is still forthcoming. This patch is the next logical step for multiple save/restore points support. All points are now stored in DenseMap from MBB to vector of CalleeSavedInfo. Shrink-Wrap points split Part 4. RFC: https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581 Part 1: #117862 (landed) Part 2: #119355 (landed) Part 3: #119357 (landed) Part 5: #119359 (likely to be further split)
… registers (#119358) This patch adds the MIR parsing and serialization support for save and restore points with subsets of callee saved registers. That is, it syntactically allows a function to contain two or more distinct sub-regions in which distinct subsets of registers are spilled/filled as callee save. This is useful if e.g. one of the CSRs isn't modified in one of the sub-regions, but is in the other(s). Support for actually using this capability in code generation is still forthcoming. This patch is the next logical step for multiple save/restore points support. All points are now stored in DenseMap from MBB to vector of CalleeSavedInfo. Shrink-Wrap points split Part 4. RFC: https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581 Part 1: llvm/llvm-project#117862 (landed) Part 2: llvm/llvm-project#119355 (landed) Part 3: llvm/llvm-project#119357 (landed) Part 5: llvm/llvm-project#119359 (likely to be further split)
If I remember correctly this is to be expanded further. @enoskova-sc can you please chime in? @lenary, feel free to take a look, this is the non-regalloc approach |
| std::sort(CSInfoPerBB[BB].begin(), CSInfoPerBB[BB].end(), | ||
| [](const CalleeSavedInfo &Lhs, const CalleeSavedInfo &Rhs) { | ||
| return Lhs.getFrameIdx() < Rhs.getFrameIdx(); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you missing a continue here?
|
I wanted to bring up this patch (due to @francisvm): https://reviews.llvm.org/D41765 It seems to me like separating the shrink wrapping interface from the implementation would provide us with a good opportunity to experiment with changes like this. |
|
@cofibrant, thank you for pointing me to this RFC https://reviews.llvm.org/D41765. |
|
@enoskova-sc I've started looking at applying this to the latest HEAD in order to do some testing. If you've already done such a rebase / merge then that would be helpful! |
|
@asb, currently, I don't have rebased version. But If you wait a little bit I can rebase this MR and push here. |
With this patch the possibility to store multiple Save and Restore points in MachineFrameInfo appears. As the logical consequnce of it, the notions "Save point" / "Restore point" are no longer synonyms for "Prolog" / "Epilog". Currently, "Prolog" / "Epilog" is the place for stack allocation / deallocation and "Save point" / "Restore point" is the place for register spills and restores. So, now we need to store in MachineFrameInfo not only vector of Save and vector of Restore blocks, but Prolog and Epilog. As we assume to have multiple Save and Restore points we need to know the list of registers, we store / restore in each point. Threfore our SavePoint become a pair <MachineBasicBlock, std::vector<Register>>. The full support for operating with multiple Save / Restore points is supported only in RISCV backend.
ea5e86f to
a22b599
Compare
|
I've rebased my patches against a relatively recent LLVM version. While the patches aren’t perfect yet, I’ve made an effort to ensure all tests pass. As a next step, I plan to split this merge request into smaller parts and prepare the next patch in a separate MR for easier review. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
a22b599 to
a1e733c
Compare
| @@ -0,0 +1,290 @@ | |||
| # RUN: llc -march=riscv64 -run-pass shrink-wrap -enable-shrink-wrap-into-multiple-points=true %s -o - | FileCheck %s | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can the check lines be generated with llvm/utils/update_mir_test_checks.py?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, I don't know how to force update_mir_test_checks.py to add checks on Machine IR "metadata", like savePoint list. Maybe you know?
| @@ -267,6 +266,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. | |||
| ; CHECK-NEXT: cmpld 28, 4 | |||
| ; CHECK-NEXT: ble 0, .LBB0_3 | |||
| ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit | |||
| <<<<<<< HEAD | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like a merge conflict marker got left over
Thank you! I've tried this out (the most recent push, a1e733c) and run into some problems. I'm getting runtime crashes for povray, x264, and xz in spec. I have a reduced test case below that from glancing at the diff is likely a recurring issue, but I can't guarantee it's the only one. Beyond that, the set of functions this results in codegen differences for is not very large and I see ~0% dynamic instruction count difference for the other SPEC benchmarks that do run. This is when compiling with For the miscompile, the following testcase will end up allocating 16 byte stack in the entry block and again in one of its successors, but only deallocating 16 bytes at entry: ; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
%struct.widget = type { float, i8, %struct.baz }
%struct.baz = type { [2 x double], [8 x i8] }
@global = external global [6 x %struct.widget]
@global.1 = external dso_local global [3 x %struct.widget]
define void @snork(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7) {
bb:
%load = load i1, ptr %arg, align 1
br i1 %load, label %bb9, label %bb8
bb8: ; preds = %bb
store float 0x3FD99999A0000000, ptr @global, align 8
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global, i64 8), align 8
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global, i64 12), align 4
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global, i64 16), align 8
store float 0x3FE99999A0000000, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 16), align 8
store float 0x3FB47AE140000000, ptr @global.1, align 8
store float 0.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 56), align 8
store float 0x7FF8000000000000, ptr %arg7, align 4
store float 0.000000e+00, ptr %arg6, align 8
store float 0.000000e+00, ptr %arg1, align 8
store float 1.000000e+00, ptr %arg2, align 4
store i8 0, ptr %arg3, align 4
tail call void @llvm.memset.p0.i64(ptr align 4 %arg, i8 0, i64 16, i1 false)
br label %bb9
bb9: ; preds = %bb8, %bb
ret void
}
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }
I compile with My first attempt to reduce this ended up with an example that actually appears to be correct, but does demonstrate a case where ; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
%struct.widget = type { float, i8, %struct.baz }
%struct.baz = type { [2 x double], [8 x i8] }
@global = external dso_local global [2 x %struct.widget]
@global.1 = external dso_local global [2 x %struct.widget]
@global.2 = external dso_local global [5 x %struct.widget]
define void @snork(ptr %arg, i1 %arg1) {
bb:
br i1 %arg1, label %bb3, label %bb2
bb2: ; preds = %bb
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global, i64 40), align 8
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global, i64 44), align 4
store float 0x3FE3333340000000, ptr @global.1, align 8
store float 0.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 8), align 8
store float 0x3FD3F7CEE0000000, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 12), align 4
store float 0x3FC99999A0000000, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 16), align 8
store float 0.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 44), align 4
store float 0x3FB0E56040000000, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 48), align 8
store float 0.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.1, i64 56), align 8
store float 0x3F50624DE0000000, ptr @global.2, align 8
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.2, i64 48), align 8
store float 0.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.2, i64 56), align 8
store float 0x3F889374C0000000, ptr %arg, align 8
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.2, i64 72), align 8
store float 1.000000e+00, ptr getelementptr inbounds nuw (i8, ptr @global.2, i64 76), align 4
br label %bb3
bb3: ; preds = %bb2, %bb
ret void
}
|
a1e733c to
3e23566
Compare
|
@asb, thank you for reviewing my patch and for highlighting the issues you observed! I’ve addressed the problems in my shrink-wrap implementation that caused the behavior you saw in your reduced examples. Regarding the SPEC failures—you mentioned—I’ll work on reproducing and analyzing them as well. As for the performance results you’re seeing: I confirm they’re expected. The current implementation falls back to the simple (non-split) shrink-wrap mode whenever a frame adjustment instruction is encountered (see code). This means that for functions containing calls (which introduce frame adjustments), shrink-wrap splitting is disabled. The reason for this conservative approach is that functions may use _builtin_alloca, which gets lowered to a frame adjustment instruction, which can happen in the middle of the function and break algorithm assumptions. At the moment, I haven’t found a reliable way to detect _builtin_alloca other than checking for the presence of frame adjustment instructions. I'll prepare an example with _builtin_alloca and post it tomorrow for clarification. |
|
Thanks for the updates. There are definitely remaining issues as I'm still getting crashes in those same benchmarks (and additionally in gcc). Looking at the codegen diffs for spec built with an unmodified baseline compiler vs this branch and picking the first one where the bad codegen is easy to see (another example of stack frame being allocated twice) I get this minimal reproducer: ; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
define double @widget(ptr %arg, ptr %arg1, double %arg2, i1 %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15) {
bb:
%select = select i1 %arg3, double %arg2, double 0.000000e+00
%fadd = fadd double %arg2, -1.000000e+00
%fadd16 = fadd double %arg6, -1.000000e+00
%fsub = fsub double 1.000000e+00, %arg15
%fsub17 = fsub double 1.000000e+00, %arg2
%getelementptr = getelementptr i8, ptr %arg, i64 16
store double %fsub17, ptr %arg, align 8
%call = tail call double @llvm.fmuladd.f64(double %arg2, double 0.000000e+00, double 0.000000e+00)
store double 0.000000e+00, ptr %arg1, align 8
%load = load double, ptr inttoptr (i64 136 to ptr), align 8
%call18 = tail call double @llvm.fmuladd.f64(double %call, double %fadd, double %load)
%call19 = tail call double @llvm.fmuladd.f64(double %arg14, double %arg13, double %call18)
%load20 = load double, ptr inttoptr (i64 176 to ptr), align 8
%call21 = tail call double @llvm.fmuladd.f64(double %load20, double 0.000000e+00, double %call19)
%call22 = tail call double @llvm.fmuladd.f64(double %arg8, double %call21, double %arg4)
store double %call22, ptr %getelementptr, align 8
%load23 = load double, ptr %arg, align 8
%call24 = tail call double @llvm.fmuladd.f64(double %select, double 0.000000e+00, double %load23)
%call25 = tail call double @llvm.fmuladd.f64(double %arg5, double %call24, double %arg7)
store double %call25, ptr %arg, align 8
%load26 = load double, ptr %arg1, align 8
%call27 = tail call double @llvm.fmuladd.f64(double %load26, double 0.000000e+00, double %arg10)
%call28 = tail call double @llvm.fmuladd.f64(double %call27, double 0.000000e+00, double %arg9)
store double %call28, ptr %arg1, align 8
store double %arg2, ptr %arg, align 8
%fmul = fmul double %arg11, %fsub
%call29 = tail call double @llvm.fmuladd.f64(double %fadd16, double 0.000000e+00, double %arg12)
%call30 = tail call double @llvm.fmuladd.f64(double %fmul, double %call29, double 0.000000e+00)
ret double %call30
}
; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
declare double @llvm.fmuladd.f64(double, double, double) #0
attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
The bad codegen is the duplicated |
This patch introduces "-enable-shrink-wrap-into-multiple-points" option, which enables splitting Save and Restore points during ShrinkWrap pass, i.e. insert registers saves and restores as close as possible to their usage. Current algorithm disables Save / Restore point splitting for functions with instructions with FrameIndex operands, with EHPads and with any Stack accesses beacuse it is difficult to prove the safety of it. This patch also add support for multiple Save / Restore points only for RISCV. Now ShrinkWrap produces: - list of SavePoint + Registers - list of RestorePoint + Registers - Prolog (NCD of Save points) - Epilog (NCPD of Restore points)
3e23566 to
59873d9
Compare
|
With the new revision, I have fixed the last reproduction, you provide. Below is the example with __builtin_alloca, I promised. The assembler for it is the following: In the assembler above we can see stack pointer decrement after all the spills performed: With that, all the reloads should be performed, strictly after builtin_alloca's stack decrement, or consider builtin_alloca SP offset. P.S. Currently I'm working on checking and fixing SPECs on qemu. |
|
Hi @enoskova-sc, with the latest push I'm not longer seeing crashes for SPEC povray, x264, and xz. I do still see one for SPEC's gcc benchmark and I'm working to reduce it (sadly this isn't quite as straightforward as the other ones). For context, this is the diffstat after applying this patch: For the dynamic alloca problem you mention, have you looked at whether MFI.hasVarSizedObjects is sufficient? In terms of splitting this up and landing something, being incremental and conservative for each patch is definitely what we need. But given the split shrink wrap logic kicks in so rarely with the current implementation, I'd be very interested in looking ahead a little bit to see what benefit we might get when we relax the restrictions a bit. And if that's not viable then we might need to rethink what to do. |
|
@asb, thanks for pointing out MFI.hasVarSizedObjects—it seems suitable for detecting functions that use alloca. I’ll try using it to relax the current restrictions. |
|
Just for record keeping, a prior round of results can be found here: https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581/7. They seem to roughly match the current results, though deepsjeng does show a small improvement. |
This patch introduces "-enable-shrink-wrap-into-multiple-points"
option, which enables splitting Save and Restore points during ShrinkWrap pass, i.e.
insert registers saves and restores as close as possible to their usage.
Current algorithm disables Save / Restore point splitting for
functions with instructions with FrameIndex operands,
with EHPads and with any Stack accesses beacuse it is difficult to prove the safety of it.
This patch also add support for multiple Save / Restore points only for RISCV.
Now ShrinkWrap produces:
Shrink-Wrap points split Part 5.
RFC: https://discourse.llvm.org/t/shrink-wrap-save-restore-points-splitting/83581
Part 1: #117862
Part 2: #119355
Part 3: #119357
Part 4: #119358