[BOLT] Custom function alignment

spupyrev · memfrob · commit dfbc472dbe10 · 2022-10-04T19:13:51.000-04:00
Summary:
A new 'compact' function aligner that takes function sizes into consideration. The approach is based on the following assumptions:
-- It is not desirable to introduce a large offset when aligning short functions, as it leads to a lot of "wasted" address space.
-- For longer functions, the offset can be larger than the default 32 bytes; however, using 64 bytes for the offset still worsens performance, as again a lot of address space is wasted.
-- Cold parts of functions can still use the default max-32 offset.

The algorithm is switched on/off by flag 'use-compact-aligner' and is controlled by parameters align-functions-max-bytes and align-cold-functions-max-bytes described above. In my tests the best performance is produced with '-use-compact-aligner=true -align-functions-max-bytes=48 -align-cold-functions-max-bytes=32'.

(cherry picked from FBD6194092)
diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h
@@ -241,7 +241,13 @@ class BinaryFunction {
   uint64_t MaxSize{std::numeric_limits<uint64_t>::max()};
 
   /// Alignment requirements for the function.
-  uint64_t Alignment{2};
+  uint16_t Alignment{2};
+
+  /// Maximum number of bytes used for alignment of hot part of the function.
+  uint16_t MaxAlignmentBytes{0};
+
+  /// Maximum number of bytes used for alignment of cold part of the function.
+  uint16_t MaxColdAlignmentBytes{0};
 
   const MCSymbol *PersonalityFunction{nullptr};
   uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel};
@@ -1580,15 +1586,33 @@ class BinaryFunction {
     return *this;
   }
 
-  BinaryFunction &setAlignment(uint64_t Align) {
+  BinaryFunction &setAlignment(uint16_t Align) {
     Alignment = Align;
     return *this;
   }
 
-  uint64_t getAlignment() const {
+  uint16_t getAlignment() const {
     return Alignment;
   }
 
+  BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) {
+    MaxAlignmentBytes = MaxAlignBytes;
+    return *this;
+  }
+
+  uint16_t getMaxAlignmentBytes() const {
+    return MaxAlignmentBytes;
+  }
+
+  BinaryFunction &setMaxColdAlignmentBytes(uint16_t MaxAlignBytes) {
+    MaxColdAlignmentBytes = MaxAlignBytes;
+    return *this;
+  }
+
+  uint16_t getMaxColdAlignmentBytes() const {
+    return MaxColdAlignmentBytes;
+  }
+
   BinaryFunction &setImageAddress(uint64_t Address) {
     ImageAddress = Address;
     return *this;
diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "BinaryPassManager.h"
+#include "Passes/Aligner.h"
 #include "Passes/AllocCombiner.h"
 #include "Passes/FrameOptimizer.h"
 #include "Passes/IndirectCallPromotion.h"
@@ -393,6 +394,8 @@ void BinaryFunctionPassManager::runAllPasses(
     llvm::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
     opts::SimplifyConditionalTailCalls);
 
+  Manager.registerPass(llvm::make_unique<AlignerPass>());
+
   // This pass should always run last.*
   Manager.registerPass(llvm::make_unique<FinalizeFunctions>(PrintFinalized));
 
diff --git a/bolt/Passes/Aligner.cpp b/bolt/Passes/Aligner.cpp
@@ -0,0 +1,101 @@
+//===--- Aligner.cpp ------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Aligner.h"
+
+using namespace llvm;
+
+namespace opts {
+extern cl::OptionCategory BoltOptCategory;
+
+extern cl::opt<bool> Relocs;
+
+// Selects alignCompact() over alignMaxBytes() in AlignerPass; off by default.
+cl::opt<bool> UseCompactAligner("use-compact-aligner",
+  cl::desc("Use compact approach for aligning functions"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+// Byte boundary passed to BinaryFunction::setAlignment(); 64 by default.
+cl::opt<unsigned> AlignFunctions("align-functions",
+  cl::desc("align functions at a given value (relocation mode)"),
+  cl::init(64),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+// Cap on the bytes used to reach that boundary, for hot and cold parts alike.
+cl::opt<unsigned> AlignFunctionsMaxBytes("align-functions-max-bytes",
+  cl::desc("maximum number of bytes to use to align functions"),
+  cl::init(32),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+} // end namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+// Align the function to the opts::AlignFunctions boundary (typically, 64),
+// with at most opts::AlignFunctionsMaxBytes bytes used for either part.
+void alignMaxBytes(BinaryFunction &Function) {
+  Function.setAlignment(opts::AlignFunctions)
+      .setMaxAlignmentBytes(opts::AlignFunctionsMaxBytes)
+      .setMaxColdAlignmentBytes(opts::AlignFunctionsMaxBytes);
+}
+
+// Size-aware ("compact") alignment: align the function to the
+// opts::AlignFunctions boundary, limiting the bytes spent on alignment of
+// each part (hot, cold) to the minimum over
+// -- the code size of that part
+// -- opts::AlignFunctionsMaxBytes
+void alignCompact(BinaryContext &BC, BinaryFunction &Function) {
+  size_t HotBytes = 0;
+  size_t ColdBytes = 0;
+  for (const auto *Block : Function.layout()) {
+    const auto BlockBytes = BC.computeCodeSize(Block->begin(), Block->end());
+    (Block->isCold() ? ColdBytes : HotBytes) += BlockBytes;
+  }
+
+  const size_t Limit = static_cast<size_t>(opts::AlignFunctionsMaxBytes);
+  Function.setAlignment(opts::AlignFunctions);
+
+  // A part without code keeps whatever limit it already had.
+  if (HotBytes != 0)
+    Function.setMaxAlignmentBytes(std::min(Limit, HotBytes));
+
+  // The cold part shares the max-bytes option with the hot part, as aligning
+  // cold functions typically does not affect performance.
+  if (ColdBytes != 0)
+    Function.setMaxColdAlignmentBytes(std::min(Limit, ColdBytes));
+}
+
+} // end anonymous namespace
+
+void AlignerPass::runOnFunctions(BinaryContext &BC,
+                                 std::map<uint64_t, BinaryFunction> &BFs,
+                                 std::set<uint64_t> &LargeFunctions) {
+  // Alignment parameters are only assigned when opts::Relocs is set.
+  if (!opts::Relocs)
+    return;
+
+  const bool Compact = opts::UseCompactAligner;
+  for (auto &Entry : BFs)
+    if (Compact)
+      alignCompact(BC, Entry.second);
+    else
+      alignMaxBytes(Entry.second);
+}
+
+} // end namespace bolt
+} // end namespace llvm
diff --git a/bolt/Passes/Aligner.h b/bolt/Passes/Aligner.h
@@ -0,0 +1,38 @@
+//===--------- Passes/Aligner.h -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_ALIGNER_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_ALIGNER_H
+
+#include "BinaryPasses.h"
+
+namespace llvm {
+namespace bolt {
+
+/// Binary function pass registered under the name "aligner"; its work is
+/// done in runOnFunctions(), which is defined out of line.
+class AlignerPass : public BinaryFunctionPass {
+public:
+  explicit AlignerPass() : BinaryFunctionPass(false) {}
+
+  const char *getName() const override { return "aligner"; }
+
+  /// Pass entry point; sets alignment parameters on the functions in \p BFs.
+  void runOnFunctions(BinaryContext &BC,
+                      std::map<uint64_t, BinaryFunction> &BFs,
+                      std::set<uint64_t> &LargeFunctions) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+
+#endif
diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(LLVMBOLTPasses
+  Aligner.cpp
   AllocCombiner.cpp
   BinaryPasses.cpp
   BinaryFunctionCallGraph.cpp
diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp
@@ -90,20 +90,6 @@ OutputFilename("o",
   cl::Required,
   cl::cat(BoltOutputCategory));
 
-cl::opt<unsigned>
-AlignFunctions("align-functions",
-  cl::desc("align functions at a given value (relocation mode)"),
-  cl::init(64),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
-cl::opt<unsigned>
-AlignFunctionsMaxBytes("align-functions-max-bytes",
-  cl::desc("maximum number of bytes to use to align functions"),
-  cl::init(32),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
 cl::opt<bool>
 AllowStripped("allow-stripped",
   cl::desc("allow processing of stripped binaries"),
@@ -2190,8 +2176,11 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio
 
   if (opts::Relocs) {
     Streamer.EmitCodeAlignment(BinaryFunction::MinAlign);
-    Streamer.EmitCodeAlignment(opts::AlignFunctions,
-                               opts::AlignFunctionsMaxBytes);
+    auto MaxAlignBytes = EmitColdPart
+      ? Function.getMaxColdAlignmentBytes()
+      : Function.getMaxAlignmentBytes();
+    if (MaxAlignBytes > 0)
+      Streamer.EmitCodeAlignment(Function.getAlignment(), MaxAlignBytes);
   } else {
     Streamer.EmitCodeAlignment(Function.getAlignment());
     Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress());

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`add_llvm_library(LLVMBOLTPasses`
	`2`	`+ Aligner.cpp`
`2`	`3`	`AllocCombiner.cpp`
`3`	`4`	`BinaryPasses.cpp`
`4`	`5`	`BinaryFunctionCallGraph.cpp`