Skip to content

Commit 445bd56

Browse files
author
Nuri Amari
committed
Make WriteIndexesThinBackend multi threaded
We've noticed that for large builds executing a thin-link can take on the order of 10s of minutes. We are only using a single thread to write the sharded indices and import files for each input bitcode file. While we need to ensure the index files produced list modules in a deterministic order, that doesn't prevent us from executing the rest of the work in parallel. In this change we use a thread pool to execute as much of the backend's work as possible in parallel. In local testing on a machine with 80 cores, this change makes a thin-link for ~100,000 input files run in ~2 minutes. Without this change it takes upwards of 10 minutes.
1 parent acf92a4 commit 445bd56

File tree

1 file changed

+51
-19
lines changed

1 file changed

+51
-19
lines changed

llvm/lib/LTO/LTO.cpp

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,11 +1395,12 @@ class lto::ThinBackendProc {
13951395
MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
13961396
virtual Error wait() = 0;
13971397
virtual unsigned getThreadCount() = 0;
1398+
virtual bool isSensitiveToInputOrder() { return false; }
13981399

13991400
// Write sharded indices and (optionally) imports to disk
14001401
Error emitFiles(const FunctionImporter::ImportMapTy &ImportList,
14011402
llvm::StringRef ModulePath,
1402-
const std::string &NewModulePath) {
1403+
const std::string &NewModulePath) const {
14031404
ModuleToSummariesForIndexTy ModuleToSummariesForIndex;
14041405
GVSummaryPtrSet DeclarationSummaries;
14051406

@@ -1614,6 +1615,10 @@ namespace {
16141615
class WriteIndexesThinBackend : public ThinBackendProc {
16151616
std::string OldPrefix, NewPrefix, NativeObjectPrefix;
16161617
raw_fd_ostream *LinkedObjectsFile;
1618+
DefaultThreadPool BackendThreadPool;
1619+
std::optional<Error> Err;
1620+
std::mutex ErrMu;
1621+
std::mutex OnWriteMu;
16171622

16181623
public:
16191624
WriteIndexesThinBackend(
@@ -1635,8 +1640,6 @@ class WriteIndexesThinBackend : public ThinBackendProc {
16351640
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
16361641
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
16371642
StringRef ModulePath = BM.getModuleIdentifier();
1638-
std::string NewModulePath =
1639-
getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
16401643

16411644
if (LinkedObjectsFile) {
16421645
std::string ObjectPrefix =
@@ -1646,19 +1649,48 @@ class WriteIndexesThinBackend : public ThinBackendProc {
16461649
*LinkedObjectsFile << LinkedObjectsFilePath << '\n';
16471650
}
16481651

1649-
if (auto E = emitFiles(ImportList, ModulePath, NewModulePath))
1650-
return E;
1652+
BackendThreadPool.async(
1653+
[this](const StringRef ModulePath,
1654+
const FunctionImporter::ImportMapTy &ImportList,
1655+
const std::string &OldPrefix, const std::string &NewPrefix) {
1656+
std::string NewModulePath =
1657+
getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
1658+
auto E = emitFiles(ImportList, ModulePath, NewModulePath);
1659+
if (E) {
1660+
std::unique_lock<std::mutex> L(ErrMu);
1661+
if (Err)
1662+
Err = joinErrors(std::move(*Err), std::move(E));
1663+
else
1664+
Err = std::move(E);
1665+
return;
1666+
}
1667+
if (OnWrite) {
1668+
// Serialize calls to the on write callback in case it is not thread
1669+
// safe
1670+
std::unique_lock<std::mutex> L(OnWriteMu);
1671+
OnWrite(std::string(ModulePath));
1672+
}
1673+
},
1674+
ModulePath, ImportList, OldPrefix, NewPrefix);
1675+
return Error::success();
1676+
}
16511677

1652-
if (OnWrite)
1653-
OnWrite(std::string(ModulePath));
1678+
Error wait() override {
1679+
BackendThreadPool.wait();
1680+
if (Err)
1681+
return std::move(*Err);
16541682
return Error::success();
16551683
}
16561684

1657-
Error wait() override { return Error::success(); }
1685+
unsigned getThreadCount() override {
1686+
return BackendThreadPool.getMaxConcurrency();
1687+
}
16581688

1659-
// WriteIndexesThinBackend should always return 1 to prevent module
1660-
// re-ordering and avoid non-determinism in the final link.
1661-
unsigned getThreadCount() override { return 1; }
1689+
bool isSensitiveToInputOrder() override {
1690+
// The order which modules are written to LinkedObjectsFile should be
1691+
// deterministic and match the order they are passed on the command line.
1692+
return true;
1693+
}
16621694
};
16631695
} // end anonymous namespace
16641696

@@ -1854,20 +1886,20 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
18541886
ResolvedODR[Mod.first], ThinLTO.ModuleMap);
18551887
};
18561888

1857-
if (BackendProcess->getThreadCount() == 1) {
1858-
// Process the modules in the order they were provided on the
1859-
// command-line. It is important for this codepath to be used for
1860-
// WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
1861-
// ThinLTO objects in the same order as the inputs, which otherwise would
1862-
// affect the final link order.
1889+
if (BackendProcess->getThreadCount() == 1 ||
1890+
BackendProcess->isSensitiveToInputOrder()) {
1891+
// Process the modules in the order they were provided on the command-line.
1892+
// It is important for this codepath to be used for WriteIndexesThinBackend,
1893+
// to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
1894+
// order as the inputs, which otherwise would affect the final link order.
18631895
for (int I = 0, E = ModuleMap.size(); I != E; ++I)
18641896
if (Error E = ProcessOneModule(I))
18651897
return E;
18661898
} else {
18671899
// When executing in parallel, process largest bitsize modules first to
18681900
// improve parallelism, and avoid starving the thread pool near the end.
1869-
// This saves about 15 sec on a 36-core machine while link `clang.exe`
1870-
// (out of 100 sec).
1901+
// This saves about 15 sec on a 36-core machine while link `clang.exe` (out
1902+
// of 100 sec).
18711903
std::vector<BitcodeModule *> ModulesVec;
18721904
ModulesVec.reserve(ModuleMap.size());
18731905
for (auto &Mod : ModuleMap)

0 commit comments

Comments
 (0)