Skip to content

[lld][MachO]Multi-threaded i/o. Twice as fast linking a large project. #147134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 35 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c55b5b2
Multi-threaded disk i/o.
johnno1962 Jul 5, 2025
3d11a33
Afterthoughts.
johnno1962 Jul 6, 2025
02fb145
multiThreadedPageIn of library archives.
johnno1962 Jul 6, 2025
a8eeead
Multi-thread i/o in background.
johnno1962 Jul 8, 2025
55e26a8
Response to first review.
johnno1962 Jul 9, 2025
817036b
Second review.
johnno1962 Jul 12, 2025
c07e168
Seems to make a difference.
johnno1962 Jul 12, 2025
eb4827c
Update lld/MachO/Driver.cpp
johnno1962 Jul 15, 2025
ce93ae3
De-Obfuscate loop and thread reaping.
johnno1962 Jul 15, 2025
c47e5c3
Avoiding possible deadlock.
johnno1962 Jul 17, 2025
5caf5a6
Update lld/MachO/Options.td
johnno1962 Jul 17, 2025
890c492
Update lld/MachO/Driver.cpp
johnno1962 Jul 17, 2025
9714785
Update lld/MachO/Driver.cpp
johnno1962 Jul 17, 2025
85fd77f
Update lld/MachO/Driver.cpp
johnno1962 Jul 17, 2025
6f5f7cb
Fourth review.
johnno1962 Jul 17, 2025
e3e0369
Switch to std::atomic_int.
johnno1962 Jul 17, 2025
febf5a9
Switch to std::unique_ptr.
johnno1962 Jul 17, 2025
a5f7a42
Remove a couple of warnings.
johnno1962 Jul 18, 2025
6b874b2
Try LLVM_ATTRIBUTE_UNUSED
johnno1962 Jul 18, 2025
84154d4
Update lld/MachO/Driver.cpp
johnno1962 Jul 18, 2025
ed9f07e
Comparing inner loops.
johnno1962 Jul 18, 2025
ff732ed
Is this valid C++??
johnno1962 Jul 19, 2025
ef23af2
In search of a work queue abstraction.
johnno1962 Jul 21, 2025
4bf74e8
OK to use Process::GetEnv?
johnno1962 Jul 21, 2025
ed9bdb7
Formatting of benchmarks.
johnno1962 Jul 22, 2025
4cab9be
Encapsulate SerialBackgroundQueue. Remove DEBUG output.
johnno1962 Jul 25, 2025
f7c8008
Revert LLVM_DEBUG as gives error: use of undeclared identifier 'DEBUG…
johnno1962 Jul 25, 2025
a6dd0bc
Update lld/MachO/Driver.cpp
johnno1962 Jul 26, 2025
cf5c3fb
Add NDEBUGs
johnno1962 Jul 26, 2025
5901e7d
Update lld/MachO/Driver.cpp
johnno1962 Jul 30, 2025
39cffd0
Update lld/MachO/Driver.cpp
johnno1962 Jul 30, 2025
0af2bde
Update lld/MachO/Options.td
johnno1962 Jul 30, 2025
bb91c53
Fifth review followups.
johnno1962 Jul 30, 2025
432fb04
Headers no longer used
johnno1962 Aug 6, 2025
30b8c13
Threads becomes workers.
johnno1962 Aug 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lld/MachO/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ struct Configuration {
bool interposable = false;
bool errorForArchMismatch = false;
bool ignoreAutoLink = false;
// Worker budget for proactively paging input files into memory before they
// are parsed (set from --read-threads=N). 0, the default, disables the
// multi-threaded read feature entirely.
int readThreads = 0;
// ld64 allows invalid auto link options as long as the link succeeds. LLD
// does not, but there are cases in the wild where the invalid linker options
// exist. This allows users to ignore the specific invalid options in the case
Expand Down
185 changes: 175 additions & 10 deletions lld/MachO/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "lld/Common/Reproduce.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MachO.h"
Expand All @@ -41,11 +42,14 @@
#include "llvm/Object/Archive.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/TargetParser/Host.h"
#include "llvm/TextAPI/Architecture.h"
Expand Down Expand Up @@ -282,11 +286,122 @@ static void saveThinArchiveToRepro(ArchiveFile const *file) {
": Archive::children failed: " + toString(std::move(e)));
}

static InputFile *addFile(StringRef path, LoadType loadType,
bool isLazy = false, bool isExplicit = true,
bool isBundleLoader = false,
bool isForceHidden = false) {
std::optional<MemoryBufferRef> buffer = readFile(path);
// An input file that has been mapped (its MemoryBufferRef is live) but whose
// parsing is postponed so its pages can first be touched in the background.
struct DeferredFile {
  DeferredFile(StringRef path, bool isLazy, MemoryBufferRef buffer)
      : path(path), isLazy(isLazy), buffer(buffer) {}

  StringRef path;         // Path the file was loaded from.
  bool isLazy;            // Whether the file was seen inside --start-lib.
  MemoryBufferRef buffer; // Mapped, not-yet-parsed file contents.
};
using DeferredFiles = std::vector<DeferredFile>;

// Executes queued work items one at a time on a single background thread.
// queueWork(work, /*reap=*/false) enqueues `work` and (re)starts a worker
// thread if none is currently draining the queue. queueWork(_, /*reap=*/true)
// joins the worker, blocking until all previously queued work has finished;
// the `work` argument is ignored in that case.
class SerialBackgroundQueue {
  std::deque<std::function<void()>> queue;
  // The draining thread, when one is alive. NOTE: the previous revision kept
  // a raw, *uninitialized* `std::thread *` here — undefined behavior for any
  // instance that is not zero-initialized static storage — and managed it
  // with bare new/delete. std::optional fixes both.
  std::optional<std::thread> running;
  std::mutex mutex;

public:
  void queueWork(std::function<void()> work, bool reap) {
    std::unique_lock<std::mutex> lock(mutex);
    // Reap the worker when explicitly asked to, or when it has already
    // drained the queue (it will be exiting, so a fresh one must be started
    // for new work).
    if (running && (queue.empty() || reap)) {
      // Never hold the lock across join(): the worker needs the mutex to
      // finish its loop, so joining while locked would deadlock.
      lock.unlock();
      running->join();
      lock.lock();
      running.reset();
    }
    if (reap)
      return;
    queue.emplace_back(std::move(work));
    if (!running)
      running.emplace([this] {
        // Pop lazily, one iteration late, so the front element stays in the
        // queue while it is being executed; this keeps queue.empty() false
        // and stops queueWork() from reaping a still-busy worker.
        bool shouldPop = false;
        while (true) {
          std::unique_lock<std::mutex> lock(mutex);
          if (shouldPop)
            queue.pop_front();
          if (queue.empty())
            break;
          std::function<void()> task = std::move(queue.front());
          shouldPop = true;
          lock.unlock();
          task(); // Run the work item outside the lock.
        }
      });
  }
};

#ifndef NDEBUG
#include <iomanip>
#include <iostream>
#endif
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think having these includes here goes against the LLVM coding standards: https://llvm.org/docs/CodingStandards.html#include-style

They should be at the top.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed the includes and the std::setprecision(4) that required them. Guess I'm just going to have to view times in scientific notation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the record, I don't think they need to be removed and you can still use setprecision, but the includes should not be in the middle of the file.


// Most input files have been mapped but not yet paged in.
// This code forces the page-ins on multiple threads so
// the process is not stalled waiting on disk buffer i/o.
void multiThreadedPageInBackground(DeferredFiles &deferred) {
using namespace std::chrono;
static const size_t pageSize = Process::getPageSizeEstimate();
static const size_t largeArchive = 10 * 1024 * 1024;
std::atomic_int index = 0;
#ifndef NDEBUG
std::atomic_int numDeferedFilesTouched = 0;
static std::atomic_uint64_t totalBytes = 0;
auto t0 = high_resolution_clock::now();
#endif

parallelFor(0, config->readThreads, [&](size_t I) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of how you are using parallelFor, config->readThreads is only the maximum number of threads that might be spawn for this, not the exact number of threads, which can be lower, governed by the -threads parameter. parallelFor uses llvm::parallel::strategy internally to decide the actual number of threads, which is setup globally when the driver finds the -threads argument.

Using parallelFor as its authors intended will also avoid the need of the index variable and keeping track of it. I provided a snippet before of how I would switch the strategy to one that fits your idea of having a different number of threads for reading files. It should be in some old comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my other comment. I'd missed yours. I'm only following the benchmarks as this is the thrust of this PR.

Copy link
Contributor Author

@johnno1962 johnno1962 Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are we on this PR? I've removed my benchmarking statements for a release build and looked for pre-existing abstraction for queuing work and found BackgroundQueue which looked promising but it's part of clangd. What do you see as being the blockers now (assuming you're still interested)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is probably good code, but being hidden away in Clangd makes it difficult to reuse. There might be ideas in there than can be copied, though, or moved into a common part of the LLVM source to be usable from both places. Not necessary if you don't want to, but all the concurrency code is making this review more complicated than it should because we are not using proven pieces of code.

I still think that using parallelFor(0, config->readThreads, …) is incorrect, and while it will do config->readThreads "chunks" of work, it will not necessarily do them in readThread number of threads (it can be less).

Either the code changes if you really want to have readThread number of threads, or the code changes to iterate over all the deferred files and let the parallel algorithm do its work chunking into pieces. In any case, the code should change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get that parallelFor may not use exactly readThreads threads — the limit in practice being the number of CPUs on the host, which is the default value for the -threads option. My point is that the precise number of threads doesn't actually matter, as long as more than one is tickling input files into memory. The code using parallelFor I at least understand, and it is performant.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If readThreads no longer represents the exact number of threads used to read, maybe it deserves a different name, argument name, and help text. If this is going to use the -threads value anyway, why does one need anything more than a -parallelize-input-preload boolean flag, using deferred.size() as the value? And if one is going to use deferred.size(), why not use parallelFor iterating over deferred and remove a bunch of code that handles the indices manually?

PS: it can be less performant than this version, but I will argue that I would prefer lower performance and easier-to-maintain code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to take some time to think about how to reply. I'm still not sure you understand how I need this code to work with respect to threading. I'll explain tomorrow. We're so very nearly there if you ask me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@drodriguez, it seems to me we're orbiting on two related remaining disagreements: the use of parallelFor and the type of option (boolean vs. the approximate number of thread for proactive paging).

Let me unpack the first decision as I am convinced the code is following my understanding of the requirement and is not that difficult to understand. The requirement is to perform a large number of very simple operations efficiently. To use an analogy we have a supermarket where 8000 people want to buy a toothbrush. The solution is not to have 8000 tills (threads) nor is it to use 8000 different cashiers opening and closing the till for each request (running up a thread for each operation). The most efficient approach is to have a limited number of tills and have each customer take a numbered ticket as they arrive. These customers are called one at a time on the basis of the index like a post office. If supermarkets worked this way you wouldn't have the stress of deciding which till to join ahead of time nor would the process have to take any longer than absolutely necessary. This is my understanding of the use of parallelFor and the atomics. It is efficient because the overhead of a mutex or atomic metering the allocation of each ticket to a til/queue/thread is extremely low and is not critically dependant on the number of threads.

There may be some algorithm somewhere in llvm that does exactly this but I'd rather have control as when we tried to delegate thread management before it was 50% slower and not, I believe, any easier to understand.

Any chance we can reach agreement on this before we move onto the exact nature of the option? My position there is that I would prefer not to recycle an existing option that does who knows what. Being able to specify the number of threads independently for this feature is a feature and gives us a value for the maximum number of threads parallelFor will use. I'm about to be away from my computer for two weeks and I'd really appreciate it if we could get nearer to landing this PR in case it develops a conflict.

Copy link
Contributor

@drodriguez drodriguez Aug 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to use this approach to parallelFor because you really think it is more efficient in every case, I think the documentation should be modified. When one is passing --read-threads=X, I would expect X threads, but the current implementation can do less threads (because -threads is lower). If you want to keep this approach, the argument name should not lie about what it represent (call it "workers", call it "parallelism", call it whatever except "threads").

auto preloadDeferredFile = [&](const DeferredFile &deferredFile) {
  const StringRef &buff = deferred[localIndex].buffer.getBuffer();
  if (buff.size() > largeArchive)
     continue;
#ifndef NDEBUG
  totalBytes += buff.size();
  numDeferedFilesTouched += 1;
#endif

  // Reference all file's mmap'd pages to load them into memory.
  for (const char *page = buff.data(), *end = page + buff.size();
      page < end; page += pageSize)
    LLVM_ATTRIBUTE_UNUSED volatile char t = *page;
};
#if LLVM_ENABLE_THREADS
{ // Create scope for waiting for the taskGroup
  std::atomic_size_t index = 0;
  llvm::parallel::TaskGroup taskGroup;
  for (int w = 0; w < config->readWorkers; w++)
    taskGroup.spawn([&index, &preloadDeferredFile, &deferred]() {
      while (true) {
        size_t localIndex = index.fetch_add(1);
        if (localIndex >= deferred.size())
          break;
        preloadDeferredFile(deferred[localIndex]);
      }
    });
}
#else
// not sure if you want to preload in this case
// for (const DeferredFile &deferredFile: deferredFile) {
//   preloadDeferredFile(deferredFile);
// }
#endif

It is more complicated code, mostly because one has to deal with the possibility that LLVM is being compiled without threads support, but by not using parallelFor we are indicating whoever reads this code that this is not a simple parallelFor in which each item is processed individually. We are not abusing parallelFor to build our own logic on top.

But, this is my opinion, if any other reviewer says "I am fine with this usage of parallelFor" feel free to ignore me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks very much for your code, it helps me read your mind. I've pushed another commit which also renames the option to --read-workers instead of --read-threads. See what you think!

while (true) {
int localIndex = index.fetch_add(1);
if (localIndex >= (int)deferred.size())
break;
const StringRef &buff = deferred[localIndex].buffer.getBuffer();
if (buff.size() > largeArchive)
continue;
#ifndef NDEBUG
totalBytes += buff.size();
numDeferedFilesTouched += 1;
#endif

// Reference all file's mmap'd pages to load them into memory.
for (const char *page = buff.data(), *end = page + buff.size();
page < end; page += pageSize)
LLVM_ATTRIBUTE_UNUSED volatile char t = *page;
}
});

#ifndef NDEBUG
auto dt = high_resolution_clock::now() - t0;
if (Process::GetEnv("LLD_MULTI_THREAD_PAGE"))
std::cerr << "multiThreadedPageIn " << totalBytes << "/"
<< numDeferedFilesTouched << "/" << deferred.size() << "/"
<< std::setprecision(4)
<< duration_cast<milliseconds>(dt).count() / 1000. << "\n";
#endif
}

// Queue a background page-in of `deferred`. With reap=true, instead block
// until all previously queued page-in work has completed.
static void multiThreadedPageIn(const DeferredFiles &deferred,
                                bool reap = false) {
  static SerialBackgroundQueue pageInQueue;
  // Init-capture copies the vector exactly once; the previous [=] capture
  // copied it into the closure and then again into a local because
  // multiThreadedPageInBackground takes a non-const reference. `mutable`
  // lets the closure's copy be passed as that mutable reference.
  pageInQueue.queueWork(
      [files = deferred]() mutable { multiThreadedPageInBackground(files); },
      reap);
}

static InputFile *processFile(std::optional<MemoryBufferRef> buffer,
DeferredFiles *archiveContents, StringRef path,
LoadType loadType, bool isLazy = false,
bool isExplicit = true,
bool isBundleLoader = false,
bool isForceHidden = false) {
if (!buffer)
return nullptr;
MemoryBufferRef mbref = *buffer;
Expand Down Expand Up @@ -379,6 +494,8 @@ static InputFile *addFile(StringRef path, LoadType loadType,
continue;
}

if (archiveContents)
archiveContents->emplace_back(path, isLazy, *mb);
if (!hasObjCSection(*mb))
continue;
if (Error e = file->fetch(c, "-ObjC"))
Expand All @@ -390,7 +507,8 @@ static InputFile *addFile(StringRef path, LoadType loadType,
": Archive::children failed: " + toString(std::move(e)));
}
}
file->addLazySymbols();
if (!archiveContents || archiveContents->empty())
file->addLazySymbols();
loadedArchives[path] = ArchiveFileInfo{file, isCommandLineLoad};
newFile = file;
break;
Expand Down Expand Up @@ -441,6 +559,24 @@ static InputFile *addFile(StringRef path, LoadType loadType,
return newFile;
}

// Read `path` and process it immediately, without deferring page-in or
// collecting archive members (archiveContents is null).
static InputFile *addFile(StringRef path, LoadType loadType,
                          bool isLazy = false, bool isExplicit = true,
                          bool isBundleLoader = false,
                          bool isForceHidden = false) {
  std::optional<MemoryBufferRef> buffer = readFile(path);
  return processFile(buffer, /*archiveContents=*/nullptr, path, loadType,
                     isLazy, isExplicit, isBundleLoader, isForceHidden);
}

// Map `path` and either stash it on `deferred` for background page-in
// (when --read-threads is active) or process it inline right away.
// Unreadable files are silently dropped (readFile already diagnosed them).
static void deferFile(StringRef path, bool isLazy, DeferredFiles &deferred) {
  if (std::optional<MemoryBufferRef> buffer = readFile(path)) {
    if (config->readThreads == 0)
      processFile(buffer, /*archiveContents=*/nullptr, path,
                  LoadType::CommandLine, isLazy);
    else
      deferred.emplace_back(path, isLazy, *buffer);
  }
}

static std::vector<StringRef> missingAutolinkWarnings;
static void addLibrary(StringRef name, bool isNeeded, bool isWeak,
bool isReexport, bool isHidden, bool isExplicit,
Expand Down Expand Up @@ -564,13 +700,14 @@ void macho::resolveLCLinkerOptions() {
}
}

static void addFileList(StringRef path, bool isLazy) {
static void addFileList(StringRef path, bool isLazy,
DeferredFiles &deferredFiles) {
std::optional<MemoryBufferRef> buffer = readFile(path);
if (!buffer)
return;
MemoryBufferRef mbref = *buffer;
for (StringRef path : args::getLines(mbref))
addFile(rerootPath(path), LoadType::CommandLine, isLazy);
deferFile(rerootPath(path), isLazy, deferredFiles);
}

// We expect sub-library names of the form "libfoo", which will match a dylib
Expand Down Expand Up @@ -1222,14 +1359,16 @@ static void createFiles(const InputArgList &args) {
bool isLazy = false;
// If we've processed an opening --start-lib, without a matching --end-lib
bool inLib = false;
DeferredFiles deferredFiles;

for (const Arg *arg : args) {
const Option &opt = arg->getOption();
warnIfDeprecatedOption(opt);
warnIfUnimplementedOption(opt);

switch (opt.getID()) {
case OPT_INPUT:
addFile(rerootPath(arg->getValue()), LoadType::CommandLine, isLazy);
deferFile(rerootPath(arg->getValue()), isLazy, deferredFiles);
break;
case OPT_needed_library:
if (auto *dylibFile = dyn_cast_or_null<DylibFile>(
Expand All @@ -1249,7 +1388,7 @@ static void createFiles(const InputArgList &args) {
dylibFile->forceWeakImport = true;
break;
case OPT_filelist:
addFileList(arg->getValue(), isLazy);
addFileList(arg->getValue(), isLazy, deferredFiles);
break;
case OPT_force_load:
addFile(rerootPath(arg->getValue()), LoadType::CommandLineForce);
Expand Down Expand Up @@ -1295,6 +1434,24 @@ static void createFiles(const InputArgList &args) {
break;
}
}

if (config->readThreads) {
  // Kick off background page-in of the command-line inputs, then parse them.
  multiThreadedPageIn(deferredFiles);

  DeferredFiles archiveContents;
  std::vector<ArchiveFile *> archives;
  for (auto &file : deferredFiles) {
    auto inputFile = processFile(file.buffer, &archiveContents, file.path,
                                 LoadType::CommandLine, file.isLazy);
    // processFile can return null; dyn_cast on a null pointer is UB, so use
    // dyn_cast_or_null here.
    if (ArchiveFile *archive = dyn_cast_or_null<ArchiveFile>(inputFile))
      archives.push_back(archive);
  }

  // Archive members discovered above get their own page-in pass before
  // their lazy symbols are registered.
  if (!archiveContents.empty())
    multiThreadedPageIn(archiveContents);
  for (auto *archive : archives)
    archive->addLazySymbols();
}
}

static void gatherInputSections() {
Expand Down Expand Up @@ -1687,6 +1844,14 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
}
}

if (auto *arg = args.getLastArg(OPT_read_threads)) {
  StringRef v(arg->getValue());
  unsigned threads = 0;
  // `threads` is unsigned, so the old `threads < 0` clause was always false
  // (to_integer into an unsigned already rejects negative input). 0 is a
  // documented, valid value ("use 0 to disable"), so the diagnostic asks for
  // a non-negative integer rather than a positive one.
  if (!llvm::to_integer(v, threads, 0))
    error(arg->getSpelling() + ": expected a non-negative integer, but got '" +
          arg->getValue() + "'");
  config->readThreads = threads;
}
if (auto *arg = args.getLastArg(OPT_threads_eq)) {
StringRef v(arg->getValue());
unsigned threads = 0;
Expand Down
3 changes: 3 additions & 0 deletions lld/MachO/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,9 @@ def dead_strip : Flag<["-"], "dead_strip">,
def interposable : Flag<["-"], "interposable">,
HelpText<"Indirects access to all exported symbols in an image">,
Group<grp_opts>;
// Upper bound on parallelism for proactively paging input files into memory
// before parsing; 0 disables the feature.
// NOTE(review): per the PR discussion, the actual thread count may be lower
// than requested (it is also governed by the global -threads setting), and a
// later commit reportedly renames this to --read-workers — confirm before
// documenting externally.
def read_threads : Joined<["--"], "read-threads=">,
HelpText<"Number of threads to use to proactively page in files for faster disk i/o. 20 would be a typical value, use 0 to disable this feature.">,
Group<grp_lld>;
def order_file : Separate<["-"], "order_file">,
MetaVarName<"<file>">,
HelpText<"Layout functions and data according to specification in <file>">,
Expand Down