-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[lld][MachO]Multi-threaded i/o. Twice as fast linking a large project. #147134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 29 commits
c55b5b2
3d11a33
02fb145
a8eeead
55e26a8
817036b
c07e168
eb4827c
ce93ae3
c47e5c3
5caf5a6
890c492
9714785
85fd77f
6f5f7cb
e3e0369
febf5a9
a5f7a42
6b874b2
84154d4
ed9f07e
ff732ed
ef23af2
4bf74e8
ed9bdb7
4cab9be
f7c8008
a6dd0bc
cf5c3fb
5901e7d
39cffd0
0af2bde
bb91c53
432fb04
30b8c13
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
#include "lld/Common/Reproduce.h" | ||
#include "lld/Common/Version.h" | ||
#include "llvm/ADT/DenseSet.h" | ||
#include "llvm/ADT/ScopeExit.h" | ||
#include "llvm/ADT/StringExtras.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/BinaryFormat/MachO.h" | ||
|
@@ -41,11 +42,14 @@ | |
#include "llvm/Object/Archive.h" | ||
#include "llvm/Option/ArgList.h" | ||
#include "llvm/Support/CommandLine.h" | ||
#include "llvm/Support/Debug.h" | ||
#include "llvm/Support/FileSystem.h" | ||
#include "llvm/Support/Parallel.h" | ||
#include "llvm/Support/Path.h" | ||
#include "llvm/Support/Process.h" | ||
#include "llvm/Support/TarWriter.h" | ||
#include "llvm/Support/TargetSelect.h" | ||
#include "llvm/Support/Threading.h" | ||
#include "llvm/Support/TimeProfiler.h" | ||
#include "llvm/TargetParser/Host.h" | ||
#include "llvm/TextAPI/Architecture.h" | ||
|
@@ -282,11 +286,122 @@ static void saveThinArchiveToRepro(ArchiveFile const *file) { | |
": Archive::children failed: " + toString(std::move(e))); | ||
} | ||
|
||
static InputFile *addFile(StringRef path, LoadType loadType, | ||
bool isLazy = false, bool isExplicit = true, | ||
bool isBundleLoader = false, | ||
bool isForceHidden = false) { | ||
std::optional<MemoryBufferRef> buffer = readFile(path); | ||
class DeferredFile { | ||
public: | ||
DeferredFile(StringRef path, bool isLazy, MemoryBufferRef buffer) | ||
: path(path), isLazy(isLazy), buffer(buffer) {} | ||
StringRef path; | ||
bool isLazy; | ||
MemoryBufferRef buffer; | ||
}; | ||
johnno1962 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
using DeferredFiles = std::vector<DeferredFile>; | ||
|
||
// Runs queued closures one at a time, in FIFO order, on a single background
// thread. The thread is started on demand and exits once the queue is empty.
//
// NOTE(review): queueWork() appears to be intended for use from a single
// producer thread; two concurrent callers could both try to join the same
// worker. Confirm against the callers.
class SerialBackgroundQueue {
  std::deque<std::function<void()>> queue;
  // Worker thread, or nullptr when none is active. Initialised explicitly so
  // the class does not rely on static zero-initialisation of its instances.
  std::thread *running = nullptr;
  // Guards `queue`.
  std::mutex mutex;

  // Body of the worker thread: execute queued work until the queue drains.
  // The item being executed stays at the front of the queue until it has
  // finished, so an empty queue means the worker is done (or about to exit).
  void drain() {
    std::unique_lock<std::mutex> lock(mutex);
    while (!queue.empty()) {
      std::function<void()> task = std::move(queue.front());
      lock.unlock();
      task();
      lock.lock();
      queue.pop_front();
    }
  }

public:
  // Enqueues `work` to run after everything queued before it.
  //
  // If `reap` is true, `work` is ignored: the call only waits for the worker
  // thread (if any) to finish everything already queued and then releases it.
  void queueWork(std::function<void()> work, bool reap) {
    std::unique_lock<std::mutex> lock(mutex);

    // Join a worker that has finished (empty queue) or that we were asked to
    // reap. The lock is dropped while joining so the worker can keep popping.
    if (running && (queue.empty() || reap)) {
      lock.unlock();
      running->join();
      lock.lock();
      delete running;
      running = nullptr;
    }

    if (reap)
      return;

    queue.emplace_back(std::move(work));
    if (!running)
      running = new std::thread([this] { drain(); });
  }
};
|
||
#ifndef NDEBUG | ||
#include <iomanip> | ||
#include <iostream> | ||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think having these includes here goes against the LLVM coding standards: https://llvm.org/docs/CodingStandards.html#include-style They should be at the top. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've removed the includes and the std::setprecision(4) that required them. Guess I'm just going to have to view times in scientific notation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the record, I don't think they need to be removed and you can still use |
||
|
||
// Most input files have been mapped but not yet paged in. | ||
// This code forces the page-ins on multiple threads so | ||
// the process is not stalled waiting on disk buffer i/o. | ||
void multiThreadedPageInBackground(DeferredFiles &deferred) { | ||
using namespace std::chrono; | ||
static const size_t pageSize = Process::getPageSizeEstimate(); | ||
static const size_t largeArchive = 10 * 1024 * 1024; | ||
std::atomic_int index = 0; | ||
#ifndef NDEBUG | ||
std::atomic_int numDeferedFilesTouched = 0; | ||
static std::atomic_uint64_t totalBytes = 0; | ||
auto t0 = high_resolution_clock::now(); | ||
#endif | ||
|
||
parallelFor(0, config->readThreads, [&](size_t I) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because of how you are using Using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See my other comment. I'd missed yours. I'm only following the benchmarks as this is the thrust of this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where are we on this PR? I've removed my benchmarking statements for a release build and looked for pre-existing abstraction for queuing work and found BackgroundQueue which looked promising but it's part of clangd. What do you see as being the blockers now (assuming you're still interested)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is probably good code, but being hidden away in Clangd makes it difficult to reuse. There might be ideas in there than can be copied, though, or moved into a common part of the LLVM source to be usable from both places. Not necessary if you don't want to, but all the concurrency code is making this review more complicated than it should because we are not using proven pieces of code. I still think that using Either the code changes if you really want to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get that paralellFor may not use exactly readThreads threads the limit in practice being the number of CPUs on the host which is the default value for the -threads option. My point is it doesn't actually matter the precise number of threads as long as there are more than one tickling input flies into memory. 
The code using parallelFor at least I understand and it is performant. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If PD: it can be less performant than this version, but I will argue that I would prefer less performance and easier code to maintain. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm going to take some time to think about how to reply. I'm still not sure you understand how I need this code to work with respect to threading. I'll explain tomorrow. We're so very nearly there if you ask me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @drodriguez, it seems to me we're orbiting on two related remaining disagreements: the use of parallelFor and the type of option (boolean vs. the approximate number of threads for proactive paging). Let me unpack the first decision as I am convinced the code is following my understanding of the requirement and is not that difficult to understand. The requirement is to perform a large number of very simple operations efficiently. To use an analogy we have a supermarket where 8000 people want to buy a toothbrush. The solution is not to have 8000 tills (threads) nor is it to use 8000 different cashiers opening and closing the till for each request (running up a thread for each operation). The most efficient approach is to have a limited number of tills and have each customer take a numbered ticket as they arrive. These customers are called one at a time on the basis of the index like a post office. If supermarkets worked this way you wouldn't have the stress of deciding which till to join ahead of time nor would the process have to take any longer than absolutely necessary. 
This is my understanding of the use of parallelFor and the atomics. It is efficient because the overhead of a mutex or atomic metering the allocation of each ticket to a till/queue/thread is extremely low and is not critically dependent on the number of threads. There may be some algorithm somewhere in llvm that does exactly this but I'd rather have control as when we tried to delegate thread management before it was 50% slower and not, I believe, any easier to understand. Any chance we can reach agreement on this before we move on to the exact nature of the option? My position there is that I would prefer not to recycle an existing option that does who knows what. Being able to specify the number of threads independently for this feature is a feature and gives us a value for the maximum number of threads parallelFor will use. I'm about to be away from my computer for two weeks and I'd really appreciate it if we could get nearer to landing this PR in case it develops a conflict. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want to use this approach to
It is more complicated code, mostly because one has to deal with the possibility that LLVM is being compiled without threads support, but by not using But, this is my opinion, if any other reviewer says "I am fine with this usage of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks very much for your code, it helps me read your mind. I've pushed another commit which also renames the option to |
||
while (true) { | ||
int localIndex = index.fetch_add(1); | ||
if (localIndex >= (int)deferred.size()) | ||
break; | ||
const StringRef &buff = deferred[localIndex].buffer.getBuffer(); | ||
if (buff.size() > largeArchive) | ||
continue; | ||
#ifndef NDEBUG | ||
totalBytes += buff.size(); | ||
numDeferedFilesTouched += 1; | ||
#endif | ||
|
||
// Reference all file's mmap'd pages to load them into memory. | ||
for (const char *page = buff.data(), *end = page + buff.size(); | ||
page < end; page += pageSize) | ||
LLVM_ATTRIBUTE_UNUSED volatile char t = *page; | ||
} | ||
}); | ||
|
||
#ifndef NDEBUG | ||
auto dt = high_resolution_clock::now() - t0; | ||
if (Process::GetEnv("LLD_MULTI_THREAD_PAGE")) | ||
std::cerr << "multiThreadedPageIn " << totalBytes << "/" | ||
<< numDeferedFilesTouched << "/" << deferred.size() << "/" | ||
<< std::setprecision(4) | ||
<< duration_cast<milliseconds>(dt).count() / 1000. << "\n"; | ||
#endif | ||
} | ||
|
||
static void multiThreadedPageIn(const DeferredFiles &deferred, | ||
bool reap = false) { | ||
johnno1962 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
static SerialBackgroundQueue pageInQueue; | ||
pageInQueue.queueWork( | ||
[=]() { | ||
DeferredFiles files = deferred; | ||
multiThreadedPageInBackground(files); | ||
}, | ||
reap); | ||
} | ||
|
||
static InputFile *processFile(std::optional<MemoryBufferRef> buffer, | ||
DeferredFiles *archiveContents, StringRef path, | ||
LoadType loadType, bool isLazy = false, | ||
bool isExplicit = true, | ||
bool isBundleLoader = false, | ||
bool isForceHidden = false) { | ||
if (!buffer) | ||
return nullptr; | ||
MemoryBufferRef mbref = *buffer; | ||
|
@@ -379,6 +494,8 @@ static InputFile *addFile(StringRef path, LoadType loadType, | |
continue; | ||
} | ||
|
||
if (archiveContents) | ||
archiveContents->emplace_back(path, isLazy, *mb); | ||
if (!hasObjCSection(*mb)) | ||
continue; | ||
if (Error e = file->fetch(c, "-ObjC")) | ||
|
@@ -390,7 +507,8 @@ static InputFile *addFile(StringRef path, LoadType loadType, | |
": Archive::children failed: " + toString(std::move(e))); | ||
} | ||
} | ||
file->addLazySymbols(); | ||
if (!archiveContents || archiveContents->empty()) | ||
file->addLazySymbols(); | ||
loadedArchives[path] = ArchiveFileInfo{file, isCommandLineLoad}; | ||
newFile = file; | ||
break; | ||
|
@@ -441,6 +559,24 @@ static InputFile *addFile(StringRef path, LoadType loadType, | |
return newFile; | ||
} | ||
|
||
// Reads the file at `path` and processes it immediately via processFile().
// No archive-contents list is collected (nullptr is passed for it); all other
// arguments are forwarded unchanged. Returns whatever processFile() returns.
static InputFile *addFile(StringRef path, LoadType loadType,
                          bool isLazy = false, bool isExplicit = true,
                          bool isBundleLoader = false,
                          bool isForceHidden = false) {
  std::optional<MemoryBufferRef> contents = readFile(path);
  return processFile(contents, /*archiveContents=*/nullptr, path, loadType,
                     isLazy, isExplicit, isBundleLoader, isForceHidden);
}
|
||
// Reads the file at `path`. If it cannot be read, nothing happens. When
// config->readThreads is non-zero the file is appended to `deferred` for
// later processing; otherwise it is processed right away as a command-line
// input.
static void deferFile(StringRef path, bool isLazy, DeferredFiles &deferred) {
  std::optional<MemoryBufferRef> contents = readFile(path);
  if (!contents.has_value())
    return;

  if (!config->readThreads) {
    processFile(contents, nullptr, path, LoadType::CommandLine, isLazy);
    return;
  }

  deferred.emplace_back(path, isLazy, *contents);
}
|
||
static std::vector<StringRef> missingAutolinkWarnings; | ||
static void addLibrary(StringRef name, bool isNeeded, bool isWeak, | ||
bool isReexport, bool isHidden, bool isExplicit, | ||
|
@@ -564,13 +700,14 @@ void macho::resolveLCLinkerOptions() { | |
} | ||
} | ||
|
||
static void addFileList(StringRef path, bool isLazy) { | ||
static void addFileList(StringRef path, bool isLazy, | ||
DeferredFiles &deferredFiles) { | ||
std::optional<MemoryBufferRef> buffer = readFile(path); | ||
if (!buffer) | ||
return; | ||
MemoryBufferRef mbref = *buffer; | ||
for (StringRef path : args::getLines(mbref)) | ||
addFile(rerootPath(path), LoadType::CommandLine, isLazy); | ||
deferFile(rerootPath(path), isLazy, deferredFiles); | ||
} | ||
|
||
// We expect sub-library names of the form "libfoo", which will match a dylib | ||
|
@@ -1222,14 +1359,16 @@ static void createFiles(const InputArgList &args) { | |
bool isLazy = false; | ||
// If we've processed an opening --start-lib, without a matching --end-lib | ||
bool inLib = false; | ||
DeferredFiles deferredFiles; | ||
|
||
for (const Arg *arg : args) { | ||
const Option &opt = arg->getOption(); | ||
warnIfDeprecatedOption(opt); | ||
warnIfUnimplementedOption(opt); | ||
|
||
switch (opt.getID()) { | ||
case OPT_INPUT: | ||
addFile(rerootPath(arg->getValue()), LoadType::CommandLine, isLazy); | ||
deferFile(rerootPath(arg->getValue()), isLazy, deferredFiles); | ||
break; | ||
case OPT_needed_library: | ||
if (auto *dylibFile = dyn_cast_or_null<DylibFile>( | ||
|
@@ -1249,7 +1388,7 @@ static void createFiles(const InputArgList &args) { | |
dylibFile->forceWeakImport = true; | ||
break; | ||
case OPT_filelist: | ||
addFileList(arg->getValue(), isLazy); | ||
addFileList(arg->getValue(), isLazy, deferredFiles); | ||
break; | ||
case OPT_force_load: | ||
addFile(rerootPath(arg->getValue()), LoadType::CommandLineForce); | ||
|
@@ -1295,6 +1434,24 @@ static void createFiles(const InputArgList &args) { | |
break; | ||
} | ||
} | ||
|
||
if (config->readThreads) { | ||
multiThreadedPageIn(deferredFiles); | ||
|
||
DeferredFiles archiveContents; | ||
std::vector<ArchiveFile *> archives; | ||
for (auto &file : deferredFiles) { | ||
auto inputFile = processFile(file.buffer, &archiveContents, file.path, | ||
LoadType::CommandLine, file.isLazy); | ||
if (ArchiveFile *archive = dyn_cast<ArchiveFile>(inputFile)) | ||
archives.push_back(archive); | ||
} | ||
|
||
if (!archiveContents.empty()) | ||
multiThreadedPageIn(archiveContents); | ||
for (auto *archive : archives) | ||
archive->addLazySymbols(); | ||
} | ||
} | ||
|
||
static void gatherInputSections() { | ||
|
@@ -1687,6 +1844,14 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS, | |
} | ||
} | ||
|
||
if (auto *arg = args.getLastArg(OPT_read_threads)) { | ||
StringRef v(arg->getValue()); | ||
unsigned threads = 0; | ||
if (!llvm::to_integer(v, threads, 0) || threads < 0) | ||
johnno1962 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
error(arg->getSpelling() + ": expected a positive integer, but got '" + | ||
arg->getValue() + "'"); | ||
config->readThreads = threads; | ||
} | ||
if (auto *arg = args.getLastArg(OPT_threads_eq)) { | ||
StringRef v(arg->getValue()); | ||
unsigned threads = 0; | ||
|
Uh oh!
There was an error while loading. Please reload this page.