-
Notifications
You must be signed in to change notification settings - Fork 15.4k
Add support for dynamic libraries in CLANG_BOLT #127020
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for dynamic libraries in CLANG_BOLT #127020
Conversation
|
@llvm/pr-subscribers-clang Author: None (serge-sans-paille) ChangesFull diff: https://github.com/llvm/llvm-project/pull/127020.diff 2 Files Affected:
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index ad336fcc45b60..10ea5de387220 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -23,10 +23,14 @@ if(CLANG_PLUGIN_SUPPORT)
set(support_plugins SUPPORT_PLUGINS)
endif()
+set(CLANG_BOLT_ALLOWLIST INSTRUMENT PERF LBR)
set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
- May be specified as Instrument or Perf or LBR to use a particular profiling \
+May be specified as one of ${CLANG_BOLT_ALLOWLIST} to use a particular profiling \
mechanism.")
string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
+if (CLANG_BOLT AND NOT CLANG_BOLT IN_LIST CLANG_BOLT_ALLOWLIST)
+ message(FATAL_ERROR "Specified CLANG_BOLT value '${CLANG_BOLT}' is not one of ${CLANG_BOLT_ALLOWLIST}.")
+endif()
if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj)
@@ -164,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
)
set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
+ set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
+ set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})
+
+ # Add in dynamically linked libraries, if needs be. Currently only supported
+ # on Linux because it relies on LD_PRELOAD for instrumentation.
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ if (CLANG_LINK_CLANG_DYLIB)
+ set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented Clang library")
+ set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
+ list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
+ list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
+ endif()
+ if (LLVM_LINK_LLVM_DYLIB)
+ set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented LLVM library")
+ set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
+ list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
+ list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
+ endif()
+ endif()
+
# This POST_BUILD command is executed unconditionally even if the clang target
# is already built. We need to wrap the whole bolt optimization process in
# a single python wrapper, so that we can first check if the binary has
@@ -172,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
TARGET clang POST_BUILD
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
bolt-optimize
- --method ${CLANG_BOLT}
- --input $<TARGET_FILE:clang>
- --instrumented-output ${CLANG_INSTRUMENTED}
- --fdata ${BOLT_FDATA}
- --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
- --readelf $<TARGET_FILE:llvm-readobj>
- --bolt $<TARGET_FILE:llvm-bolt>
- --lit "${LIT_COMMAND}"
- --merge-fdata $<TARGET_FILE:merge-fdata>
+ --method ${CLANG_BOLT}
+ --input "${CLANG_BOLT_INPUTS}"
+ --instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
+ --fdata ${BOLT_FDATA}
+ --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
+ --readelf $<TARGET_FILE:llvm-readobj>
+ --bolt $<TARGET_FILE:llvm-bolt>
+ --lit "${LIT_COMMAND}"
+ --merge-fdata $<TARGET_FILE:merge-fdata>
COMMENT "Optimizing Clang with BOLT"
USES_TERMINAL
VERBATIM
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 55c5160a71c4f..ea32ef216bcaa 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -559,6 +559,22 @@ def genOrderFile(args):
return 0
+def filter_bolt_optimized(inputs, instrumented_outputs)
+ new_inputs = []
+ new_instrumented_ouputs = []
+ for input, instrumented_output in zip(inputs, instrumented_outputs):
+ output = subprocess.check_output(
+ [opts.readelf, "-WS", input], universal_newlines=True
+ )
+
+ # This binary has already been bolt-optimized, so skip further processing.
+ if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+ print(f"Skipping {input}, it's already instrumented")
+ else:
+ new_inputs.append(input)
+ new_instrumented_ouputs.append(instrumented_output)
+ return new_inputs, new_instrumented_ouputs
+
def bolt_optimize(args):
parser = argparse.ArgumentParser("%prog [options] ")
@@ -574,47 +590,66 @@ def bolt_optimize(args):
opts = parser.parse_args(args)
- output = subprocess.check_output(
- [opts.readelf, "-WS", opts.input], universal_newlines=True
- )
+ inputs = opts.input.split(';')
+ instrumented_outputs = opts.instrumented_output.split(';')
+ assert len(inputs) == len(instrumented_outputs), "inconsistent --input / --instrumented-output arguments"
- # This binary has already been bolt-optimized, so skip further processing.
- if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+ inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs)
+ if not inputs:
return 0
+ environ = os.environ.copy()
if opts.method == "INSTRUMENT":
- process = subprocess.run(
- [
- opts.bolt,
- opts.input,
- "-o",
- opts.instrumented_output,
- "-instrument",
- "--instrumentation-file-append-pid",
- f"--instrumentation-file={opts.fdata}",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
+ preloads = []
+ for input, instrumented_output in zip(inputs, instrumented_outputs):
+ args = [
+ opts.bolt,
+ input,
+ "-o",
+ instrumented_output,
+ "-instrument",
+ "--instrumentation-file-append-pid",
+ f"--instrumentation-file={opts.fdata}",
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ )
- print(process.args)
- for line in process.stdout:
- sys.stdout.write(line)
- process.check_returncode()
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
- process = subprocess.run(
- [
+ output = subprocess.check_output(
+ [opts.readelf, "--file-header", input], universal_newlines=True
+ )
+ if re.search(r"Type:\s*((Shared)|(DYN))", output):
+ # force using the instrumented version
+ preloads.append(instrumented_output)
+
+ if preloads:
+ print("Patching execution environment for dynamic library")
+ environ["LD_PRELOAD"] = os.pathsep.join(preloads)
+
+
+ args = [
sys.executable,
opts.lit,
- os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
- ],
+ "-v",
+ os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
+ env=environ,
)
- print(process.args)
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()
@@ -624,35 +659,37 @@ def bolt_optimize(args):
merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
- shutil.copy(opts.input, f"{opts.input}-prebolt")
+ for input in inputs:
+ shutil.copy(input, f"{input}-prebolt")
- process = subprocess.run(
- [
- opts.bolt,
- f"{opts.input}-prebolt",
- "-o",
- opts.input,
- "-data",
- opts.fdata,
- "-reorder-blocks=ext-tsp",
- "-reorder-functions=cdsort",
- "-split-functions",
- "-split-all-cold",
- "-split-eh",
- "-dyno-stats",
- "-use-gnu-stack",
- "-update-debug-sections",
- "-nl" if opts.method == "PERF" else "",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
+ args = [
+ opts.bolt,
+ f"{input}-prebolt",
+ "-o",
+ input,
+ "-data",
+ opts.fdata,
+ "-reorder-blocks=ext-tsp",
+ "-reorder-functions=cdsort",
+ "-split-functions",
+ "-split-all-cold",
+ "-split-eh",
+ "-dyno-stats",
+ "-use-gnu-stack",
+ "-update-debug-sections",
+ "-nl" if opts.method == "PERF" else "",
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ )
- print(process.args)
- for line in process.stdout:
- sys.stdout.write(line)
- process.check_returncode()
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
commands = {
|
|
cc @tbaederr , @nikic and @sylvestre , this could be useful when packaging clang optimized with bolt on Fedora or Debian |
eeeaa81 to
c3a9588
Compare
When linking clang with libLLVM and clang-cpp dynamically, bolt post processing only optimizes the clang binary. This patch makes sure it also instruments libLLVM and libclang-cpp, otherwise optimizing just the clang binary yields limited benefits. This currently only works on Linux due to reliance on LD_PRELOAD to have the instrumented binary use the instrumented shared libraries.
c3a9588 to
a701851
Compare
|
@tstellar : gentle ping :-) |
| # Add in dynamically linked libraries, if needs be. Currently only supported | ||
| # on Linux because it relies on LD_PRELOAD for instrumentation. | ||
| if (CMAKE_SYSTEM_NAME STREQUAL "Linux") | ||
| if (CLANG_LINK_CLANG_DYLIB) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why condition this on CLANG_LINK_CLANG_DYLIB here ? Do we only want to do the optimization if clang is linking against the shared library? Wouldn't library consumers benefit from these optimizations even if clang is linked statically.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In theory yes, but in this scenario, we would need another executable linking with that library to run, otherwise we can't gather the runtime information from the instrumented library.
| if preloads: | ||
| print("Patching execution environment for dynamic library") | ||
| environ["LD_PRELOAD"] = os.pathsep.join(preloads) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The instrumented binary is still linked to the non-instrumented libraries. LD_PRELOAD makes sure we use symbol from the instrumented libraries instead.
tstellar
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like the idea here. I wonder if this could be generalized somehow in add_llvm_library so that it could be applied to arbitrary libraries. I had wanted to try to generalize this for executable in add_llvm_executable.
I'm fine taking this patch as-is without generalizing it, but I'm curious if you think this would work as a follow up change in the future.
Thanks Tom for your review. Unfortunately this would require providing an executable to link to, and a relevant profiling scenario to gather instrumentation profile. That's very specific to each library, I don't think I want to do that now :-) |
|
@tstellar any more thoughts on this? |
tstellar
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/8657 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/113/builds/6145 Here is the relevant piece of the build log for the reference |
|
This commit has broken our CI. We do LTO-PGO-BOLT 3-stage build in our CI and this commit has caused the build failure. Reverting this commit brings the CI back to green. Please see the error below: |
|
@madhur13490 : can you share the full traceback? It's missing the actual exception. I'd appreciate if you'd share details concerning the system used too. thanks! |
|
@serge-sans-paille The build failure I am reporting is most likely what has already been reported on this issue, and the link to the buildbot log is also available. Reproducing this on any local machine should be easily possible by following the buildbot's recipe. E.g.buildbot |
|
Indeed. Looks like a small typo during refactoring, I'm testing the patch right now. |
Patch typo introduced in #127020
|
9db72e5 fixes the typo. Thanks @madhur13490 for the ping |
Simpler detection of dynamic library operands as the readelf one seems to be unreliable (works on my setup, not on buildbots). This is a follow-up to #127020
|
This is now fixed, as showcased by https://lab.llvm.org/buildbot/#/builders/113/builds/6165. Sorry for the inconvenience. |
No description provided.