Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/cmake/caches/BOLT.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Initial-cache entries for a Clang build that is post-processed with BOLT.
# Every entry is a STRING cache variable with an empty docstring.

# Release configuration; CLANG_BOLT selects the "INSTRUMENT" method.
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
set(CLANG_BOLT INSTRUMENT CACHE STRING "")

# Executables and shared libraries are linked with identical flags: the linker
# is asked to keep relocations in its output (--emit-relocs) and to resolve
# symbols eagerly (-z now).
# NOTE(review): presumably BOLT needs the relocations to rewrite the binaries;
# confirm against the BOLT documentation.
foreach(artifact_kind IN ITEMS EXE SHARED)
  set(CMAKE_${artifact_kind}_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
endforeach()

# Projects and targets to build.
set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
set(LLVM_TARGETS_TO_BUILD "Native" CACHE STRING "")
Expand Down
40 changes: 31 additions & 9 deletions clang/tools/driver/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
)
set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")

set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})

# Add in dynamically linked libraries, if needs be. Currently only supported
# on Linux because it relies on LD_PRELOAD for instrumentation.
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
if (CLANG_LINK_CLANG_DYLIB)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why condition this on CLANG_LINK_CLANG_DYLIB here? Do we only want to do the optimization if clang is linking against the shared library? Wouldn't library consumers benefit from these optimizations even if clang is linked statically?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory, yes. But in that scenario we would need another executable linked against that library to run; otherwise we can't gather the runtime information from the instrumented library.

set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
"Name of BOLT-instrumented Clang library")
set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
endif()
if (LLVM_LINK_LLVM_DYLIB)
set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
"Name of BOLT-instrumented LLVM library")
set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
endif()
endif()

# This POST_BUILD command is executed unconditionally even if the clang target
# is already built. We need to wrap the whole bolt optimization process in
# a single python wrapper, so that we can first check if the binary has
Expand All @@ -176,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
TARGET clang POST_BUILD
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
bolt-optimize
--method ${CLANG_BOLT}
--input $<TARGET_FILE:clang>
--instrumented-output ${CLANG_INSTRUMENTED}
--fdata ${BOLT_FDATA}
--perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
--readelf $<TARGET_FILE:llvm-readobj>
--bolt $<TARGET_FILE:llvm-bolt>
--lit "${LIT_COMMAND}"
--merge-fdata $<TARGET_FILE:merge-fdata>
--method ${CLANG_BOLT}
--input "${CLANG_BOLT_INPUTS}"
--instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
--fdata ${BOLT_FDATA}
--perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
--readelf $<TARGET_FILE:llvm-readobj>
--bolt $<TARGET_FILE:llvm-bolt>
--lit "${LIT_COMMAND}"
--merge-fdata $<TARGET_FILE:merge-fdata>
COMMENT "Optimizing Clang with BOLT"
USES_TERMINAL
VERBATIM
Expand Down
115 changes: 77 additions & 38 deletions clang/utils/perf-training/perf-helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,23 @@ def genOrderFile(args):
return 0


def filter_bolt_optimized(inputs, instrumented_outputs, readelf=None):
    """Drop the binaries that have already been rewritten by BOLT.

    Each entry of ``inputs`` is inspected with ``<readelf> -WS <input>``. An
    input whose section listing mentions ``.bolt.org.text`` is skipped,
    together with the entry at the same position in ``instrumented_outputs``.

    Args:
        inputs: paths of the binaries to inspect.
        instrumented_outputs: paths paired positionally with ``inputs``
            (pairs are formed with ``zip``).
        readelf: path of the readelf-compatible tool to run. When omitted,
            a module-level ``opts.readelf`` is used if one exists.

    Returns:
        A ``(inputs, instrumented_outputs)`` tuple of two new lists that hold
        only the pairs which still need processing.

    Raises:
        TypeError: no ``readelf`` was given and no module-level ``opts``
            exists to take it from.
        subprocess.CalledProcessError: the readelf tool exited with a
            non-zero status.
    """
    if readelf is None:
        # The original body read `opts.readelf`, but `opts` is a local of the
        # caller and is not visible here. Keep the global lookup for backward
        # compatibility and fail with an actionable message instead of a bare
        # NameError when it is missing.
        try:
            readelf = opts.readelf
        except NameError:
            raise TypeError(
                "filter_bolt_optimized() requires the readelf path; "
                "call it with readelf=opts.readelf"
            ) from None

    pending_inputs = []
    pending_instrumented_outputs = []
    for binary, instrumented_output in zip(inputs, instrumented_outputs):
        sections = subprocess.check_output(
            [readelf, "-WS", binary], universal_newlines=True
        )

        # This binary has already been bolt-optimized, so skip further
        # processing.
        if re.search(r"\.bolt\.org\.text", sections):
            print(f"Skipping {binary}, it's already BOLT-optimized")
            continue

        pending_inputs.append(binary)
        pending_instrumented_outputs.append(instrumented_output)
    return pending_inputs, pending_instrumented_outputs


def bolt_optimize(args):
parser = argparse.ArgumentParser("%prog [options] ")
parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"])
Expand All @@ -574,47 +591,67 @@ def bolt_optimize(args):

opts = parser.parse_args(args)

output = subprocess.check_output(
[opts.readelf, "-WS", opts.input], universal_newlines=True
)
inputs = opts.input.split(";")
instrumented_outputs = opts.instrumented_output.split(";")
assert len(inputs) == len(
instrumented_outputs
), "inconsistent --input / --instrumented-output arguments"

# This binary has already been bolt-optimized, so skip further processing.
if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs)
if not inputs:
return 0

environ = os.environ.copy()
if opts.method == "INSTRUMENT":
process = subprocess.run(
[
preloads = []
for input, instrumented_output in zip(inputs, instrumented_outputs):
args = [
opts.bolt,
opts.input,
input,
"-o",
opts.instrumented_output,
instrumented_output,
"-instrument",
"--instrumentation-file-append-pid",
f"--instrumentation-file={opts.fdata}",
],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
]
print("Running: " + " ".join(args))
process = subprocess.run(
args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)

print(process.args)
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()

output = subprocess.check_output(
[opts.readelf, "--file-header", input], universal_newlines=True
)
if re.search(r"Type:\s*((Shared)|(DYN))", output):
# force using the instrumented version
preloads.append(instrumented_output)

if preloads:
print("Patching execution environment for dynamic library")
environ["LD_PRELOAD"] = os.pathsep.join(preloads)
Comment on lines +636 to +638
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instrumented binary is still linked to the non-instrumented libraries. LD_PRELOAD makes sure we use symbols from the instrumented libraries instead.


args = [
sys.executable,
opts.lit,
"-v",
os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
]
print("Running: " + " ".join(args))
process = subprocess.run(
[
sys.executable,
opts.lit,
os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
],
args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=environ,
)

print(process.args)
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()
Expand All @@ -624,14 +661,14 @@ def bolt_optimize(args):

merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])

shutil.copy(opts.input, f"{opts.input}-prebolt")
for input in inputs:
shutil.copy(input, f"{input}-prebolt")

process = subprocess.run(
[
args = [
opts.bolt,
f"{opts.input}-prebolt",
f"{input}-prebolt",
"-o",
opts.input,
input,
"-data",
opts.fdata,
"-reorder-blocks=ext-tsp",
Expand All @@ -643,16 +680,18 @@ def bolt_optimize(args):
"-use-gnu-stack",
"-update-debug-sections",
"-nl" if opts.method == "PERF" else "",
],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
]
print("Running: " + " ".join(args))
process = subprocess.run(
args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)

print(process.args)
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()


commands = {
Expand Down