Skip to content

Commit eeeaa81

Browse files
[clang][bolt] Improve CLANG_BOLT setup to support shared libraries
When clang is linked dynamically against libLLVM and libclang-cpp, BOLT post-processing only optimizes the clang binary. This patch makes sure it also instruments libLLVM and libclang-cpp; otherwise, optimizing just the clang binary yields limited benefits. This currently works only on Linux because it relies on LD_PRELOAD to make the instrumented binary use the instrumented shared libraries.
1 parent c2d1352 commit eeeaa81

File tree

3 files changed

+124
-64
lines changed

3 files changed

+124
-64
lines changed

clang/cmake/caches/BOLT.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
22
set(CLANG_BOLT "INSTRUMENT" CACHE STRING "")
33
set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
4+
set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
45

56
set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
67
set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")

clang/tools/driver/CMakeLists.txt

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
168168
)
169169
set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
170170

171+
set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
172+
set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})
173+
174+
# Add in dynamically linked libraries, if need be. Currently only supported
175+
# on Linux because it relies on LD_PRELOAD for instrumentation.
176+
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
177+
if (CLANG_LINK_CLANG_DYLIB)
178+
set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
179+
"Name of BOLT-instrumented Clang library")
180+
set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
181+
list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
182+
list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
183+
endif()
184+
if (LLVM_LINK_LLVM_DYLIB)
185+
set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
186+
"Name of BOLT-instrumented LLVM library")
187+
set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
188+
list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
189+
list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
190+
endif()
191+
endif()
192+
171193
# This POST_BUILD command is executed unconditionally even if the clang target
172194
# is already built. We need to wrap the whole bolt optimization process in
173195
# a single python wrapper, so that we can first check if the binary has
@@ -176,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
176198
TARGET clang POST_BUILD
177199
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
178200
bolt-optimize
179-
--method ${CLANG_BOLT}
180-
--input $<TARGET_FILE:clang>
181-
--instrumented-output ${CLANG_INSTRUMENTED}
182-
--fdata ${BOLT_FDATA}
183-
--perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
184-
--readelf $<TARGET_FILE:llvm-readobj>
185-
--bolt $<TARGET_FILE:llvm-bolt>
186-
--lit "${LIT_COMMAND}"
187-
--merge-fdata $<TARGET_FILE:merge-fdata>
201+
--method ${CLANG_BOLT}
202+
--input "${CLANG_BOLT_INPUTS}"
203+
--instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
204+
--fdata ${BOLT_FDATA}
205+
--perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
206+
--readelf $<TARGET_FILE:llvm-readobj>
207+
--bolt $<TARGET_FILE:llvm-bolt>
208+
--lit "${LIT_COMMAND}"
209+
--merge-fdata $<TARGET_FILE:merge-fdata>
188210
COMMENT "Optimizing Clang with BOLT"
189211
USES_TERMINAL
190212
VERBATIM

clang/utils/perf-training/perf-helper.py

Lines changed: 92 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,22 @@ def genOrderFile(args):
559559

560560
return 0
561561

562+
def filter_bolt_optimized(inputs, instrumented_outputs, readelf=None):
    """Drop the binaries that have already been BOLT-optimized.

    For each input, runs ``<readelf> -WS <input>`` and searches the output
    for a ``.bolt.org.text`` entry. Inputs that match are reported on stdout
    and left out of the result, together with their paired instrumented
    output path.

    Args:
        inputs: paths of the binaries / shared libraries to inspect.
        instrumented_outputs: instrumented output paths, paired with
            ``inputs`` position by position.
        readelf: path of the readelf-compatible tool to run. When omitted,
            ``opts.readelf`` is looked up as the original code did.

    Returns:
        A ``(new_inputs, new_instrumented_outputs)`` tuple of two lists that
        contain only the entries still needing BOLT processing, in their
        original order.

    Raises:
        subprocess.CalledProcessError: if the readelf tool exits non-zero.
    """
    if readelf is None:
        # NOTE(review): the original body read `opts.readelf`, but `opts` is
        # neither a parameter nor a local of this function. The lookup is kept
        # only so that existing two-argument calls behave as before; callers
        # should pass `readelf=opts.readelf` explicitly.
        readelf = opts.readelf

    new_inputs = []
    new_instrumented_outputs = []
    for binary, instrumented_output in zip(inputs, instrumented_outputs):
        output = subprocess.check_output(
            [readelf, "-WS", binary], universal_newlines=True
        )

        # This binary has already been bolt-optimized, so skip further
        # processing.
        if re.search(r"\.bolt\.org\.text", output, re.MULTILINE):
            print(f"Skipping {binary}, it's already BOLT-optimized")
        else:
            new_inputs.append(binary)
            new_instrumented_outputs.append(instrumented_output)
    return new_inputs, new_instrumented_outputs
577+
562578

563579
def bolt_optimize(args):
564580
parser = argparse.ArgumentParser("%prog [options] ")
@@ -574,47 +590,66 @@ def bolt_optimize(args):
574590

575591
opts = parser.parse_args(args)
576592

577-
output = subprocess.check_output(
578-
[opts.readelf, "-WS", opts.input], universal_newlines=True
579-
)
593+
inputs = opts.input.split(';')
594+
instrumented_outputs = opts.instrumented_output.split(';')
595+
assert len(inputs) == len(instrumented_outputs), "inconsistent --input / --instrumented-output arguments"
580596

581-
# This binary has already been bolt-optimized, so skip further processing.
582-
if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
597+
inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs)
598+
if not inputs:
583599
return 0
584600

601+
environ = os.environ.copy()
585602
if opts.method == "INSTRUMENT":
586-
process = subprocess.run(
587-
[
588-
opts.bolt,
589-
opts.input,
590-
"-o",
591-
opts.instrumented_output,
592-
"-instrument",
593-
"--instrumentation-file-append-pid",
594-
f"--instrumentation-file={opts.fdata}",
595-
],
596-
stdout=subprocess.PIPE,
597-
stderr=subprocess.STDOUT,
598-
text=True,
599-
)
603+
preloads = []
604+
for input, instrumented_output in zip(inputs, instrumented_outputs):
605+
args = [
606+
opts.bolt,
607+
input,
608+
"-o",
609+
instrumented_output,
610+
"-instrument",
611+
"--instrumentation-file-append-pid",
612+
f"--instrumentation-file={opts.fdata}",
613+
]
614+
print("Running: " + " ".join(args))
615+
process = subprocess.run(
616+
args,
617+
stdout=subprocess.PIPE,
618+
stderr=subprocess.STDOUT,
619+
text=True,
620+
)
600621

601-
print(process.args)
602-
for line in process.stdout:
603-
sys.stdout.write(line)
604-
process.check_returncode()
622+
for line in process.stdout:
623+
sys.stdout.write(line)
624+
process.check_returncode()
605625

606-
process = subprocess.run(
607-
[
626+
output = subprocess.check_output(
627+
[opts.readelf, "--file-header", input], universal_newlines=True
628+
)
629+
if re.search(r"Type:\s*((Shared)|(DYN))", output):
630+
# force using the instrumented version
631+
preloads.append(instrumented_output)
632+
633+
if preloads:
634+
print("Patching execution environment for dynamic library")
635+
environ["LD_PRELOAD"] = os.pathsep.join(preloads)
636+
637+
638+
args = [
608639
sys.executable,
609640
opts.lit,
610-
os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
611-
],
641+
"-v",
642+
os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
643+
]
644+
print("Running: " + " ".join(args))
645+
process = subprocess.run(
646+
args,
612647
stdout=subprocess.PIPE,
613648
stderr=subprocess.STDOUT,
614649
text=True,
650+
env=environ,
615651
)
616652

617-
print(process.args)
618653
for line in process.stdout:
619654
sys.stdout.write(line)
620655
process.check_returncode()
@@ -624,35 +659,37 @@ def bolt_optimize(args):
624659

625660
merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
626661

627-
shutil.copy(opts.input, f"{opts.input}-prebolt")
662+
for input in inputs:
663+
shutil.copy(input, f"{input}-prebolt")
628664

629-
process = subprocess.run(
630-
[
631-
opts.bolt,
632-
f"{opts.input}-prebolt",
633-
"-o",
634-
opts.input,
635-
"-data",
636-
opts.fdata,
637-
"-reorder-blocks=ext-tsp",
638-
"-reorder-functions=cdsort",
639-
"-split-functions",
640-
"-split-all-cold",
641-
"-split-eh",
642-
"-dyno-stats",
643-
"-use-gnu-stack",
644-
"-update-debug-sections",
645-
"-nl" if opts.method == "PERF" else "",
646-
],
647-
stdout=subprocess.PIPE,
648-
stderr=subprocess.STDOUT,
649-
text=True,
650-
)
665+
args = [
666+
opts.bolt,
667+
f"{input}-prebolt",
668+
"-o",
669+
input,
670+
"-data",
671+
opts.fdata,
672+
"-reorder-blocks=ext-tsp",
673+
"-reorder-functions=cdsort",
674+
"-split-functions",
675+
"-split-all-cold",
676+
"-split-eh",
677+
"-dyno-stats",
678+
"-use-gnu-stack",
679+
"-update-debug-sections",
680+
"-nl" if opts.method == "PERF" else "",
681+
]
682+
print("Running: " + " ".join(args))
683+
process = subprocess.run(
684+
args,
685+
stdout=subprocess.PIPE,
686+
stderr=subprocess.STDOUT,
687+
text=True,
688+
)
651689

652-
print(process.args)
653-
for line in process.stdout:
654-
sys.stdout.write(line)
655-
process.check_returncode()
690+
for line in process.stdout:
691+
sys.stdout.write(line)
692+
process.check_returncode()
656693

657694

658695
commands = {

0 commit comments

Comments
 (0)