Skip to content

Commit 780f69c

Browse files
aaupov and MatzeB
authored
[Clang][CMake] Add CSSPGO support to LLVM_BUILD_INSTRUMENTED (#79942)
Build on Clang-BOLT infrastructure to collect sample profile for CSSPGO. Add CSSPGO.cmake and BOLT-CSSPGO.cmake to automate CSSPGO/+BOLT Clang builds. Note that `CLANG_PGO_TRAINING_DATA_SOURCE_DIR` is required as built-in training set is inadequate for collecting sampled profile. Hardware compatibility: CSSPGO requires synchronized (0-skid) call and branch stacks, which is only available with Intel PEBS (Sandy Bridge+), AMD Zen3 with BRS, Zen4 with LBRv2+LBR_PMC_FREEZE, and Zen5 with LBRv2. This patch adds support for Intel `br_inst_retired.near_taken:uppp` event. Test Plan: Added BOLT-CSSPGO.cmake with same use as BOLT-PGO.cmake, e.g. for bootstrapped ThinLTO+CSSPGO+BOLT, with CSSPGO profile collected from LLVM build, and BOLT profile collected from Hello World (instrumentation): ``` cmake -B clang-csspgo-bolt -S /path/to/llvm-project/llvm \ -DLLVM_ENABLE_LLD=ON -DBOOTSTRAP_LLVM_ENABLE_LLD=ON \ -DBOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_LLD=ON \ -DPGO_INSTRUMENT_LTO=Thin \ -DBOOTSTRAP_CLANG_PGO_TRAINING_DATA_SOURCE_DIR=/path/to/llvm-project/llvm \ -GNinja -C /path/to/llvm-project/clang/cmake/caches/BOLT-CSSPGO.cmake ninja stage2-clang-bolt ... warning: Sample PGO is estimated to optimize better with 19.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples. ... [2800/2801] Optimizing Clang with BOLT BOLT-INFO: 8189 out of 106942 functions in the binary (7.7%) have non-empty execution profile 1377639 : taken branches (-42.1%) ``` Performance testing with Clang: - Setup: Clang-BOLT testing harness aaupov/llvm-devmtg-2022@9f2b46f - CSSPGO training: building LLVM, - InstrPGO training: building Hello World, - BOLT training: building Hello World, instrumentation, - benchmark: building small LLVM tool (not), - 2S Intel SKX Xeon 6138 with 40C/80T and 256GB RAM, using 20C/40T for build, - Results, wall time, lower is better - Baseline (bootstrapped build): 10.36s, - InstrPGO + ThinLTO: 9.34s, - CSSPGO + ThinLTO: 8.85s. 
- BOLT results, for reference: - Baseline: 9.09s, - InstrPGO + ThinLTO: 9.09s, - CSSPGO + ThinLTO: 8.58s. --------- Co-authored-by: Matthias Braun <[email protected]>
1 parent 1e4d4bb commit 780f69c

File tree

8 files changed

+136
-21
lines changed

8 files changed

+136
-21
lines changed

clang/CMakeLists.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -754,11 +754,22 @@ if (CLANG_ENABLE_BOOTSTRAP)
754754
if(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED)
755755
add_dependencies(clang-bootstrap-deps llvm-profdata)
756756
set(PGO_OPT -DLLVM_PROFDATA=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-profdata)
757+
string(TOUPPER "${BOOTSTRAP_LLVM_BUILD_INSTRUMENTED}" BOOTSTRAP_LLVM_BUILD_INSTRUMENTED)
758+
if (BOOTSTRAP_LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO")
759+
add_dependencies(clang-bootstrap-deps llvm-profgen)
760+
list(APPEND PGO_OPT -DLLVM_PROFGEN=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-profgen)
761+
endif()
757762
endif()
758763

759764
if(LLVM_BUILD_INSTRUMENTED)
760-
add_dependencies(clang-bootstrap-deps generate-profdata)
761-
set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata)
765+
string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" LLVM_BUILD_INSTRUMENTED)
766+
if (LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO")
767+
add_dependencies(clang-bootstrap-deps generate-sprofdata)
768+
set(PGO_OPT -DLLVM_SPROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.sprofdata)
769+
else()
770+
add_dependencies(clang-bootstrap-deps generate-profdata)
771+
set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata)
772+
endif()
762773
# Use the current tools for LTO instead of the instrumented ones
763774
list(APPEND _BOOTSTRAP_DEFAULT_PASSTHROUGH
764775
CMAKE_CXX_COMPILER
clang/cmake/caches/BOLT-CSSPGO.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Cache file for a CSSPGO + BOLT build: a thin wrapper around BOLT-PGO.cmake.
# BOLT-PGO.cmake ends by including ${BOLT_PGO_CMAKE_CACHE}.cmake; setting the
# cache entry here first means its own default ("PGO") does not replace it,
# because set(... CACHE ...) without FORCE leaves an existing entry untouched.
set(BOLT_PGO_CMAKE_CACHE "CSSPGO" CACHE STRING "")
2+
# NOTE(review): presumably RelWithDebInfo is chosen so the profiled binary
# carries the debug info that sample-profile conversion needs — confirm.
set(BOOTSTRAP_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
3+
include(${CMAKE_CURRENT_LIST_DIR}/BOLT-PGO.cmake)

clang/cmake/caches/BOLT-PGO.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
set(BOLT_PGO_CMAKE_CACHE "PGO" CACHE STRING "")
12
set(LLVM_ENABLE_PROJECTS "bolt;clang;lld" CACHE STRING "")
23

34
set(CLANG_BOOTSTRAP_TARGETS
@@ -14,4 +15,4 @@ set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS
1415
set(PGO_BUILD_CONFIGURATION
1516
${CMAKE_CURRENT_LIST_DIR}/BOLT.cmake
1617
CACHE STRING "")
17-
include(${CMAKE_CURRENT_LIST_DIR}/PGO.cmake)
18+
include(${CMAKE_CURRENT_LIST_DIR}/${BOLT_PGO_CMAKE_CACHE}.cmake)

clang/cmake/caches/CSSPGO.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Cache file for a CSSPGO build: set BOOTSTRAP_LLVM_BUILD_INSTRUMENTED to
# "CSSPGO" before including the generic PGO cache file from this directory.
set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED "CSSPGO" CACHE STRING "")
2+
include(${CMAKE_CURRENT_LIST_DIR}/PGO.cmake)

clang/utils/perf-training/CMakeLists.txt

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH
66
set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data")
77
set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.")
88

9+
add_custom_target(clear-perf-data
10+
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
11+
COMMENT "Clearing old perf data")
12+
913
option(CLANG_PGO_TRAINING_USE_LLVM_BUILD "Use LLVM build for generating PGO data" ON)
1014

1115
llvm_canonicalize_cmake_booleans(
@@ -21,7 +25,7 @@ if(LLVM_BUILD_INSTRUMENTED)
2125
add_lit_testsuite(generate-profraw "Generating clang PGO data"
2226
${CMAKE_CURRENT_BINARY_DIR}/pgo-data/
2327
EXCLUDE_FROM_CHECK_ALL
24-
DEPENDS clear-profraw
28+
DEPENDS clear-profraw clang
2529
)
2630

2731
add_custom_target(clear-profraw
@@ -55,6 +59,32 @@ if(LLVM_BUILD_INSTRUMENTED)
5559
USE_TOOLCHAIN EXLUDE_FROM_ALL NO_INSTALL DEPENDS generate-profraw)
5660
add_dependencies(generate-profdata generate-profraw-external)
5761
endif()
62+
63+
# Look for llvm-profgen on the system unless LLVM_PROFGEN was already provided.
if(NOT LLVM_PROFGEN)
64+
find_program(LLVM_PROFGEN llvm-profgen)
65+
endif()
66+
67+
# The generate-sprofdata target is only defined when both llvm-profgen and
# CLANG_PGO_TRAINING_DATA_SOURCE_DIR are available; otherwise a status message
# reports which prerequisite is missing and no target is created.
if(NOT LLVM_PROFGEN)
68+
message(STATUS "To enable converting CSSPGO samples LLVM_PROFGEN has to point to llvm-profgen")
69+
elseif(NOT CLANG_PGO_TRAINING_DATA_SOURCE_DIR)
70+
message(STATUS "CLANG_PGO_TRAINING_DATA_SOURCE_DIR must be set to collect CSSPGO samples")
71+
else()
72+
# Command prefix that runs perf-helper.py with the configured Python interpreter.
set(PERF_HELPER "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py)
73+
# Path of the merged sample profile produced by the custom command below.
set(CLANG_SPROFDATA ${CMAKE_CURRENT_BINARY_DIR}/clang.sprofdata)
74+
add_custom_command(
75+
OUTPUT ${CLANG_SPROFDATA}
76+
# Step 1: build the generate-profraw-external target while recording it with perf in CSSPGO mode
77+
COMMAND ${PERF_HELPER} perf --csspgo -- ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target generate-profraw-external
78+
# Step 2: convert the perf.data files found in the binary dir with llvm-profgen (perf-helper writes <file>.profraw)
79+
COMMAND ${PERF_HELPER} perf2prof ${LLVM_PROFGEN} $<TARGET_FILE:clang> ${CMAKE_CURRENT_BINARY_DIR}
80+
# Step 3: merge the converted profiles into ${CLANG_SPROFDATA} using llvm-profdata in --sample mode
81+
COMMAND ${PERF_HELPER} merge --sample ${LLVM_PROFDATA} ${CLANG_SPROFDATA} ${CMAKE_CURRENT_BINARY_DIR}
82+
DEPENDS clang ${CLANG_PGO_TRAINING_DEPS} clear-perf-data generate-profraw-external-clean
83+
VERBATIM
84+
USES_TERMINAL
85+
)
86+
# Named entry point that drives the custom command above.
add_custom_target(generate-sprofdata DEPENDS ${CLANG_SPROFDATA})
87+
endif()
5888
endif()
5989
endif()
6090

@@ -104,8 +134,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
104134
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata
105135
COMMENT "Clearing old BOLT fdata")
106136

107-
add_custom_target(clear-perf-data
108-
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
109-
COMMENT "Clearing old perf data")
110-
111137
endif()

clang/utils/perf-training/perf-helper.py

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,22 @@ def clean(args):
4545

4646

4747
def merge(args):
48-
if len(args) < 3:
49-
print(
50-
"Usage: %s merge <llvm-profdata> <output> <paths>\n" % __file__
51-
+ "\tMerges all profraw files from path into output."
52-
)
53-
return 1
54-
cmd = [args[0], "merge", "-o", args[1]]
55-
for path in args[2:]:
48+
parser = argparse.ArgumentParser(
49+
prog="perf-helper merge",
50+
description="Merges all profraw files from path(s) into output",
51+
)
52+
parser.add_argument("profdata", help="Path to llvm-profdata tool")
53+
parser.add_argument("output", help="Output filename")
54+
parser.add_argument(
55+
"paths", nargs="+", help="Folder(s) containing input profraw files"
56+
)
57+
parser.add_argument("--sample", action="store_true", help="Sample profile")
58+
opts = parser.parse_args(args)
59+
60+
cmd = [opts.profdata, "merge", "-o", opts.output]
61+
if opts.sample:
62+
cmd += ["--sample"]
63+
for path in opts.paths:
5664
cmd.extend(findFilesWithExtension(path, "profraw"))
5765
subprocess.check_call(cmd)
5866
return 0
@@ -73,25 +81,30 @@ def merge_fdata(args):
7381

7482
def perf(args):
7583
parser = argparse.ArgumentParser(
76-
prog="perf-helper perf", description="perf wrapper for BOLT profile collection"
84+
prog="perf-helper perf",
85+
description="perf wrapper for BOLT/CSSPGO profile collection",
7786
)
7887
parser.add_argument(
7988
"--lbr", action="store_true", help="Use perf with branch stacks"
8089
)
90+
parser.add_argument("--csspgo", action="store_true", help="Enable CSSPGO flags")
8191
parser.add_argument("cmd", nargs=argparse.REMAINDER, help="")
8292

8393
opts = parser.parse_args(args)
8494
cmd = opts.cmd[1:]
8595

96+
event = "br_inst_retired.near_taken:uppp" if opts.csspgo else "cycles:u"
8697
perf_args = [
8798
"perf",
8899
"record",
89-
"--event=cycles:u",
100+
f"--event={event}",
90101
"--freq=max",
91102
"--output=%d.perf.data" % os.getpid(),
92103
]
93-
if opts.lbr:
104+
if opts.lbr or opts.csspgo:
94105
perf_args += ["--branch-filter=any,u"]
106+
if opts.csspgo:
107+
perf_args += ["-g", "--call-graph=fp"]
95108
perf_args.extend(cmd)
96109

97110
start_time = time.time()
@@ -127,6 +140,30 @@ def perf2bolt(args):
127140
return 0
128141

129142

143+
# perf2prof: run llvm-profgen once for every perf.data file found under the
# given paths.
#   args: command-line style list: <llvm-profgen> <binary> <path> [<path>...]
#   returns: 0 on completion; a failing llvm-profgen run raises
#            subprocess.CalledProcessError because of check=True.
def perf2prof(args):
144+
parser = argparse.ArgumentParser(
145+
prog="perf-helper perf2prof",
146+
description="perf to CSSPGO prof conversion wrapper",
147+
)
148+
parser.add_argument("profgen", help="Path to llvm-profgen binary")
149+
parser.add_argument("binary", help="Input binary")
150+
parser.add_argument("paths", nargs="+", help="Path containing perf.data files")
151+
opts = parser.parse_args(args)
152+
153+
# Arguments shared by every llvm-profgen invocation.
profgen_args = [opts.profgen, f"--binary={opts.binary}"]
154+
for path in opts.paths:
155+
# NOTE(review): findFilesWithExtension is defined outside this hunk; presumably
# it yields the files under `path` whose names end in "perf.data" — confirm.
for filename in findFilesWithExtension(path, "perf.data"):
156+
subprocess.run(
157+
[
158+
*profgen_args,
159+
f"--perfdata={filename}",
160+
# Output is written next to the input with a ".profraw" suffix, the same
# extension that merge() searches for.
f"--output={filename}.profraw",
161+
],
162+
check=True,
163+
)
164+
return 0
165+
166+
130167
def dtrace(args):
131168
parser = argparse.ArgumentParser(
132169
prog="perf-helper dtrace",
@@ -660,7 +697,10 @@ def bolt_optimize(args):
660697
process.check_returncode()
661698

662699
if opts.method in ["PERF", "LBR"]:
663-
perf2bolt([opts.bolt, opts.perf_training_binary_dir, opts.input])
700+
args = [opts.bolt, opts.perf_training_binary_dir, opts.input]
701+
if opts.method == "LBR":
702+
# Add the flag as a single element: list.extend() on a str iterates it and would add '-', '-', 'l', 'b', 'r' as five separate arguments.
args.append("--lbr")
703+
perf2bolt(args)
664704

665705
merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
666706

@@ -707,6 +747,7 @@ def bolt_optimize(args):
707747
"merge-fdata": merge_fdata,
708748
"perf": perf,
709749
"perf2bolt": perf2bolt,
750+
"perf2prof": perf2prof,
710751
}
711752

712753

llvm/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,9 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ${LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_defa
10111011
set(LLVM_PROFDATA_FILE "" CACHE FILEPATH
10121012
"Profiling data file to use when compiling in order to improve runtime performance.")
10131013

1014+
set(LLVM_SPROFDATA_FILE "" CACHE FILEPATH
1015+
"Sampling profiling data file to use when compiling in order to improve runtime performance.")
1016+
10141017
if(LLVM_INCLUDE_TESTS)
10151018
# All LLVM Python files should be compatible down to this minimum version.
10161019
set(LLVM_MINIMUM_PYTHON_VERSION 3.8)

llvm/cmake/modules/HandleLLVMOptions.cmake

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1184,7 +1184,7 @@ if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI)
11841184
message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON")
11851185
endif()
11861186

1187-
set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR or Frontend")
1187+
set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR, Frontend, CSIR, CSSPGO")
11881188
set(LLVM_VP_COUNTERS_PER_SITE "1.5" CACHE STRING "Value profile counters to use per site for IR PGO with Clang")
11891189
mark_as_advanced(LLVM_BUILD_INSTRUMENTED LLVM_VP_COUNTERS_PER_SITE)
11901190
string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" uppercase_LLVM_BUILD_INSTRUMENTED)
@@ -1217,6 +1217,19 @@ if (LLVM_BUILD_INSTRUMENTED)
12171217
CMAKE_EXE_LINKER_FLAGS
12181218
CMAKE_SHARED_LINKER_FLAGS)
12191219
endif()
1220+
elseif(uppercase_LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO")
1221+
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
1222+
append("-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -fno-optimize-sibling-calls -fpseudo-probe-for-profiling -fdebug-info-for-profiling"
1223+
CMAKE_CXX_FLAGS
1224+
CMAKE_C_FLAGS)
1225+
if(NOT LINKER_IS_LLD_LINK)
1226+
append("-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -fno-optimize-sibling-calls -fpseudo-probe-for-profiling -fdebug-info-for-profiling"
1227+
CMAKE_EXE_LINKER_FLAGS
1228+
CMAKE_SHARED_LINKER_FLAGS)
1229+
endif()
1230+
else()
1231+
message(FATAL_ERROR "LLVM_BUILD_INSTRUMENTED=CSSPGO can only be specified when compiling with clang")
1232+
endif()
12201233
else()
12211234
append("-fprofile-instr-generate=\"${LLVM_PROFILE_FILE_PATTERN}\""
12221235
CMAKE_CXX_FLAGS
@@ -1269,6 +1282,21 @@ elseif(LLVM_PROFDATA_FILE)
12691282
message(WARNING "LLVM_PROFDATA_FILE specified, but ${LLVM_PROFDATA_FILE} not found")
12701283
endif()
12711284

1285+
# Use a sample profile: when LLVM_SPROFDATA_FILE is set and names an existing
# file, pass -fpseudo-probe-for-profiling and -fprofile-sample-use=<file> to
# append() for the C/C++ compile flags, and for the exe/shared linker flags
# unless LINKER_IS_LLD_LINK is set. Any compiler whose ID does not match
# "Clang" is rejected with a fatal error.
# NOTE(review): a set-but-missing file is skipped silently here, whereas the
# LLVM_PROFDATA_FILE handling emits a "not found" warning — confirm this is intended.
if(LLVM_SPROFDATA_FILE AND EXISTS ${LLVM_SPROFDATA_FILE})
1286+
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
1287+
append("-fpseudo-probe-for-profiling -fprofile-sample-use=\"${LLVM_SPROFDATA_FILE}\""
1288+
CMAKE_CXX_FLAGS
1289+
CMAKE_C_FLAGS)
1290+
if(NOT LINKER_IS_LLD_LINK)
1291+
# The same flags are also given to the linker flag variables.
append("-fpseudo-probe-for-profiling -fprofile-sample-use=\"${LLVM_SPROFDATA_FILE}\""
1292+
CMAKE_EXE_LINKER_FLAGS
1293+
CMAKE_SHARED_LINKER_FLAGS)
1294+
endif()
1295+
else()
1296+
message(FATAL_ERROR "LLVM_SPROFDATA_FILE can only be specified when compiling with clang")
1297+
endif()
1298+
endif()
1299+
12721300
option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation" Off)
12731301
option(LLVM_INDIVIDUAL_TEST_COVERAGE "Emit individual coverage file for each test case." OFF)
12741302
mark_as_advanced(LLVM_BUILD_INSTRUMENTED_COVERAGE)

0 commit comments

Comments
 (0)