Skip to content

Commit bf00ff4

Browse files
Add extra profiling events to JIT/AOT compilation (#50610)
1 parent 049de79 commit bf00ff4

File tree

4 files changed: 160 additions (+160) and 138 deletions (-138)

src/aotcompile.cpp

Lines changed: 149 additions & 129 deletions
Original file line number | Diff line number | Diff line change
@@ -309,6 +309,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
309309
params.external_linkage = _external_linkage;
310310
size_t compile_for[] = { jl_typeinf_world, _world };
311311
for (int worlds = 0; worlds < 2; worlds++) {
312+
JL_TIMING(NATIVE_AOT, NATIVE_Codegen);
312313
params.world = compile_for[worlds];
313314
if (!params.world)
314315
continue;
@@ -390,37 +391,40 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
390391

391392
// clones the contents of the module `m` to the shadow_output collector
392393
// while examining and recording what kind of function pointer we have
393-
Linker L(*clone.getModuleUnlocked());
394-
for (auto &def : emitted) {
395-
jl_merge_module(clone, std::move(std::get<0>(def.second)));
396-
jl_code_instance_t *this_code = def.first;
397-
jl_llvm_functions_t decls = std::get<1>(def.second);
398-
StringRef func = decls.functionObject;
399-
StringRef cfunc = decls.specFunctionObject;
400-
uint32_t func_id = 0;
401-
uint32_t cfunc_id = 0;
402-
if (func == "jl_fptr_args") {
403-
func_id = -1;
404-
}
405-
else if (func == "jl_fptr_sparam") {
406-
func_id = -2;
407-
}
408-
else {
409-
//Safe b/c context is locked by params
410-
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(func)));
411-
func_id = data->jl_sysimg_fvars.size();
394+
{
395+
JL_TIMING(NATIVE_AOT, NATIVE_Merge);
396+
Linker L(*clone.getModuleUnlocked());
397+
for (auto &def : emitted) {
398+
jl_merge_module(clone, std::move(std::get<0>(def.second)));
399+
jl_code_instance_t *this_code = def.first;
400+
jl_llvm_functions_t decls = std::get<1>(def.second);
401+
StringRef func = decls.functionObject;
402+
StringRef cfunc = decls.specFunctionObject;
403+
uint32_t func_id = 0;
404+
uint32_t cfunc_id = 0;
405+
if (func == "jl_fptr_args") {
406+
func_id = -1;
407+
}
408+
else if (func == "jl_fptr_sparam") {
409+
func_id = -2;
410+
}
411+
else {
412+
//Safe b/c context is locked by params
413+
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(func)));
414+
func_id = data->jl_sysimg_fvars.size();
415+
}
416+
if (!cfunc.empty()) {
417+
//Safe b/c context is locked by params
418+
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(cfunc)));
419+
cfunc_id = data->jl_sysimg_fvars.size();
420+
}
421+
data->jl_fvar_map[this_code] = std::make_tuple(func_id, cfunc_id);
412422
}
413-
if (!cfunc.empty()) {
414-
//Safe b/c context is locked by params
415-
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(cfunc)));
416-
cfunc_id = data->jl_sysimg_fvars.size();
423+
if (params._shared_module) {
424+
bool error = L.linkInModule(std::move(params._shared_module));
425+
assert(!error && "Error linking in shared module");
426+
(void)error;
417427
}
418-
data->jl_fvar_map[this_code] = std::make_tuple(func_id, cfunc_id);
419-
}
420-
if (params._shared_module) {
421-
bool error = L.linkInModule(std::move(params._shared_module));
422-
assert(!error && "Error linking in shared module");
423-
(void)error;
424428
}
425429

426430
// now get references to the globals in the merged module
@@ -986,58 +990,60 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer
986990
}
987991
assert(!verifyLLVMIR(M));
988992

989-
timers.optimize.startTimer();
993+
{
994+
timers.optimize.startTimer();
990995

991996
#ifndef JL_USE_NEW_PM
992-
legacy::PassManager optimizer;
993-
addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
994-
addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
995-
addMachinePasses(&optimizer, jl_options.opt_level);
997+
legacy::PassManager optimizer;
998+
addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
999+
addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
1000+
addMachinePasses(&optimizer, jl_options.opt_level);
9961001
#else
9971002

998-
auto PMTM = std::unique_ptr<TargetMachine>(
999-
SourceTM.getTarget().createTargetMachine(
1000-
SourceTM.getTargetTriple().str(),
1001-
SourceTM.getTargetCPU(),
1002-
SourceTM.getTargetFeatureString(),
1003-
SourceTM.Options,
1004-
SourceTM.getRelocationModel(),
1005-
SourceTM.getCodeModel(),
1006-
SourceTM.getOptLevel()));
1007-
NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
1003+
auto PMTM = std::unique_ptr<TargetMachine>(
1004+
SourceTM.getTarget().createTargetMachine(
1005+
SourceTM.getTargetTriple().str(),
1006+
SourceTM.getTargetCPU(),
1007+
SourceTM.getTargetFeatureString(),
1008+
SourceTM.Options,
1009+
SourceTM.getRelocationModel(),
1010+
SourceTM.getCodeModel(),
1011+
SourceTM.getOptLevel()));
1012+
NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
10081013
#endif
1009-
optimizer.run(M);
1010-
assert(!verifyLLVMIR(M));
1011-
bool inject_aliases = false;
1012-
for (auto &F : M.functions()) {
1013-
if (!F.isDeclaration() && F.getName() != "_DllMainCRTStartup") {
1014-
inject_aliases = true;
1015-
break;
1014+
optimizer.run(M);
1015+
assert(!verifyLLVMIR(M));
1016+
bool inject_aliases = false;
1017+
for (auto &F : M.functions()) {
1018+
if (!F.isDeclaration() && F.getName() != "_DllMainCRTStartup") {
1019+
inject_aliases = true;
1020+
break;
1021+
}
10161022
}
1017-
}
1018-
// no need to inject aliases if we have no functions
1023+
// no need to inject aliases if we have no functions
10191024

1020-
if (inject_aliases) {
1025+
if (inject_aliases) {
10211026
#if JULIA_FLOAT16_ABI == 1
1022-
// We would like to emit an alias or an weakref alias to redirect these symbols
1023-
// but LLVM doesn't let us emit a GlobalAlias to a declaration...
1024-
// So for now we inject a definition of these functions that calls our runtime
1025-
// functions. We do so after optimization to avoid cloning these functions.
1026-
injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
1027-
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
1028-
injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee",
1029-
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
1030-
injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
1031-
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
1032-
injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee",
1033-
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
1034-
injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2",
1035-
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
1027+
// We would like to emit an alias or an weakref alias to redirect these symbols
1028+
// but LLVM doesn't let us emit a GlobalAlias to a declaration...
1029+
// So for now we inject a definition of these functions that calls our runtime
1030+
// functions. We do so after optimization to avoid cloning these functions.
1031+
injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
1032+
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
1033+
injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee",
1034+
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
1035+
injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
1036+
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
1037+
injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee",
1038+
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
1039+
injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2",
1040+
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
10361041
#else
1037-
emitFloat16Wrappers(M, false);
1042+
emitFloat16Wrappers(M, false);
10381043
#endif
1044+
}
1045+
timers.optimize.stopTimer();
10391046
}
1040-
timers.optimize.stopTimer();
10411047

10421048
if (opt) {
10431049
timers.opt.startTimer();
@@ -1276,7 +1282,10 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
12761282
// Single-threaded case
12771283
if (threads == 1) {
12781284
output_timer.startTimer();
1279-
outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out);
1285+
{
1286+
JL_TIMING(NATIVE_AOT, NATIVE_Opt);
1287+
outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out);
1288+
}
12801289
output_timer.stopTimer();
12811290
// Don't need M anymore
12821291
module_released(M);
@@ -1314,40 +1323,43 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
13141323
output_timer.startTimer();
13151324

13161325
// Start all of the worker threads
1317-
std::vector<std::thread> workers(threads);
1318-
for (unsigned i = 0; i < threads; i++) {
1319-
workers[i] = std::thread([&, i]() {
1320-
LLVMContext ctx;
1321-
// Lazily deserialize the entire module
1322-
timers[i].deserialize.startTimer();
1323-
auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
1324-
timers[i].deserialize.stopTimer();
1325-
1326-
timers[i].materialize.startTimer();
1327-
materializePreserved(*M, partitions[i]);
1328-
timers[i].materialize.stopTimer();
1329-
1330-
timers[i].construct.startTimer();
1331-
construct_vars(*M, partitions[i]);
1332-
M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
1333-
// The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
1334-
// or it may skip emitting debug info for that file. Here set it to ./julia#N
1335-
DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), ".");
1336-
for (DICompileUnit *CU : M->debug_compile_units())
1337-
CU->replaceOperandWith(0, topfile);
1338-
timers[i].construct.stopTimer();
1339-
1340-
timers[i].deletion.startTimer();
1341-
dropUnusedGlobals(*M);
1342-
timers[i].deletion.stopTimer();
1343-
1344-
outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out);
1345-
});
1346-
}
1326+
{
1327+
JL_TIMING(NATIVE_AOT, NATIVE_Opt);
1328+
std::vector<std::thread> workers(threads);
1329+
for (unsigned i = 0; i < threads; i++) {
1330+
workers[i] = std::thread([&, i]() {
1331+
LLVMContext ctx;
1332+
// Lazily deserialize the entire module
1333+
timers[i].deserialize.startTimer();
1334+
auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
1335+
timers[i].deserialize.stopTimer();
1336+
1337+
timers[i].materialize.startTimer();
1338+
materializePreserved(*M, partitions[i]);
1339+
timers[i].materialize.stopTimer();
1340+
1341+
timers[i].construct.startTimer();
1342+
construct_vars(*M, partitions[i]);
1343+
M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
1344+
// The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
1345+
// or it may skip emitting debug info for that file. Here set it to ./julia#N
1346+
DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), ".");
1347+
for (DICompileUnit *CU : M->debug_compile_units())
1348+
CU->replaceOperandWith(0, topfile);
1349+
timers[i].construct.stopTimer();
1350+
1351+
timers[i].deletion.startTimer();
1352+
dropUnusedGlobals(*M);
1353+
timers[i].deletion.stopTimer();
1354+
1355+
outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out);
1356+
});
1357+
}
13471358

1348-
// Wait for all of the worker threads to finish
1349-
for (auto &w : workers)
1350-
w.join();
1359+
// Wait for all of the worker threads to finish
1360+
for (auto &w : workers)
1361+
w.join();
1362+
}
13511363

13521364
output_timer.stopTimer();
13531365

@@ -1488,6 +1500,7 @@ void jl_dump_native_impl(void *native_code,
14881500
SmallVector<AOTOutputs, 16> data_outputs;
14891501
SmallVector<AOTOutputs, 16> metadata_outputs;
14901502
if (z) {
1503+
JL_TIMING(NATIVE_AOT, NATIVE_Sysimg);
14911504
LLVMContext Context;
14921505
Module sysimgM("sysimg", Context);
14931506
sysimgM.setTargetTriple(TheTriple.str());
@@ -1526,6 +1539,7 @@ void jl_dump_native_impl(void *native_code,
15261539
bool has_veccall = false;
15271540

15281541
data->M.withModuleDo([&](Module &dataM) {
1542+
JL_TIMING(NATIVE_AOT, NATIVE_Setup);
15291543
dataM.setTargetTriple(TheTriple.str());
15301544
dataM.setDataLayout(DL);
15311545
auto &Context = dataM.getContext();
@@ -1616,6 +1630,7 @@ void jl_dump_native_impl(void *native_code,
16161630
}
16171631

16181632
{
1633+
JL_TIMING(NATIVE_AOT, NATIVE_Metadata);
16191634
LLVMContext Context;
16201635
Module metadataM("metadata", Context);
16211636
metadataM.setTargetTriple(TheTriple.str());
@@ -1690,32 +1705,37 @@ void jl_dump_native_impl(void *native_code,
16901705
metadata_outputs = compile(metadataM, "data", 1, [](Module &) {});
16911706
}
16921707

1693-
object::Archive::Kind Kind = getDefaultForHost(TheTriple);
1708+
{
1709+
JL_TIMING(NATIVE_AOT, NATIVE_Write);
1710+
1711+
object::Archive::Kind Kind = getDefaultForHost(TheTriple);
16941712
#define WRITE_ARCHIVE(fname, field, prefix, suffix) \
1695-
if (fname) {\
1696-
std::vector<NewArchiveMember> archive; \
1697-
SmallVector<std::string, 16> filenames; \
1698-
SmallVector<StringRef, 16> buffers; \
1699-
for (size_t i = 0; i < threads; i++) { \
1700-
filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \
1701-
buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \
1702-
} \
1703-
filenames.push_back("metadata" prefix suffix); \
1704-
buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \
1705-
if (z) { \
1706-
filenames.push_back("sysimg" prefix suffix); \
1707-
buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \
1708-
} \
1709-
for (size_t i = 0; i < filenames.size(); i++) { \
1710-
archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \
1711-
} \
1712-
handleAllErrors(writeArchive(fname, archive, true, Kind, true, false), reportWriterError); \
1713-
}
1714-
1715-
WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc");
1716-
WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc");
1717-
WRITE_ARCHIVE(obj_fname, obj, "", ".o");
1718-
WRITE_ARCHIVE(asm_fname, asm_, "", ".s");
1713+
if (fname) {\
1714+
std::vector<NewArchiveMember> archive; \
1715+
SmallVector<std::string, 16> filenames; \
1716+
SmallVector<StringRef, 16> buffers; \
1717+
for (size_t i = 0; i < threads; i++) { \
1718+
filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \
1719+
buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \
1720+
} \
1721+
filenames.push_back("metadata" prefix suffix); \
1722+
buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \
1723+
if (z) { \
1724+
filenames.push_back("sysimg" prefix suffix); \
1725+
buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \
1726+
} \
1727+
for (size_t i = 0; i < filenames.size(); i++) { \
1728+
archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \
1729+
} \
1730+
handleAllErrors(writeArchive(fname, archive, true, Kind, true, false), reportWriterError); \
1731+
}
1732+
1733+
WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc");
1734+
WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc");
1735+
WRITE_ARCHIVE(obj_fname, obj, "", ".o");
1736+
WRITE_ARCHIVE(asm_fname, asm_, "", ".s");
1737+
#undef WRITE_ARCHIVE
1738+
}
17191739
}
17201740

17211741
void addTargetPasses(legacy::PassManagerBase *PM, const Triple &triple, TargetIRAnalysis analysis)

src/jitlayers.cpp

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1214,12 +1214,13 @@ namespace {
12141214
}
12151215
}
12161216

1217-
JL_TIMING(LLVM_OPT, LLVM_OPT);
1218-
1219-
//Run the optimization
1220-
assert(!verifyLLVMIR(M));
1221-
(***PMs).run(M);
1222-
assert(!verifyLLVMIR(M));
1217+
{
1218+
JL_TIMING(LLVM_JIT, JIT_Opt);
1219+
//Run the optimization
1220+
assert(!verifyLLVMIR(M));
1221+
(***PMs).run(M);
1222+
assert(!verifyLLVMIR(M));
1223+
}
12231224

12241225
uint64_t end_time = 0;
12251226
{
@@ -1272,6 +1273,7 @@ namespace {
12721273
: orc::IRCompileLayer::IRCompiler(MO), TMs(TMCreator(TM, optlevel)) {}
12731274

12741275
Expected<std::unique_ptr<MemoryBuffer>> operator()(Module &M) override {
1276+
JL_TIMING(LLVM_JIT, JIT_Compile);
12751277
return orc::SimpleCompiler(***TMs)(M);
12761278
}
12771279

@@ -1459,7 +1461,7 @@ void JuliaOJIT::addGlobalMapping(StringRef Name, uint64_t Addr)
14591461

14601462
void JuliaOJIT::addModule(orc::ThreadSafeModule TSM)
14611463
{
1462-
JL_TIMING(LLVM_ORC, LLVM_ORC);
1464+
JL_TIMING(LLVM_JIT, JIT_Total);
14631465
++ModulesAdded;
14641466
orc::SymbolLookupSet NewExports;
14651467
TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT {

0 commit comments

Comments (0)