From b18bb596c9345eb85231490436498ab434539c4b Mon Sep 17 00:00:00 2001 From: Shatian Wang Date: Mon, 13 Jan 2025 11:17:55 -0800 Subject: [PATCH 1/4] [BOLT] Flow conservation scores --- bolt/include/bolt/Passes/ContinuityStats.h | 61 -- .../include/bolt/Passes/ProfileQualityStats.h | 98 +++ bolt/lib/Passes/CMakeLists.txt | 2 +- bolt/lib/Passes/ContinuityStats.cpp | 250 -------- bolt/lib/Passes/ProfileQualityStats.cpp | 579 ++++++++++++++++++ bolt/lib/Rewrite/BinaryPassManager.cpp | 46 +- .../test/X86/cfg-discontinuity-reporting.test | 4 - bolt/test/X86/profile-quality-reporting.test | 6 + 8 files changed, 707 insertions(+), 339 deletions(-) delete mode 100644 bolt/include/bolt/Passes/ContinuityStats.h create mode 100644 bolt/include/bolt/Passes/ProfileQualityStats.h delete mode 100644 bolt/lib/Passes/ContinuityStats.cpp create mode 100644 bolt/lib/Passes/ProfileQualityStats.cpp delete mode 100644 bolt/test/X86/cfg-discontinuity-reporting.test create mode 100644 bolt/test/X86/profile-quality-reporting.test diff --git a/bolt/include/bolt/Passes/ContinuityStats.h b/bolt/include/bolt/Passes/ContinuityStats.h deleted file mode 100644 index bd4d491ad4a55..0000000000000 --- a/bolt/include/bolt/Passes/ContinuityStats.h +++ /dev/null @@ -1,61 +0,0 @@ -//===- bolt/Passes/ContinuityStats.h ----------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass checks how well the BOLT input profile satisfies the following -// "CFG continuity" property of a perfect profile: -// -// Each positive-execution-count block in the function’s CFG -// should be *reachable* from a positive-execution-count function -// entry block through a positive-execution-count path. -// -// More specifically, for each of the hottest 1000 functions, the pass -// calculates the function’s fraction of basic block execution counts -// that is *unreachable*. It then reports the 95th percentile of the -// distribution of the 1000 unreachable fractions in a single BOLT-INFO line. -// The smaller the reported value is, the better the BOLT profile -// satisfies the CFG continuity property. - -// The default value of 1000 above can be changed via the hidden BOLT option -// `-num-functions-for-continuity-check=[N]`. -// If more detailed stats are needed, `-v=1` can be used: the hottest N -// functions will be grouped into 5 equally-sized buckets, from the hottest -// to the coldest; for each bucket, various summary statistics of the -// distribution of the unreachable fractions and the raw unreachable execution -// counts will be reported. -// -//===----------------------------------------------------------------------===// - -#ifndef BOLT_PASSES_CONTINUITYSTATS_H -#define BOLT_PASSES_CONTINUITYSTATS_H - -#include "bolt/Passes/BinaryPasses.h" -#include - -namespace llvm { - -class raw_ostream; - -namespace bolt { -class BinaryContext; - -/// Compute and report to the user the function CFG continuity quality -class PrintContinuityStats : public BinaryFunctionPass { -public: - explicit PrintContinuityStats(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) {} - - bool shouldOptimize(const BinaryFunction &BF) const override; - const char *getName() const override { return "continuity-stats"; } - bool shouldPrint(const BinaryFunction &) const override { return false; } - Error runOnFunctions(BinaryContext &BC) override; -}; - -} // namespace bolt -} // namespace llvm - -#endif // BOLT_PASSES_CONTINUITYSTATS_H diff --git a/bolt/include/bolt/Passes/ProfileQualityStats.h b/bolt/include/bolt/Passes/ProfileQualityStats.h new file mode 100644 index 0000000000000..761fd33431a46 --- /dev/null +++ b/bolt/include/bolt/Passes/ProfileQualityStats.h @@ -0,0 +1,98 @@ +//===- bolt/Passes/ProfileQualityStats.h ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass checks the BOLT input profile quality. +// +// Check 1: how well the input profile satisfies the following +// "CFG continuity" property of a perfect profile: +// +// Each positive-execution-count block in the function’s CFG +// is *reachable* from a positive-execution-count function +// entry block through a positive-execution-count path. +// +// More specifically, for each of the hottest 1000 functions, the pass +// calculates the function’s fraction of basic block execution counts +// that is *unreachable*. It then reports the 95th percentile of the +// distribution of the 1000 unreachable fractions in a single BOLT-INFO line. +// The smaller the reported value is, the better the BOLT profile +// satisfies the CFG continuity property. +// +// Check 2: how well the input profile satisfies the "call graph flow +// conservation" property of a perfect profile: +// +// For each function that is not a program entry, the number of times the +// function is called is equal to the net CFG outflow of the +// function's entry block(s). +// +// More specifically, for each of the hottest 1000 functions, the pass obtains +// A = number of times the function is called, B = the function's entry blocks' +// inflow, C = the function's entry blocks' outflow, where B and C are computed +// using the function's weighted CFG. It then computes gap = 1 - MIN(A,C-B) / +// MAX(A, C-B). The pass reports the 95th percentile of the distribution of the +// 1000 gaps in a single BOLT-INFO line. The smaller the reported value is, the +// better the BOLT profile satisfies the call graph flow conservation property. +// +// Check 3: how well the input profile satisfies the "function CFG flow +// conservation property" of a perfect profile: +// +// A non-entry non-exit basic block's inflow is equal to its outflow. +// +// More specifically, for each of the hottest 1000 functions, the pass loops +// over its basic blocks that are non-entry and non-exit, and for each block +// obtains a block gap = 1 - MIN(block inflow, block outflow, block call count +// if any) / MAX(block inflow, block outflow, block call count if any). It then +// aggregates the block gaps into 2 values for the function: "weighted" is the +// weighted average of the block conservation gaps, where the weights depend on +// each block's execution count and instruction count; "worst" is the worst +// (biggest) block gap acorss all basic blocks in the function with an execution +// count of > 500. The pass then reports the 95th percentile of the weighted and +// worst values of the 1000 functions in a single BOLT-INFO line. The smaller +// the reported values are, the better the BOLT profile satisfies the function +// CFG flow conservation property. +// +// The default value of 1000 above can be changed via the hidden BOLT option +// `-num-functions-for-profile-quality-check=[N]`. +// The default reporting of the 95th percentile can be changed via the hidden +// BOLT option `-percentile-for-profile-quality-check=[M]`. +// +// If more detailed stats are needed, `-v=1` can be used: the hottest N +// functions will be grouped into 5 equally-sized buckets, from the hottest +// to the coldest; for each bucket, various summary statistics of the +// profile quality will be reported. +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_PASSES_PROFILEQUALITYSTATS_H +#define BOLT_PASSES_PROFILEQUALITYSTATS_H + +#include "bolt/Passes/BinaryPasses.h" +#include + +namespace llvm { + +class raw_ostream; + +namespace bolt { +class BinaryContext; + +/// Compute and report to the user the profile quality +class PrintProfileQualityStats : public BinaryFunctionPass { +public: + explicit PrintProfileQualityStats(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + bool shouldOptimize(const BinaryFunction &BF) const override; + const char *getName() const override { return "profile-quality-stats"; } + bool shouldPrint(const BinaryFunction &) const override { return false; } + Error runOnFunctions(BinaryContext &BC) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif // BOLT_PASSES_PROFILEQUALITYSTATS_H diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index 1e3289484a5ba..692c82f2a5336 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -26,7 +26,7 @@ add_llvm_library(LLVMBOLTPasses PatchEntries.cpp PettisAndHansen.cpp PLTCall.cpp - ContinuityStats.cpp + ProfileQualityStats.cpp RegAnalysis.cpp RegReAssign.cpp ReorderAlgorithm.cpp diff --git a/bolt/lib/Passes/ContinuityStats.cpp b/bolt/lib/Passes/ContinuityStats.cpp deleted file mode 100644 index b32365b59065d..0000000000000 --- a/bolt/lib/Passes/ContinuityStats.cpp +++ /dev/null @@ -1,250 +0,0 @@ -//===- bolt/Passes/ContinuityStats.cpp --------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the continuity stats calculation pass. -// -//===----------------------------------------------------------------------===// - -#include "bolt/Passes/ContinuityStats.h" -#include "bolt/Core/BinaryBasicBlock.h" -#include "bolt/Core/BinaryFunction.h" -#include "bolt/Utils/CommandLineOpts.h" -#include "llvm/Support/CommandLine.h" -#include -#include -#include - -#define DEBUG_TYPE "bolt-opts" - -using namespace llvm; -using namespace bolt; - -namespace opts { -extern cl::opt Verbosity; -cl::opt NumFunctionsForContinuityCheck( - "num-functions-for-continuity-check", - cl::desc("number of hottest functions to print aggregated " - "CFG discontinuity stats of."), - cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); -} // namespace opts - -namespace { -using FunctionListType = std::vector; -using function_iterator = FunctionListType::iterator; - -template -void printDistribution(raw_ostream &OS, std::vector &values, - bool Fraction = false) { - if (values.empty()) - return; - // Sort values from largest to smallest and print the MAX, TOP 1%, 5%, 10%, - // 20%, 50%, 80%, MIN. If Fraction is true, then values are printed as - // fractions instead of integers. - std::sort(values.begin(), values.end()); - - auto printLine = [&](std::string Text, double Percent) { - int Rank = int(values.size() * (1.0 - Percent / 100)); - if (Percent == 0) - Rank = values.size() - 1; - if (Fraction) - OS << " " << Text << std::string(9 - Text.length(), ' ') << ": " - << format("%.2lf%%", values[Rank] * 100) << "\n"; - else - OS << " " << Text << std::string(9 - Text.length(), ' ') << ": " - << values[Rank] << "\n"; - }; - - printLine("MAX", 0); - const int percentages[] = {1, 5, 10, 20, 50, 80}; - for (size_t i = 0; i < sizeof(percentages) / sizeof(percentages[0]); ++i) { - printLine("TOP " + std::to_string(percentages[i]) + "%", percentages[i]); - } - printLine("MIN", 100); -} - -void printCFGContinuityStats(raw_ostream &OS, - iterator_range &Functions) { - // Given a perfect profile, every positive-execution-count BB should be - // connected to an entry of the function through a positive-execution-count - // directed path in the control flow graph. - std::vector NumUnreachables; - std::vector SumECUnreachables; - std::vector FractionECUnreachables; - - for (auto it = Functions.begin(); it != Functions.end(); ++it) { - const BinaryFunction *Function = *it; - if (Function->size() <= 1) - continue; - - // Compute the sum of all BB execution counts (ECs). - size_t NumPosECBBs = 0; - size_t SumAllBBEC = 0; - for (const BinaryBasicBlock &BB : *Function) { - const size_t BBEC = BB.getKnownExecutionCount(); - NumPosECBBs += BBEC > 0 ? 1 : 0; - SumAllBBEC += BBEC; - } - - // Perform BFS on subgraph of CFG induced by positive weight edges. - // Compute the number of BBs reachable from the entry(s) of the function and - // the sum of their execution counts (ECs). - std::unordered_map IndexToBB; - std::unordered_set Visited; - std::queue Queue; - for (const BinaryBasicBlock &BB : *Function) { - // Make sure BB.getIndex() is not already in IndexToBB. - assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end()); - IndexToBB[BB.getIndex()] = &BB; - if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) { - Queue.push(BB.getIndex()); - Visited.insert(BB.getIndex()); - } - } - while (!Queue.empty()) { - const unsigned BBIndex = Queue.front(); - const BinaryBasicBlock *BB = IndexToBB[BBIndex]; - Queue.pop(); - auto SuccBIIter = BB->branch_info_begin(); - for (const BinaryBasicBlock *Succ : BB->successors()) { - const uint64_t Count = SuccBIIter->Count; - if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { - ++SuccBIIter; - continue; - } - if (!Visited.insert(Succ->getIndex()).second) { - ++SuccBIIter; - continue; - } - Queue.push(Succ->getIndex()); - ++SuccBIIter; - } - } - - const size_t NumReachableBBs = Visited.size(); - - // Loop through Visited, and sum the corresponding BBs' execution counts - // (ECs). - size_t SumReachableBBEC = 0; - for (const unsigned BBIndex : Visited) { - const BinaryBasicBlock *BB = IndexToBB[BBIndex]; - SumReachableBBEC += BB->getKnownExecutionCount(); - } - - const size_t NumPosECBBsUnreachableFromEntry = - NumPosECBBs - NumReachableBBs; - const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC; - const double FractionECUnreachable = - (double)SumUnreachableBBEC / SumAllBBEC; - - if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) { - OS << "Non-trivial CFG discontinuity observed in function " - << Function->getPrintName() << "\n"; - LLVM_DEBUG(Function->dump()); - } - - NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry); - SumECUnreachables.push_back(SumUnreachableBBEC); - FractionECUnreachables.push_back(FractionECUnreachable); - } - - if (FractionECUnreachables.empty()) - return; - - std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end()); - const int Rank = int(FractionECUnreachables.size() * 0.95); - OS << format("top 5%% function CFG discontinuity is %.2lf%%\n", - FractionECUnreachables[Rank] * 100); - - if (opts::Verbosity >= 1) { - OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n" - << "distribution of NUM(unreachable POS BBs) among all focal " - "functions\n"; - printDistribution(OS, NumUnreachables); - - OS << "distribution of SUM_EC(unreachable POS BBs) among all focal " - "functions\n"; - printDistribution(OS, SumECUnreachables); - - OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all " - "POS BBs))] among all focal functions\n"; - printDistribution(OS, FractionECUnreachables, /*Fraction=*/true); - } -} - -void printAll(BinaryContext &BC, FunctionListType &ValidFunctions, - size_t NumTopFunctions) { - // Sort the list of functions by execution counts (reverse). - llvm::sort(ValidFunctions, - [&](const BinaryFunction *A, const BinaryFunction *B) { - return A->getKnownExecutionCount() > B->getKnownExecutionCount(); - }); - - const size_t RealNumTopFunctions = - std::min(NumTopFunctions, ValidFunctions.size()); - - iterator_range Functions( - ValidFunctions.begin(), ValidFunctions.begin() + RealNumTopFunctions); - - BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", - RealNumTopFunctions); - printCFGContinuityStats(BC.outs(), Functions); - - // Print more detailed bucketed stats if requested. - if (opts::Verbosity >= 1 && RealNumTopFunctions >= 5) { - const size_t PerBucketSize = RealNumTopFunctions / 5; - BC.outs() << format( - "Detailed stats for 5 buckets, each with %zu functions:\n", - PerBucketSize); - - // For each bucket, print the CFG continuity stats of the functions in the - // bucket. - for (size_t BucketIndex = 0; BucketIndex < 5; ++BucketIndex) { - const size_t StartIndex = BucketIndex * PerBucketSize; - const size_t EndIndex = StartIndex + PerBucketSize; - iterator_range Functions( - ValidFunctions.begin() + StartIndex, - ValidFunctions.begin() + EndIndex); - const size_t MaxFunctionExecutionCount = - ValidFunctions[StartIndex]->getKnownExecutionCount(); - const size_t MinFunctionExecutionCount = - ValidFunctions[EndIndex - 1]->getKnownExecutionCount(); - BC.outs() << format("----------------\n| Bucket %zu: " - "|\n----------------\n", - BucketIndex + 1) - << format( - "execution counts of the %zu functions in the bucket: " - "%zu-%zu\n", - EndIndex - StartIndex, MinFunctionExecutionCount, - MaxFunctionExecutionCount); - printCFGContinuityStats(BC.outs(), Functions); - } - } -} -} // namespace - -bool PrintContinuityStats::shouldOptimize(const BinaryFunction &BF) const { - if (BF.empty() || !BF.hasValidProfile()) - return false; - - return BinaryFunctionPass::shouldOptimize(BF); -} - -Error PrintContinuityStats::runOnFunctions(BinaryContext &BC) { - // Create a list of functions with valid profiles. - FunctionListType ValidFunctions; - for (const auto &BFI : BC.getBinaryFunctions()) { - const BinaryFunction *Function = &BFI.second; - if (PrintContinuityStats::shouldOptimize(*Function)) - ValidFunctions.push_back(Function); - } - if (ValidFunctions.empty() || opts::NumFunctionsForContinuityCheck == 0) - return Error::success(); - - printAll(BC, ValidFunctions, opts::NumFunctionsForContinuityCheck); - return Error::success(); -} diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp new file mode 100644 index 0000000000000..a42b4dc27947a --- /dev/null +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -0,0 +1,579 @@ +//===- bolt/Passes/ProfileQualityStats.cpp ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the profile quality stats calculation pass. +// +//===----------------------------------------------------------------------===// + +#include "bolt/Passes/ProfileQualityStats.h" +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" +#include "bolt/Utils/CommandLineOpts.h" +#include "llvm/Support/CommandLine.h" +#include +#include +#include + +#define DEBUG_TYPE "bolt-opts" + +using namespace llvm; +using namespace bolt; + +namespace opts { +extern cl::opt Verbosity; +cl::opt NumFunctionsForProfileQualityCheck( + "num-functions-for-profile-quality-check", + cl::desc("number of hottest functions to print aggregated " + "profile quality stats of."), + cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); +cl::opt PercentileForProfileQualityCheck( + "percentile-for-profile-quality-check", + cl::desc("Percentile of profile quality distributions over hottest " + "functions to display."), + cl::init(95), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); +} // namespace opts + +namespace { +using FunctionListType = std::vector; +using function_iterator = FunctionListType::iterator; + +// BB index -> flow count +using FlowMapTy = std::unordered_map; +// Function number -> FlowMapTy +using TotalFlowMapTy = std::unordered_map; +// Function number -> flow count +using FunctionFlowMapTy = std::unordered_map; +struct FlowInfo { + TotalFlowMapTy TotalIncomingMaps; + TotalFlowMapTy TotalOutgoingMaps; + TotalFlowMapTy TotalMaxCountMaps; + TotalFlowMapTy TotalMinCountMaps; + FunctionFlowMapTy CallGraphIncomingMap; +}; + +template +void printDistribution(raw_ostream &OS, std::vector &values, + bool Fraction = false) { + // Assume values are sorted. + if (values.empty()) + return; + + OS << " Length : " << values.size() << "\n"; + + auto printLine = [&](std::string Text, double Percent) { + int Rank = int(values.size() * (100 - Percent) / 100); + if (Percent == 0) + Rank = values.size() - 1; + if (Fraction) + OS << " " << Text << std::string(11 - Text.length(), ' ') << ": " + << format("%.2lf%%", values[Rank] * 100) << "\n"; + else + OS << " " << Text << std::string(11 - Text.length(), ' ') << ": " + << values[Rank] << "\n"; + }; + + printLine("MAX", 0); + const int percentages[] = {1, 5, 10, 20, 50, 80}; + for (size_t i = 0; i < sizeof(percentages) / sizeof(percentages[0]); ++i) { + printLine("TOP " + std::to_string(percentages[i]) + "%", percentages[i]); + } + printLine("MIN", 100); +} + +void printCFGContinuityStats(raw_ostream &OS, + iterator_range &Functions) { + // Given a perfect profile, every positive-execution-count BB should be + // connected to an entry of the function through a positive-execution-count + // directed path in the control flow graph. + std::vector NumUnreachables; + std::vector SumECUnreachables; + std::vector FractionECUnreachables; + + for (auto it = Functions.begin(); it != Functions.end(); ++it) { + const BinaryFunction *Function = *it; + if (Function->size() <= 1) + continue; + + // Compute the sum of all BB execution counts (ECs). + size_t NumPosECBBs = 0; + size_t SumAllBBEC = 0; + for (const BinaryBasicBlock &BB : *Function) { + const size_t BBEC = BB.getKnownExecutionCount(); + NumPosECBBs += BBEC > 0 ? 1 : 0; + SumAllBBEC += BBEC; + } + + // Perform BFS on subgraph of CFG induced by positive weight edges. + // Compute the number of BBs reachable from the entry(s) of the function and + // the sum of their execution counts (ECs). + std::unordered_map IndexToBB; + std::unordered_set Visited; + std::queue Queue; + for (const BinaryBasicBlock &BB : *Function) { + // Make sure BB.getIndex() is not already in IndexToBB. + assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end()); + IndexToBB[BB.getIndex()] = &BB; + if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) { + Queue.push(BB.getIndex()); + Visited.insert(BB.getIndex()); + } + } + while (!Queue.empty()) { + const unsigned BBIndex = Queue.front(); + const BinaryBasicBlock *BB = IndexToBB[BBIndex]; + Queue.pop(); + auto SuccBIIter = BB->branch_info_begin(); + for (const BinaryBasicBlock *Succ : BB->successors()) { + const uint64_t Count = SuccBIIter->Count; + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { + ++SuccBIIter; + continue; + } + if (!Visited.insert(Succ->getIndex()).second) { + ++SuccBIIter; + continue; + } + Queue.push(Succ->getIndex()); + ++SuccBIIter; + } + } + + const size_t NumReachableBBs = Visited.size(); + + // Loop through Visited, and sum the corresponding BBs' execution counts + // (ECs). + size_t SumReachableBBEC = 0; + for (const unsigned BBIndex : Visited) { + const BinaryBasicBlock *BB = IndexToBB[BBIndex]; + SumReachableBBEC += BB->getKnownExecutionCount(); + } + + const size_t NumPosECBBsUnreachableFromEntry = + NumPosECBBs - NumReachableBBs; + const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC; + const double FractionECUnreachable = + (double)SumUnreachableBBEC / SumAllBBEC; + + if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) { + OS << "Non-trivial CFG discontinuity observed in function " + << Function->getPrintName() << "\n"; + LLVM_DEBUG(Function->dump()); + } + + NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry); + SumECUnreachables.push_back(SumUnreachableBBEC); + FractionECUnreachables.push_back(FractionECUnreachable); + } + + if (FractionECUnreachables.empty()) + return; + + std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end()); + const int Rank = int(FractionECUnreachables.size() * + opts::PercentileForProfileQualityCheck / 100); + OS << format("top %zu%% function CFG discontinuity is %.2lf%%\n", + 100 - opts::PercentileForProfileQualityCheck, + FractionECUnreachables[Rank] * 100); + if (opts::Verbosity >= 1) { + OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n" + << "distribution of NUM(unreachable POS BBs) per function\n"; + std::sort(NumUnreachables.begin(), NumUnreachables.end()); + printDistribution(OS, NumUnreachables); + + OS << "distribution of SUM_EC(unreachable POS BBs) per function\n"; + std::sort(SumECUnreachables.begin(), SumECUnreachables.end()); + printDistribution(OS, SumECUnreachables); + + OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all " + "POS BBs))] per function\n"; + printDistribution(OS, FractionECUnreachables, /*Fraction=*/true); + } +} + +void printCallGraphFlowConservationStats( + raw_ostream &OS, iterator_range &Functions, + FlowInfo &TotalFlowMap) { + std::vector CallGraphGaps; + + for (auto it = Functions.begin(); it != Functions.end(); ++it) { + const BinaryFunction *Function = *it; + if (Function->size() <= 1 || !Function->isSimple()) + continue; + + const uint64_t FunctionNum = Function->getFunctionNumber(); + FlowMapTy &IncomingMap = TotalFlowMap.TotalIncomingMaps[FunctionNum]; + FlowMapTy &OutgoingMap = TotalFlowMap.TotalOutgoingMaps[FunctionNum]; + FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap; + + // Only consider functions that are not a program entry. + if (CallGraphIncomingMap.find(FunctionNum) != CallGraphIncomingMap.end()) { + uint64_t EntryInflow = 0; + uint64_t EntryOutflow = 0; + uint32_t NumConsideredEntryBlocks = 0; + for (const BinaryBasicBlock &BB : *Function) { + if (BB.isEntryPoint()) { + // If entry is an exit, then we don't consider it for flow + // conservation + if (BB.succ_size() == 0) + continue; + NumConsideredEntryBlocks++; + + EntryInflow += IncomingMap[BB.getIndex()]; + EntryOutflow += OutgoingMap[BB.getIndex()]; + } + } + uint64_t NetEntryOutflow = 0; + if (EntryOutflow < EntryInflow) { + if (opts::Verbosity >= 1) { + // We expect entry blocks' CFG outflow >= inflow, i.e., it has a + // non-negative net outflow. If this is not the case, then raise a + // warning if requested. + OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow in " + "function " + << Function->getPrintName() << "\n"; + LLVM_DEBUG(Function->dump()); + } + } else { + NetEntryOutflow = EntryOutflow - EntryInflow; + } + if (NumConsideredEntryBlocks > 0) { + const uint64_t CallGraphInflow = + TotalFlowMap.CallGraphIncomingMap[Function->getFunctionNumber()]; + const uint64_t Min = std::min(NetEntryOutflow, CallGraphInflow); + const uint64_t Max = std::max(NetEntryOutflow, CallGraphInflow); + const double CallGraphGap = 1 - (double)Min / Max; + + if (opts::Verbosity >= 2 && CallGraphGap >= 0.5) { + OS << "Nontrivial call graph gap of size " + << format("%.2lf%%", 100 * CallGraphGap) + << " observed in function " << Function->getPrintName() << "\n"; + LLVM_DEBUG(Function->dump()); + } + + CallGraphGaps.push_back(CallGraphGap); + } + } + } + + if (!CallGraphGaps.empty()) { + std::sort(CallGraphGaps.begin(), CallGraphGaps.end()); + const int Rank = int(CallGraphGaps.size() * + opts::PercentileForProfileQualityCheck / 100); + OS << format("top %zu%% call graph flow conservation gap is %.2lf%%\n", + 100 - opts::PercentileForProfileQualityCheck, + CallGraphGaps[Rank] * 100); + if (opts::Verbosity >= 1) { + OS << "distribution of function entry flow conservation gaps\n"; + printDistribution(OS, CallGraphGaps, /*Fraction=*/true); + } + } +} + +void printCFGFlowConservationStats(raw_ostream &OS, + iterator_range &Functions, + FlowInfo &TotalFlowMap) { + std::vector CFGGapsWeightedAvg; + std::vector CFGGapsWorst; + std::vector CFGGapsWorstAbs; + + for (auto it = Functions.begin(); it != Functions.end(); ++it) { + const BinaryFunction *Function = *it; + if (Function->size() <= 1 || !Function->isSimple()) + continue; + + const uint64_t FunctionNum = Function->getFunctionNumber(); + FlowMapTy &MaxCountMaps = TotalFlowMap.TotalMaxCountMaps[FunctionNum]; + FlowMapTy &MinCountMaps = TotalFlowMap.TotalMinCountMaps[FunctionNum]; + double WeightedGapSum = 0.0; + double WeightSum = 0.0; + double WorstGap = 0.0; + uint64_t WorstGapAbs = 0; + BinaryBasicBlock *BBWorstGap = nullptr; + BinaryBasicBlock *BBWorstGapAbs = nullptr; + for (BinaryBasicBlock &BB : *Function) { + // We don't consider function entry or exit blocks for CFG flow + // conservation + if (BB.isEntryPoint() || BB.succ_size() == 0) + continue; + + const uint64_t Max = MaxCountMaps[BB.getIndex()]; + const uint64_t Min = MinCountMaps[BB.getIndex()]; + const double Gap = 1 - (double)Min / Max; + double Weight = BB.getKnownExecutionCount() * BB.getNumNonPseudos(); + if (Weight == 0) + continue; + // We use log to prevent the stats from being dominated by extremely hot + // blocks + Weight = log(Weight); + WeightedGapSum += Gap * Weight; + WeightSum += Weight; + if (BB.getKnownExecutionCount() > 500 && Gap > WorstGap) { + WorstGap = Gap; + BBWorstGap = &BB; + } + if (BB.getKnownExecutionCount() > 500 && Max - Min > WorstGapAbs) { + WorstGapAbs = Max - Min; + BBWorstGapAbs = &BB; + } + } + if (WeightSum > 0) { + const double WeightedGap = WeightedGapSum / WeightSum; + if (opts::Verbosity >= 2 && (WeightedGap >= 0.1 || WorstGap >= 0.9)) { + OS << "Nontrivial CFG gap observed in function " + << Function->getPrintName() << "\n" + << "Weighted gap: " << format("%.2lf%%", 100 * WeightedGap) << "\n"; + if (BBWorstGap) + OS << "Worst gap: " << format("%.2lf%%", 100 * WorstGap) + << " at BB with input offset: 0x" + << Twine::utohexstr(BBWorstGap->getInputOffset()) << "\n"; + if (BBWorstGapAbs) + OS << "Worst gap (absolute value): " << WorstGapAbs << " at BB with " + << "input offset 0x" + << Twine::utohexstr(BBWorstGapAbs->getInputOffset()) << "\n"; + LLVM_DEBUG(Function->dump()); + } + + CFGGapsWeightedAvg.push_back(WeightedGap); + CFGGapsWorst.push_back(WorstGap); + CFGGapsWorstAbs.push_back(WorstGapAbs); + } + } + + if (!CFGGapsWeightedAvg.empty()) { + std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end()); + const int RankWA = int(CFGGapsWeightedAvg.size() * + opts::PercentileForProfileQualityCheck / 100); + std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end()); + const int RankW = + int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100); + OS << format( + "top %zu%% CFG flow conservation gap is %.2lf%% (weighted) and " + "%.2lf%% (worst)\n", + 100 - opts::PercentileForProfileQualityCheck, + CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100); + if (opts::Verbosity >= 1) { + OS << "distribution of weighted CFG flow conservation gaps\n"; + printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true); + OS << "Consider only blocks with execution counts > 500:\n" + << "distribution of worst block flow conservation gap per " + "function \n"; + printDistribution(OS, CFGGapsWorst, /*Fraction=*/true); + OS << "distribution of worst block flow conservation gap (absolute " + "value) per function\n"; + std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end()); + printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false); + } + } +} + +void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { + // Increment block inflow and outflow with CFG jump counts. + TotalFlowMapTy &TotalIncomingMaps = TotalFlowMap.TotalIncomingMaps; + TotalFlowMapTy &TotalOutgoingMaps = TotalFlowMap.TotalOutgoingMaps; + for (const auto &BFI : BC.getBinaryFunctions()) { + const BinaryFunction *Function = &BFI.second; + if (Function->empty() || !Function->hasValidProfile()) + continue; + FlowMapTy &IncomingMap = TotalIncomingMaps[Function->getFunctionNumber()]; + FlowMapTy &OutgoingMap = TotalOutgoingMaps[Function->getFunctionNumber()]; + for (const BinaryBasicBlock &BB : *Function) { + uint64_t TotalOutgoing = 0ULL; + auto SuccBIIter = BB.branch_info_begin(); + for (BinaryBasicBlock *Succ : BB.successors()) { + const uint64_t Count = SuccBIIter->Count; + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { + ++SuccBIIter; + continue; + } + TotalOutgoing += Count; + IncomingMap[Succ->getIndex()] += Count; + ++SuccBIIter; + } + OutgoingMap[BB.getIndex()] = TotalOutgoing; + } + } + + // Initialize TotalMaxCountMaps and TotalMinCountMaps using + // TotalIncomingMaps and TotalOutgoingMaps + TotalFlowMapTy &TotalMaxCountMaps = TotalFlowMap.TotalMaxCountMaps; + TotalFlowMapTy &TotalMinCountMaps = TotalFlowMap.TotalMinCountMaps; + for (const auto &BFI : BC.getBinaryFunctions()) { + const BinaryFunction *Function = &BFI.second; + if (Function->empty() || !Function->hasValidProfile()) + continue; + uint64_t FunctionNum = Function->getFunctionNumber(); + FlowMapTy &IncomingMap = TotalIncomingMaps[FunctionNum]; + FlowMapTy &OutgoingMap = TotalOutgoingMaps[FunctionNum]; + FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum]; + FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum]; + for (const BinaryBasicBlock &BB : *Function) { + uint64_t BBNum = BB.getIndex(); + MaxCountMap[BBNum] = std::max(IncomingMap[BBNum], OutgoingMap[BBNum]); + MinCountMap[BBNum] = std::min(IncomingMap[BBNum], OutgoingMap[BBNum]); + } + } + + // Modify TotalMaxCountMaps and TotalMinCountMaps using call counts and + // fill out CallGraphIncomingMap + FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap; + for (const auto &BFI : BC.getBinaryFunctions()) { + const BinaryFunction *Function = &BFI.second; + uint64_t FunctionNum = Function->getFunctionNumber(); + FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum]; + FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum]; + + // Update MaxCountMap, MinCountMap, and CallGraphIncomingMap + auto recordCall = [&](const BinaryBasicBlock *SourceBB, + const MCSymbol *DestSymbol, uint64_t Count) { + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) + Count = 0; + const BinaryFunction *DstFunc = + DestSymbol ? BC.getFunctionForSymbol(DestSymbol) : nullptr; + if (DstFunc) + CallGraphIncomingMap[DstFunc->getFunctionNumber()] += Count; + if (SourceBB) { + unsigned BlockIndex = SourceBB->getIndex(); + MaxCountMap[BlockIndex] = std::max(MaxCountMap[BlockIndex], Count); + MinCountMap[BlockIndex] = std::min(MinCountMap[BlockIndex], Count); + } + }; + + // Get pairs of (symbol, count) for each target at this callsite. + // If the call is to an unknown function the symbol will be nullptr. + // If there is no profiling data the count will be COUNT_NO_PROFILE. + using TargetDesc = std::pair; + using CallInfoTy = std::vector; + auto getCallInfo = [&](const BinaryBasicBlock *BB, const MCInst &Inst) { + CallInfoTy Counts; + const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst); + + // If this is an indirect call use perf data directly. + if (!DstSym && BC.MIB->hasAnnotation(Inst, "CallProfile")) { + const auto &ICSP = BC.MIB->getAnnotationAs( + Inst, "CallProfile"); + for (const IndirectCallProfile &CSI : ICSP) + if (CSI.Symbol) + Counts.emplace_back(CSI.Symbol, CSI.Count); + } else { + const uint64_t Count = BB->getExecutionCount(); + Counts.emplace_back(DstSym, Count); + } + + return Counts; + }; + + // If the function has an invalid profile, try to use the perf data + // directly. The call EC is only used to update CallGraphIncomingMap. + if (!Function->hasValidProfile() && !Function->getAllCallSites().empty()) { + for (const IndirectCallProfile &CSI : Function->getAllCallSites()) { + if (!CSI.Symbol) + continue; + recordCall(nullptr, CSI.Symbol, CSI.Count); + } + continue; + } else { + // If the function has a valid profile + for (BinaryBasicBlock &BB : *Function) { + for (MCInst &Inst : BB) { + if (!BC.MIB->isCall(Inst)) + continue; + // Find call instructions and extract target symbols from each + // one. + const CallInfoTy CallInfo = getCallInfo(&BB, Inst); + for (const TargetDesc &CI : CallInfo) { + recordCall(&BB, CI.first, CI.second); + } + } + } + } + } +} + +void printAll(BinaryContext &BC, FunctionListType &ValidFunctions, + size_t NumTopFunctions) { + // Sort the list of functions by execution counts (reverse). + llvm::sort(ValidFunctions, + [&](const BinaryFunction *A, const BinaryFunction *B) { + return A->getKnownExecutionCount() > B->getKnownExecutionCount(); + }); + + const size_t RealNumTopFunctions = + std::min(NumTopFunctions, ValidFunctions.size()); + + iterator_range Functions( + ValidFunctions.begin(), ValidFunctions.begin() + RealNumTopFunctions); + + FlowInfo TotalFlowMap; + computeFlowMappings(BC, TotalFlowMap); + + BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", + RealNumTopFunctions); + printCFGContinuityStats(BC.outs(), Functions); + BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", + RealNumTopFunctions); + printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap); + BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", + RealNumTopFunctions); + printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap); + + // Print more detailed bucketed stats if requested. + if (opts::Verbosity >= 1 && RealNumTopFunctions >= 5) { + const size_t PerBucketSize = RealNumTopFunctions / 5; + BC.outs() << format( + "Detailed stats for 5 buckets, each with %zu functions:\n", + PerBucketSize); + + // For each bucket, print the CFG continuity stats of the functions in + // the bucket. + for (size_t BucketIndex = 0; BucketIndex < 5; ++BucketIndex) { + const size_t StartIndex = BucketIndex * PerBucketSize; + const size_t EndIndex = StartIndex + PerBucketSize; + iterator_range Functions( + ValidFunctions.begin() + StartIndex, + ValidFunctions.begin() + EndIndex); + const size_t MaxFunctionExecutionCount = + ValidFunctions[StartIndex]->getKnownExecutionCount(); + const size_t MinFunctionExecutionCount = + ValidFunctions[EndIndex - 1]->getKnownExecutionCount(); + BC.outs() << format("----------------\n| Bucket %zu: " + "|\n----------------\n", + BucketIndex + 1) + << format( + "execution counts of the %zu functions in the bucket: " + "%zu-%zu\n", + EndIndex - StartIndex, MinFunctionExecutionCount, + MaxFunctionExecutionCount); + printCFGContinuityStats(BC.outs(), Functions); + printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap); + printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap); + } + } +} +} // namespace + +bool PrintProfileQualityStats::shouldOptimize(const BinaryFunction &BF) const { + if (BF.empty() || !BF.hasValidProfile()) + return false; + + return BinaryFunctionPass::shouldOptimize(BF); +} + +Error PrintProfileQualityStats::runOnFunctions(BinaryContext &BC) { + // Create a list of functions with valid profiles. + FunctionListType ValidFunctions; + for (const auto &BFI : BC.getBinaryFunctions()) { + const BinaryFunction *Function = &BFI.second; + if (PrintProfileQualityStats::shouldOptimize(*Function)) + ValidFunctions.push_back(Function); + } + if (ValidFunctions.empty() || opts::NumFunctionsForProfileQualityCheck == 0) + return Error::success(); + + printAll(BC, ValidFunctions, opts::NumFunctionsForProfileQualityCheck); + return Error::success(); +} diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 2d851c751ae10..6f750b95adf9a 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -12,7 +12,6 @@ #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" #include "bolt/Passes/CMOVConversion.h" -#include "bolt/Passes/ContinuityStats.h" #include "bolt/Passes/FixRISCVCallsPass.h" #include "bolt/Passes/FixRelaxationPass.h" #include "bolt/Passes/FrameOptimizer.h" @@ -27,6 +26,7 @@ #include "bolt/Passes/MCF.h" #include "bolt/Passes/PLTCall.h" #include "bolt/Passes/PatchEntries.h" +#include "bolt/Passes/ProfileQualityStats.h" #include "bolt/Passes/RegReAssign.h" #include "bolt/Passes/ReorderData.h" #include "bolt/Passes/ReorderFunctions.h" @@ -58,10 +58,10 @@ extern cl::opt ICF; -static cl::opt -DynoStatsAll("dyno-stats-all", - cl::desc("print dyno stats after each stage"), - cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory)); +static cl::opt DynoStatsAll("dyno-stats-all", + cl::desc("print dyno stats after each stage"), + cl::ZeroOrMore, cl::Hidden, + cl::cat(BoltCategory)); static cl::opt EliminateUnreachable("eliminate-unreachable", @@ -82,25 +82,24 @@ cl::opt cl::opt NeverPrint("never-print", cl::desc("never print"), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt -PrintAfterBranchFixup("print-after-branch-fixup", - cl::desc("print function after fixing local branches"), - cl::Hidden, cl::cat(BoltOptCategory)); +cl::opt PrintAfterBranchFixup( + "print-after-branch-fixup", + cl::desc("print function after fixing local branches"), cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintAfterLowering("print-after-lowering", - cl::desc("print function after instruction lowering"), - cl::Hidden, cl::cat(BoltOptCategory)); + PrintAfterLowering("print-after-lowering", + cl::desc("print function after instruction lowering"), + cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt PrintEstimateEdgeCounts( "print-estimate-edge-counts", cl::desc("print function after edge counts are set for no-LBR profile"), cl::Hidden, cl::cat(BoltOptCategory)); -cl::opt -PrintFinalized("print-finalized", - cl::desc("print function after CFG is finalized"), - cl::Hidden, cl::cat(BoltOptCategory)); +cl::opt PrintFinalized("print-finalized", + cl::desc("print function after CFG is finalized"), + cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt PrintFOP("print-fop", @@ -235,12 +234,13 @@ static cl::opt SimplifyRODataLoads( "operand with the constant found in the corresponding section"), cl::cat(BoltOptCategory)); -static cl::list -SpecializeMemcpy1("memcpy1-spec", - cl::desc("list of functions with call sites for which to specialize memcpy() " - "for size 1"), - cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), - cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::list SpecializeMemcpy1( + "memcpy1-spec", + cl::desc( + "list of functions with call sites for which to specialize memcpy() " + "for size 1"), + cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt Stoke("stoke", cl::desc("turn on the stoke analysis"), cl::cat(BoltOptCategory)); @@ -379,7 +379,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { if (opts::PrintProfileStats) Manager.registerPass(std::make_unique(NeverPrint)); - Manager.registerPass(std::make_unique(NeverPrint)); + Manager.registerPass(std::make_unique(NeverPrint)); Manager.registerPass(std::make_unique(NeverPrint)); diff --git a/bolt/test/X86/cfg-discontinuity-reporting.test b/bolt/test/X86/cfg-discontinuity-reporting.test deleted file mode 100644 index 4d7d3305cdb75..0000000000000 --- a/bolt/test/X86/cfg-discontinuity-reporting.test +++ /dev/null @@ -1,4 +0,0 @@ -## Check profile discontinuity reporting -RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe -RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s -CHECK: among the hottest 5 functions top 5% function CFG discontinuity is 100.00% diff --git a/bolt/test/X86/profile-quality-reporting.test b/bolt/test/X86/profile-quality-reporting.test new file mode 100644 index 0000000000000..ee4ecb9853dff --- /dev/null +++ b/bolt/test/X86/profile-quality-reporting.test @@ -0,0 +1,6 @@ +## Check profile quality stats reporting +RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe +RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s +CHECK: among the hottest 5 functions top 5% function CFG discontinuity is 100.00% +CHECK: among the hottest 5 functions top 5% call graph flow conservation gap is 60.00% +CHECK: among the hottest 5 functions top 5% CFG flow conservation gap is 45.53% (weighted) and 96.87% (worst) From 41a1c3867bcae29c41f92050eb13075e9cc21869 Mon Sep 17 00:00:00 2001 From: Shatian Wang Date: Wed, 26 Feb 2025 09:25:26 -0800 Subject: [PATCH 2/4] fixup! [BOLT] Flow conservation scores --- bolt/lib/Passes/ProfileQualityStats.cpp | 204 +++++++++---------- bolt/test/X86/profile-quality-reporting.test | 4 +- 2 files changed, 101 insertions(+), 107 deletions(-) diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp index a42b4dc27947a..2f827847913ae 100644 --- a/bolt/lib/Passes/ProfileQualityStats.cpp +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -19,8 +19,6 @@ #include #include -#define DEBUG_TYPE "bolt-opts" - using namespace llvm; using namespace bolt; @@ -34,7 +32,7 @@ cl::opt NumFunctionsForProfileQualityCheck( cl::opt PercentileForProfileQualityCheck( "percentile-for-profile-quality-check", cl::desc("Percentile of profile quality distributions over hottest " - "functions to display."), + "functions to report."), cl::init(95), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); } // namespace opts @@ -94,8 +92,7 @@ void printCFGContinuityStats(raw_ostream &OS, std::vector SumECUnreachables; std::vector FractionECUnreachables; - for (auto it = Functions.begin(); it != Functions.end(); ++it) { - const BinaryFunction *Function = *it; + for (const BinaryFunction *Function : Functions) { if (Function->size() <= 1) continue; @@ -104,28 +101,32 @@ void printCFGContinuityStats(raw_ostream &OS, size_t SumAllBBEC = 0; for (const BinaryBasicBlock &BB : *Function) { const size_t BBEC = BB.getKnownExecutionCount(); - NumPosECBBs += BBEC > 0 ? 1 : 0; + NumPosECBBs += !!BBEC; SumAllBBEC += BBEC; } // Perform BFS on subgraph of CFG induced by positive weight edges. // Compute the number of BBs reachable from the entry(s) of the function and // the sum of their execution counts (ECs). - std::unordered_map IndexToBB; std::unordered_set Visited; std::queue Queue; - for (const BinaryBasicBlock &BB : *Function) { - // Make sure BB.getIndex() is not already in IndexToBB. - assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end()); - IndexToBB[BB.getIndex()] = &BB; - if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) { - Queue.push(BB.getIndex()); - Visited.insert(BB.getIndex()); + size_t SumReachableBBEC = 0; + + Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) { + const BinaryBasicBlock *EntryBB = Function->getBasicBlockAtOffset(Offset); + if (EntryBB && EntryBB->getKnownExecutionCount() > 0) { + Queue.push(EntryBB->getLayoutIndex()); + Visited.insert(EntryBB->getLayoutIndex()); + SumReachableBBEC += EntryBB->getKnownExecutionCount(); } - } + return true; + }); + + const FunctionLayout &Layout = Function->getLayout(); + while (!Queue.empty()) { const unsigned BBIndex = Queue.front(); - const BinaryBasicBlock *BB = IndexToBB[BBIndex]; + const BinaryBasicBlock *BB = Layout.getBlock(BBIndex); Queue.pop(); auto SuccBIIter = BB->branch_info_begin(); for (const BinaryBasicBlock *Succ : BB->successors()) { @@ -134,25 +135,18 @@ void printCFGContinuityStats(raw_ostream &OS, ++SuccBIIter; continue; } - if (!Visited.insert(Succ->getIndex()).second) { + if (!Visited.insert(Succ->getLayoutIndex()).second) { ++SuccBIIter; continue; } - Queue.push(Succ->getIndex()); + SumReachableBBEC += Succ->getKnownExecutionCount(); + Queue.push(Succ->getLayoutIndex()); ++SuccBIIter; } } const size_t NumReachableBBs = Visited.size(); - // Loop through Visited, and sum the corresponding BBs' execution counts - // (ECs). - size_t SumReachableBBEC = 0; - for (const unsigned BBIndex : Visited) { - const BinaryBasicBlock *BB = IndexToBB[BBIndex]; - SumReachableBBEC += BB->getKnownExecutionCount(); - } - const size_t NumPosECBBsUnreachableFromEntry = NumPosECBBs - NumReachableBBs; const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC; @@ -162,7 +156,8 @@ void printCFGContinuityStats(raw_ostream &OS, if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) { OS << "Non-trivial CFG discontinuity observed in function " << Function->getPrintName() << "\n"; - LLVM_DEBUG(Function->dump()); + if (opts::Verbosity >= 3) + Function->dump(); } NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry); @@ -176,11 +171,10 @@ void printCFGContinuityStats(raw_ostream &OS, std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end()); const int Rank = int(FractionECUnreachables.size() * opts::PercentileForProfileQualityCheck / 100); - OS << format("top %zu%% function CFG discontinuity is %.2lf%%\n", - 100 - opts::PercentileForProfileQualityCheck, + OS << format("function CFG discontinuity %.2lf%%; ", FractionECUnreachables[Rank] * 100); if (opts::Verbosity >= 1) { - OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n" + OS << "\nabbreviations: EC = execution count, POS BBs = positive EC BBs\n" << "distribution of NUM(unreachable POS BBs) per function\n"; std::sort(NumUnreachables.begin(), NumUnreachables.end()); printDistribution(OS, NumUnreachables); @@ -200,8 +194,7 @@ void printCallGraphFlowConservationStats( FlowInfo &TotalFlowMap) { std::vector CallGraphGaps; - for (auto it = Functions.begin(); it != Functions.end(); ++it) { - const BinaryFunction *Function = *it; + for (const BinaryFunction *Function : Functions) { if (Function->size() <= 1 || !Function->isSimple()) continue; @@ -223,20 +216,22 @@ void printCallGraphFlowConservationStats( continue; NumConsideredEntryBlocks++; - EntryInflow += IncomingMap[BB.getIndex()]; - EntryOutflow += OutgoingMap[BB.getIndex()]; + EntryInflow += IncomingMap[BB.getLayoutIndex()]; + EntryOutflow += OutgoingMap[BB.getLayoutIndex()]; } } uint64_t NetEntryOutflow = 0; if (EntryOutflow < EntryInflow) { - if (opts::Verbosity >= 1) { + if (opts::Verbosity >= 2) { // We expect entry blocks' CFG outflow >= inflow, i.e., it has a // non-negative net outflow. If this is not the case, then raise a // warning if requested. - OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow in " + OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow " + "in " "function " << Function->getPrintName() << "\n"; - LLVM_DEBUG(Function->dump()); + if (opts::Verbosity >= 3) + Function->dump(); } } else { NetEntryOutflow = EntryOutflow - EntryInflow; @@ -252,7 +247,8 @@ void printCallGraphFlowConservationStats( OS << "Nontrivial call graph gap of size " << format("%.2lf%%", 100 * CallGraphGap) << " observed in function " << Function->getPrintName() << "\n"; - LLVM_DEBUG(Function->dump()); + if (opts::Verbosity >= 3) + Function->dump(); } CallGraphGaps.push_back(CallGraphGap); @@ -260,17 +256,17 @@ void printCallGraphFlowConservationStats( } } - if (!CallGraphGaps.empty()) { - std::sort(CallGraphGaps.begin(), CallGraphGaps.end()); - const int Rank = int(CallGraphGaps.size() * - opts::PercentileForProfileQualityCheck / 100); - OS << format("top %zu%% call graph flow conservation gap is %.2lf%%\n", - 100 - opts::PercentileForProfileQualityCheck, - CallGraphGaps[Rank] * 100); - if (opts::Verbosity >= 1) { - OS << "distribution of function entry flow conservation gaps\n"; - printDistribution(OS, CallGraphGaps, /*Fraction=*/true); - } + if (CallGraphGaps.empty()) + return; + + std::sort(CallGraphGaps.begin(), CallGraphGaps.end()); + const int Rank = + int(CallGraphGaps.size() * opts::PercentileForProfileQualityCheck / 100); + OS << format("call graph flow conservation gap %.2lf%%; ", + CallGraphGaps[Rank] * 100); + if (opts::Verbosity >= 1) { + OS << "\ndistribution of function entry flow conservation gaps\n"; + printDistribution(OS, CallGraphGaps, /*Fraction=*/true); } } @@ -281,8 +277,7 @@ void printCFGFlowConservationStats(raw_ostream &OS, std::vector CFGGapsWorst; std::vector CFGGapsWorstAbs; - for (auto it = Functions.begin(); it != Functions.end(); ++it) { - const BinaryFunction *Function = *it; + for (const BinaryFunction *Function : Functions) { if (Function->size() <= 1 || !Function->isSimple()) continue; @@ -301,8 +296,8 @@ void printCFGFlowConservationStats(raw_ostream &OS, if (BB.isEntryPoint() || BB.succ_size() == 0) continue; - const uint64_t Max = MaxCountMaps[BB.getIndex()]; - const uint64_t Min = MinCountMaps[BB.getIndex()]; + const uint64_t Max = MaxCountMaps[BB.getLayoutIndex()]; + const uint64_t Min = MinCountMaps[BB.getLayoutIndex()]; const double Gap = 1 - (double)Min / Max; double Weight = BB.getKnownExecutionCount() * BB.getNumNonPseudos(); if (Weight == 0) @@ -335,7 +330,8 @@ void printCFGFlowConservationStats(raw_ostream &OS, OS << "Worst gap (absolute value): " << WorstGapAbs << " at BB with " << "input offset 0x" << Twine::utohexstr(BBWorstGapAbs->getInputOffset()) << "\n"; - LLVM_DEBUG(Function->dump()); + if (opts::Verbosity >= 3) + Function->dump(); } CFGGapsWeightedAvg.push_back(WeightedGap); @@ -344,30 +340,27 @@ void printCFGFlowConservationStats(raw_ostream &OS, } } - if (!CFGGapsWeightedAvg.empty()) { - std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end()); - const int RankWA = int(CFGGapsWeightedAvg.size() * - opts::PercentileForProfileQualityCheck / 100); - std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end()); - const int RankW = - int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100); - OS << format( - "top %zu%% CFG flow conservation gap is %.2lf%% (weighted) and " - "%.2lf%% (worst)\n", - 100 - opts::PercentileForProfileQualityCheck, - CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100); - if (opts::Verbosity >= 1) { - OS << "distribution of weighted CFG flow conservation gaps\n"; - printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true); - OS << "Consider only blocks with execution counts > 500:\n" - << "distribution of worst block flow conservation gap per " - "function \n"; - printDistribution(OS, CFGGapsWorst, /*Fraction=*/true); - OS << "distribution of worst block flow conservation gap (absolute " - "value) per function\n"; - std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end()); - printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false); - } + if (CFGGapsWeightedAvg.empty()) + return; + std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end()); + const int RankWA = int(CFGGapsWeightedAvg.size() * + opts::PercentileForProfileQualityCheck / 100); + std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end()); + const int RankW = + int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100); + OS << format("CFG flow conservation gap %.2lf%% (weighted) %.2lf%% (worst)\n", + CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100); + if (opts::Verbosity >= 1) { + OS << "distribution of weighted CFG flow conservation gaps\n"; + printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true); + OS << "Consider only blocks with execution counts > 500:\n" + << "distribution of worst block flow conservation gap per " + "function \n"; + printDistribution(OS, CFGGapsWorst, /*Fraction=*/true); + OS << "distribution of worst block flow conservation gap (absolute " + "value) per function\n"; + std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end()); + printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false); } } @@ -391,10 +384,10 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { continue; } TotalOutgoing += Count; - IncomingMap[Succ->getIndex()] += Count; + IncomingMap[Succ->getLayoutIndex()] += Count; ++SuccBIIter; } - OutgoingMap[BB.getIndex()] = TotalOutgoing; + OutgoingMap[BB.getLayoutIndex()] = TotalOutgoing; } } @@ -412,7 +405,7 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum]; FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum]; for (const BinaryBasicBlock &BB : *Function) { - uint64_t BBNum = BB.getIndex(); + uint64_t BBNum = BB.getLayoutIndex(); MaxCountMap[BBNum] = std::max(IncomingMap[BBNum], OutgoingMap[BBNum]); MinCountMap[BBNum] = std::min(IncomingMap[BBNum], OutgoingMap[BBNum]); } @@ -429,7 +422,8 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { // Update MaxCountMap, MinCountMap, and CallGraphIncomingMap auto recordCall = [&](const BinaryBasicBlock *SourceBB, - const MCSymbol *DestSymbol, uint64_t Count) { + const MCSymbol *DestSymbol, uint64_t Count, + uint64_t TotalCallCount) { if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) Count = 0; const BinaryFunction *DstFunc = @@ -437,9 +431,11 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { if (DstFunc) CallGraphIncomingMap[DstFunc->getFunctionNumber()] += Count; if (SourceBB) { - unsigned BlockIndex = SourceBB->getIndex(); - MaxCountMap[BlockIndex] = std::max(MaxCountMap[BlockIndex], Count); - MinCountMap[BlockIndex] = std::min(MinCountMap[BlockIndex], Count); + unsigned BlockIndex = SourceBB->getLayoutIndex(); + MaxCountMap[BlockIndex] = + std::max(MaxCountMap[BlockIndex], TotalCallCount); + MinCountMap[BlockIndex] = + std::min(MinCountMap[BlockIndex], TotalCallCount); } }; @@ -452,7 +448,6 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { CallInfoTy Counts; const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst); - // If this is an indirect call use perf data directly. if (!DstSym && BC.MIB->hasAnnotation(Inst, "CallProfile")) { const auto &ICSP = BC.MIB->getAnnotationAs( Inst, "CallProfile"); @@ -471,22 +466,25 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { // directly. The call EC is only used to update CallGraphIncomingMap. if (!Function->hasValidProfile() && !Function->getAllCallSites().empty()) { for (const IndirectCallProfile &CSI : Function->getAllCallSites()) { - if (!CSI.Symbol) - continue; - recordCall(nullptr, CSI.Symbol, CSI.Count); + if (CSI.Symbol) + recordCall(nullptr, CSI.Symbol, CSI.Count, CSI.Count); } continue; } else { // If the function has a valid profile - for (BinaryBasicBlock &BB : *Function) { - for (MCInst &Inst : BB) { - if (!BC.MIB->isCall(Inst)) - continue; - // Find call instructions and extract target symbols from each - // one. - const CallInfoTy CallInfo = getCallInfo(&BB, Inst); - for (const TargetDesc &CI : CallInfo) { - recordCall(&BB, CI.first, CI.second); + for (const BinaryBasicBlock &BB : *Function) { + for (const MCInst &Inst : BB) { + if (BC.MIB->isCall(Inst)) { + // Find call instructions and extract target symbols from each + // one. + const CallInfoTy CallInfo = getCallInfo(&BB, Inst); + // We need the total call count to update MaxCountMap and + // MinCountMap in recordCall for indirect calls + uint64_t TotalCallCount = 0; + for (const TargetDesc &CI : CallInfo) + TotalCallCount += CI.second; + for (const TargetDesc &CI : CallInfo) + recordCall(&BB, CI.first, CI.second, TotalCallCount); } } } @@ -511,14 +509,12 @@ void printAll(BinaryContext &BC, FunctionListType &ValidFunctions, FlowInfo TotalFlowMap; computeFlowMappings(BC, TotalFlowMap); - BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", - RealNumTopFunctions); + BC.outs() << format("BOLT-INFO: profile quality metrics for the hottest %zu " + "functions (reporting top %zu%% values): ", + RealNumTopFunctions, + 100 - opts::PercentileForProfileQualityCheck); printCFGContinuityStats(BC.outs(), Functions); - BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", - RealNumTopFunctions); printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap); - BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", - RealNumTopFunctions); printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap); // Print more detailed bucketed stats if requested. diff --git a/bolt/test/X86/profile-quality-reporting.test b/bolt/test/X86/profile-quality-reporting.test index ee4ecb9853dff..2e15a6b245afa 100644 --- a/bolt/test/X86/profile-quality-reporting.test +++ b/bolt/test/X86/profile-quality-reporting.test @@ -1,6 +1,4 @@ ## Check profile quality stats reporting RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s -CHECK: among the hottest 5 functions top 5% function CFG discontinuity is 100.00% -CHECK: among the hottest 5 functions top 5% call graph flow conservation gap is 60.00% -CHECK: among the hottest 5 functions top 5% CFG flow conservation gap is 45.53% (weighted) and 96.87% (worst) +CHECK: profile quality metrics for the hottest 5 functions (reporting top 5% values): function CFG discontinuity 100.00%; call graph flow conservation gap 60.00%; CFG flow conservation gap 45.53% (weighted) 96.87% (worst) From 94eee67048447b79c3ad8d5ee9387a7bb8b3a96d Mon Sep 17 00:00:00 2001 From: Shatian Wang Date: Thu, 27 Feb 2025 11:53:02 -0800 Subject: [PATCH 3/4] fixup! fixup! [BOLT] Flow conservation scores --- .../include/bolt/Passes/ProfileQualityStats.h | 2 +- bolt/lib/Passes/ProfileQualityStats.cpp | 224 +++++++++--------- bolt/lib/Rewrite/BinaryPassManager.cpp | 42 ++-- 3 files changed, 135 insertions(+), 133 deletions(-) diff --git a/bolt/include/bolt/Passes/ProfileQualityStats.h b/bolt/include/bolt/Passes/ProfileQualityStats.h index 761fd33431a46..86fc88cefc10e 100644 --- a/bolt/include/bolt/Passes/ProfileQualityStats.h +++ b/bolt/include/bolt/Passes/ProfileQualityStats.h @@ -56,7 +56,7 @@ // CFG flow conservation property. // // The default value of 1000 above can be changed via the hidden BOLT option -// `-num-functions-for-profile-quality-check=[N]`. +// `-top-functions-for-profile-quality-check=[N]`. // The default reporting of the 95th percentile can be changed via the hidden // BOLT option `-percentile-for-profile-quality-check=[M]`. // diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp index 2f827847913ae..a5d0a621c37ce 100644 --- a/bolt/lib/Passes/ProfileQualityStats.cpp +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -24,8 +24,8 @@ using namespace bolt; namespace opts { extern cl::opt Verbosity; -cl::opt NumFunctionsForProfileQualityCheck( - "num-functions-for-profile-quality-check", +cl::opt TopFunctionsForProfileQualityCheck( + "top-functions-for-profile-quality-check", cl::desc("number of hottest functions to print aggregated " "profile quality stats of."), cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); @@ -40,18 +40,16 @@ namespace { using FunctionListType = std::vector; using function_iterator = FunctionListType::iterator; -// BB index -> flow count -using FlowMapTy = std::unordered_map; -// Function number -> FlowMapTy -using TotalFlowMapTy = std::unordered_map; +// Function number -> vector of flows for BBs in the function +using TotalFlowMapTy = std::unordered_map>; // Function number -> flow count using FunctionFlowMapTy = std::unordered_map; struct FlowInfo { - TotalFlowMapTy TotalIncomingMaps; - TotalFlowMapTy TotalOutgoingMaps; + TotalFlowMapTy TotalIncomingFlows; + TotalFlowMapTy TotalOutgoingFlows; TotalFlowMapTy TotalMaxCountMaps; TotalFlowMapTy TotalMinCountMaps; - FunctionFlowMapTy CallGraphIncomingMap; + FunctionFlowMapTy CallGraphIncomingFlows; }; template @@ -69,7 +67,7 @@ void printDistribution(raw_ostream &OS, std::vector &values, Rank = values.size() - 1; if (Fraction) OS << " " << Text << std::string(11 - Text.length(), ' ') << ": " - << format("%.2lf%%", values[Rank] * 100) << "\n"; + << formatv("{0:P}", values[Rank]) << "\n"; else OS << " " << Text << std::string(11 - Text.length(), ' ') << ": " << values[Rank] << "\n"; @@ -114,11 +112,11 @@ void printCFGContinuityStats(raw_ostream &OS, Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) { const BinaryBasicBlock *EntryBB = Function->getBasicBlockAtOffset(Offset); - if (EntryBB && EntryBB->getKnownExecutionCount() > 0) { - Queue.push(EntryBB->getLayoutIndex()); - Visited.insert(EntryBB->getLayoutIndex()); - SumReachableBBEC += EntryBB->getKnownExecutionCount(); - } + if (!EntryBB || EntryBB->getKnownExecutionCount() == 0) + return true; + Queue.push(EntryBB->getLayoutIndex()); + Visited.insert(EntryBB->getLayoutIndex()); + SumReachableBBEC += EntryBB->getKnownExecutionCount(); return true; }); @@ -128,20 +126,15 @@ void printCFGContinuityStats(raw_ostream &OS, const unsigned BBIndex = Queue.front(); const BinaryBasicBlock *BB = Layout.getBlock(BBIndex); Queue.pop(); - auto SuccBIIter = BB->branch_info_begin(); - for (const BinaryBasicBlock *Succ : BB->successors()) { - const uint64_t Count = SuccBIIter->Count; - if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { - ++SuccBIIter; - continue; - } - if (!Visited.insert(Succ->getLayoutIndex()).second) { - ++SuccBIIter; + for (const auto &Tuple : llvm::zip(BB->successors(), BB->branch_info())) { + const auto *Succ = std::get<0>(Tuple); + const auto &BI = std::get<1>(Tuple); + const uint64_t Count = BI.Count; + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0 || + !Visited.insert(Succ->getLayoutIndex()).second) continue; - } SumReachableBBEC += Succ->getKnownExecutionCount(); Queue.push(Succ->getLayoutIndex()); - ++SuccBIIter; } } @@ -168,19 +161,19 @@ void printCFGContinuityStats(raw_ostream &OS, if (FractionECUnreachables.empty()) return; - std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end()); + llvm::sort(FractionECUnreachables); const int Rank = int(FractionECUnreachables.size() * opts::PercentileForProfileQualityCheck / 100); - OS << format("function CFG discontinuity %.2lf%%; ", - FractionECUnreachables[Rank] * 100); + OS << formatv("function CFG discontinuity {0:P}; ", + FractionECUnreachables[Rank]); if (opts::Verbosity >= 1) { OS << "\nabbreviations: EC = execution count, POS BBs = positive EC BBs\n" << "distribution of NUM(unreachable POS BBs) per function\n"; - std::sort(NumUnreachables.begin(), NumUnreachables.end()); + llvm::sort(NumUnreachables); printDistribution(OS, NumUnreachables); OS << "distribution of SUM_EC(unreachable POS BBs) per function\n"; - std::sort(SumECUnreachables.begin(), SumECUnreachables.end()); + llvm::sort(SumECUnreachables); printDistribution(OS, SumECUnreachables); OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all " @@ -199,27 +192,31 @@ void printCallGraphFlowConservationStats( continue; const uint64_t FunctionNum = Function->getFunctionNumber(); - FlowMapTy &IncomingMap = TotalFlowMap.TotalIncomingMaps[FunctionNum]; - FlowMapTy &OutgoingMap = TotalFlowMap.TotalOutgoingMaps[FunctionNum]; - FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap; + std::vector &IncomingFlows = + TotalFlowMap.TotalIncomingFlows[FunctionNum]; + std::vector &OutgoingFlows = + TotalFlowMap.TotalOutgoingFlows[FunctionNum]; + FunctionFlowMapTy &CallGraphIncomingFlows = + TotalFlowMap.CallGraphIncomingFlows; // Only consider functions that are not a program entry. - if (CallGraphIncomingMap.find(FunctionNum) != CallGraphIncomingMap.end()) { + if (CallGraphIncomingFlows.find(FunctionNum) != + CallGraphIncomingFlows.end()) { uint64_t EntryInflow = 0; uint64_t EntryOutflow = 0; uint32_t NumConsideredEntryBlocks = 0; - for (const BinaryBasicBlock &BB : *Function) { - if (BB.isEntryPoint()) { - // If entry is an exit, then we don't consider it for flow - // conservation - if (BB.succ_size() == 0) - continue; - NumConsideredEntryBlocks++; - EntryInflow += IncomingMap[BB.getLayoutIndex()]; - EntryOutflow += OutgoingMap[BB.getLayoutIndex()]; - } - } + Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) { + const BinaryBasicBlock *EntryBB = + Function->getBasicBlockAtOffset(Offset); + if (!EntryBB || EntryBB->succ_size() == 0) + return true; + NumConsideredEntryBlocks++; + EntryInflow += IncomingFlows[EntryBB->getLayoutIndex()]; + EntryOutflow += OutgoingFlows[EntryBB->getLayoutIndex()]; + return true; + }); + uint64_t NetEntryOutflow = 0; if (EntryOutflow < EntryInflow) { if (opts::Verbosity >= 2) { @@ -227,8 +224,7 @@ void printCallGraphFlowConservationStats( // non-negative net outflow. If this is not the case, then raise a // warning if requested. OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow " - "in " - "function " + "in function " << Function->getPrintName() << "\n"; if (opts::Verbosity >= 3) Function->dump(); @@ -238,15 +234,15 @@ void printCallGraphFlowConservationStats( } if (NumConsideredEntryBlocks > 0) { const uint64_t CallGraphInflow = - TotalFlowMap.CallGraphIncomingMap[Function->getFunctionNumber()]; + TotalFlowMap.CallGraphIncomingFlows[Function->getFunctionNumber()]; const uint64_t Min = std::min(NetEntryOutflow, CallGraphInflow); const uint64_t Max = std::max(NetEntryOutflow, CallGraphInflow); const double CallGraphGap = 1 - (double)Min / Max; if (opts::Verbosity >= 2 && CallGraphGap >= 0.5) { OS << "Nontrivial call graph gap of size " - << format("%.2lf%%", 100 * CallGraphGap) - << " observed in function " << Function->getPrintName() << "\n"; + << formatv("{0:P}", CallGraphGap) << " observed in function " + << Function->getPrintName() << "\n"; if (opts::Verbosity >= 3) Function->dump(); } @@ -259,11 +255,11 @@ void printCallGraphFlowConservationStats( if (CallGraphGaps.empty()) return; - std::sort(CallGraphGaps.begin(), CallGraphGaps.end()); + llvm::sort(CallGraphGaps); const int Rank = int(CallGraphGaps.size() * opts::PercentileForProfileQualityCheck / 100); - OS << format("call graph flow conservation gap %.2lf%%; ", - CallGraphGaps[Rank] * 100); + OS << formatv("call graph flow conservation gap {0:P}; ", + CallGraphGaps[Rank]); if (opts::Verbosity >= 1) { OS << "\ndistribution of function entry flow conservation gaps\n"; printDistribution(OS, CallGraphGaps, /*Fraction=*/true); @@ -282,8 +278,10 @@ void printCFGFlowConservationStats(raw_ostream &OS, continue; const uint64_t FunctionNum = Function->getFunctionNumber(); - FlowMapTy &MaxCountMaps = TotalFlowMap.TotalMaxCountMaps[FunctionNum]; - FlowMapTy &MinCountMaps = TotalFlowMap.TotalMinCountMaps[FunctionNum]; + std::vector &MaxCountMaps = + TotalFlowMap.TotalMaxCountMaps[FunctionNum]; + std::vector &MinCountMaps = + TotalFlowMap.TotalMinCountMaps[FunctionNum]; double WeightedGapSum = 0.0; double WeightSum = 0.0; double WorstGap = 0.0; @@ -321,9 +319,9 @@ void printCFGFlowConservationStats(raw_ostream &OS, if (opts::Verbosity >= 2 && (WeightedGap >= 0.1 || WorstGap >= 0.9)) { OS << "Nontrivial CFG gap observed in function " << Function->getPrintName() << "\n" - << "Weighted gap: " << format("%.2lf%%", 100 * WeightedGap) << "\n"; + << "Weighted gap: " << formatv("{0:P}", WeightedGap) << "\n"; if (BBWorstGap) - OS << "Worst gap: " << format("%.2lf%%", 100 * WorstGap) + OS << "Worst gap: " << formatv("{0:P}", WorstGap) << " at BB with input offset: 0x" << Twine::utohexstr(BBWorstGap->getInputOffset()) << "\n"; if (BBWorstGapAbs) @@ -342,14 +340,14 @@ void printCFGFlowConservationStats(raw_ostream &OS, if (CFGGapsWeightedAvg.empty()) return; - std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end()); + llvm::sort(CFGGapsWeightedAvg); const int RankWA = int(CFGGapsWeightedAvg.size() * opts::PercentileForProfileQualityCheck / 100); - std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end()); + llvm::sort(CFGGapsWorst); const int RankW = int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100); - OS << format("CFG flow conservation gap %.2lf%% (weighted) %.2lf%% (worst)\n", - CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100); + OS << formatv("CFG flow conservation gap {0:P} (weighted) {1:P} (worst)\n", + CFGGapsWeightedAvg[RankWA], CFGGapsWorst[RankW]); if (opts::Verbosity >= 1) { OS << "distribution of weighted CFG flow conservation gaps\n"; printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true); @@ -359,68 +357,74 @@ void printCFGFlowConservationStats(raw_ostream &OS, printDistribution(OS, CFGGapsWorst, /*Fraction=*/true); OS << "distribution of worst block flow conservation gap (absolute " "value) per function\n"; - std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end()); + llvm::sort(CFGGapsWorstAbs); printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false); } } void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { // Increment block inflow and outflow with CFG jump counts. - TotalFlowMapTy &TotalIncomingMaps = TotalFlowMap.TotalIncomingMaps; - TotalFlowMapTy &TotalOutgoingMaps = TotalFlowMap.TotalOutgoingMaps; + TotalFlowMapTy &TotalIncomingFlows = TotalFlowMap.TotalIncomingFlows; + TotalFlowMapTy &TotalOutgoingFlows = TotalFlowMap.TotalOutgoingFlows; for (const auto &BFI : BC.getBinaryFunctions()) { const BinaryFunction *Function = &BFI.second; + std::vector &IncomingFlows = + TotalIncomingFlows[Function->getFunctionNumber()]; + std::vector &OutgoingFlows = + TotalOutgoingFlows[Function->getFunctionNumber()]; + const uint64_t NumBlocks = Function->size(); + IncomingFlows.resize(NumBlocks, 0); + OutgoingFlows.resize(NumBlocks, 0); if (Function->empty() || !Function->hasValidProfile()) continue; - FlowMapTy &IncomingMap = TotalIncomingMaps[Function->getFunctionNumber()]; - FlowMapTy &OutgoingMap = TotalOutgoingMaps[Function->getFunctionNumber()]; for (const BinaryBasicBlock &BB : *Function) { uint64_t TotalOutgoing = 0ULL; - auto SuccBIIter = BB.branch_info_begin(); - for (BinaryBasicBlock *Succ : BB.successors()) { - const uint64_t Count = SuccBIIter->Count; - if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { - ++SuccBIIter; + for (const auto &Tuple : llvm::zip(BB.successors(), BB.branch_info())) { + const auto *Succ = std::get<0>(Tuple); + const auto &BI = std::get<1>(Tuple); + const uint64_t Count = BI.Count; + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) continue; - } TotalOutgoing += Count; - IncomingMap[Succ->getLayoutIndex()] += Count; - ++SuccBIIter; + IncomingFlows[Succ->getLayoutIndex()] += Count; } - OutgoingMap[BB.getLayoutIndex()] = TotalOutgoing; + OutgoingFlows[BB.getLayoutIndex()] = TotalOutgoing; } } - // Initialize TotalMaxCountMaps and TotalMinCountMaps using - // TotalIncomingMaps and TotalOutgoingMaps + // TotalIncomingFlows and TotalOutgoingFlows TotalFlowMapTy &TotalMaxCountMaps = TotalFlowMap.TotalMaxCountMaps; TotalFlowMapTy &TotalMinCountMaps = TotalFlowMap.TotalMinCountMaps; for (const auto &BFI : BC.getBinaryFunctions()) { const BinaryFunction *Function = &BFI.second; + uint64_t FunctionNum = Function->getFunctionNumber(); + std::vector &IncomingFlows = TotalIncomingFlows[FunctionNum]; + std::vector &OutgoingFlows = TotalOutgoingFlows[FunctionNum]; + std::vector &MaxCountMap = TotalMaxCountMaps[FunctionNum]; + std::vector &MinCountMap = TotalMinCountMaps[FunctionNum]; + const uint64_t NumBlocks = Function->size(); + MaxCountMap.resize(NumBlocks, 0); + MinCountMap.resize(NumBlocks, 0); if (Function->empty() || !Function->hasValidProfile()) continue; - uint64_t FunctionNum = Function->getFunctionNumber(); - FlowMapTy &IncomingMap = TotalIncomingMaps[FunctionNum]; - FlowMapTy &OutgoingMap = TotalOutgoingMaps[FunctionNum]; - FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum]; - FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum]; for (const BinaryBasicBlock &BB : *Function) { uint64_t BBNum = BB.getLayoutIndex(); - MaxCountMap[BBNum] = std::max(IncomingMap[BBNum], OutgoingMap[BBNum]); - MinCountMap[BBNum] = std::min(IncomingMap[BBNum], OutgoingMap[BBNum]); + MaxCountMap[BBNum] = std::max(IncomingFlows[BBNum], OutgoingFlows[BBNum]); + MinCountMap[BBNum] = std::min(IncomingFlows[BBNum], OutgoingFlows[BBNum]); } } // Modify TotalMaxCountMaps and TotalMinCountMaps using call counts and - // fill out CallGraphIncomingMap - FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap; + // fill out CallGraphIncomingFlows + FunctionFlowMapTy &CallGraphIncomingFlows = + TotalFlowMap.CallGraphIncomingFlows; for (const auto &BFI : BC.getBinaryFunctions()) { const BinaryFunction *Function = &BFI.second; uint64_t FunctionNum = Function->getFunctionNumber(); - FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum]; - FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum]; + std::vector &MaxCountMap = TotalMaxCountMaps[FunctionNum]; + std::vector &MinCountMap = TotalMinCountMaps[FunctionNum]; - // Update MaxCountMap, MinCountMap, and CallGraphIncomingMap + // Update MaxCountMap, MinCountMap, and CallGraphIncomingFlows auto recordCall = [&](const BinaryBasicBlock *SourceBB, const MCSymbol *DestSymbol, uint64_t Count, uint64_t TotalCallCount) { @@ -429,7 +433,7 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { const BinaryFunction *DstFunc = DestSymbol ? BC.getFunctionForSymbol(DestSymbol) : nullptr; if (DstFunc) - CallGraphIncomingMap[DstFunc->getFunctionNumber()] += Count; + CallGraphIncomingFlows[DstFunc->getFunctionNumber()] += Count; if (SourceBB) { unsigned BlockIndex = SourceBB->getLayoutIndex(); MaxCountMap[BlockIndex] = @@ -449,9 +453,8 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst); if (!DstSym && BC.MIB->hasAnnotation(Inst, "CallProfile")) { - const auto &ICSP = BC.MIB->getAnnotationAs( - Inst, "CallProfile"); - for (const IndirectCallProfile &CSI : ICSP) + for (const auto &CSI : BC.MIB->getAnnotationAs( + Inst, "CallProfile")) if (CSI.Symbol) Counts.emplace_back(CSI.Symbol, CSI.Count); } else { @@ -463,29 +466,28 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { }; // If the function has an invalid profile, try to use the perf data - // directly. The call EC is only used to update CallGraphIncomingMap. + // directly. The call EC is only used to update CallGraphIncomingFlows. if (!Function->hasValidProfile() && !Function->getAllCallSites().empty()) { - for (const IndirectCallProfile &CSI : Function->getAllCallSites()) { + for (const IndirectCallProfile &CSI : Function->getAllCallSites()) if (CSI.Symbol) recordCall(nullptr, CSI.Symbol, CSI.Count, CSI.Count); - } continue; } else { // If the function has a valid profile for (const BinaryBasicBlock &BB : *Function) { for (const MCInst &Inst : BB) { - if (BC.MIB->isCall(Inst)) { - // Find call instructions and extract target symbols from each - // one. - const CallInfoTy CallInfo = getCallInfo(&BB, Inst); - // We need the total call count to update MaxCountMap and - // MinCountMap in recordCall for indirect calls - uint64_t TotalCallCount = 0; - for (const TargetDesc &CI : CallInfo) - TotalCallCount += CI.second; - for (const TargetDesc &CI : CallInfo) - recordCall(&BB, CI.first, CI.second, TotalCallCount); - } + if (!BC.MIB->isCall(Inst)) + continue; + // Find call instructions and extract target symbols from each + // one. + const CallInfoTy CallInfo = getCallInfo(&BB, Inst); + // We need the total call count to update MaxCountMap and + // MinCountMap in recordCall for indirect calls + uint64_t TotalCallCount = 0; + for (const TargetDesc &CI : CallInfo) + TotalCallCount += CI.second; + for (const TargetDesc &CI : CallInfo) + recordCall(&BB, CI.first, CI.second, TotalCallCount); } } } @@ -567,9 +569,9 @@ Error PrintProfileQualityStats::runOnFunctions(BinaryContext &BC) { if (PrintProfileQualityStats::shouldOptimize(*Function)) ValidFunctions.push_back(Function); } - if (ValidFunctions.empty() || opts::NumFunctionsForProfileQualityCheck == 0) + if (ValidFunctions.empty() || opts::TopFunctionsForProfileQualityCheck == 0) return Error::success(); - printAll(BC, ValidFunctions, opts::NumFunctionsForProfileQualityCheck); + printAll(BC, ValidFunctions, opts::TopFunctionsForProfileQualityCheck); return Error::success(); } diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 6f750b95adf9a..dd48653931eb9 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -58,10 +58,10 @@ extern cl::opt ICF; -static cl::opt DynoStatsAll("dyno-stats-all", - cl::desc("print dyno stats after each stage"), - cl::ZeroOrMore, cl::Hidden, - cl::cat(BoltCategory)); +static cl::opt +DynoStatsAll("dyno-stats-all", + cl::desc("print dyno stats after each stage"), + cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory)); static cl::opt EliminateUnreachable("eliminate-unreachable", @@ -82,24 +82,25 @@ cl::opt cl::opt NeverPrint("never-print", cl::desc("never print"), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt PrintAfterBranchFixup( - "print-after-branch-fixup", - cl::desc("print function after fixing local branches"), cl::Hidden, - cl::cat(BoltOptCategory)); +cl::opt +PrintAfterBranchFixup("print-after-branch-fixup", + cl::desc("print function after fixing local branches"), + cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt - PrintAfterLowering("print-after-lowering", - cl::desc("print function after instruction lowering"), - cl::Hidden, cl::cat(BoltOptCategory)); +PrintAfterLowering("print-after-lowering", + cl::desc("print function after instruction lowering"), + cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt PrintEstimateEdgeCounts( "print-estimate-edge-counts", cl::desc("print function after edge counts are set for no-LBR profile"), cl::Hidden, cl::cat(BoltOptCategory)); -cl::opt PrintFinalized("print-finalized", - cl::desc("print function after CFG is finalized"), - cl::Hidden, cl::cat(BoltOptCategory)); +cl::opt +PrintFinalized("print-finalized", + cl::desc("print function after CFG is finalized"), + cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt PrintFOP("print-fop", @@ -234,13 +235,12 @@ static cl::opt SimplifyRODataLoads( "operand with the constant found in the corresponding section"), cl::cat(BoltOptCategory)); -static cl::list SpecializeMemcpy1( - "memcpy1-spec", - cl::desc( - "list of functions with call sites for which to specialize memcpy() " - "for size 1"), - cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::list +SpecializeMemcpy1("memcpy1-spec", + cl::desc("list of functions with call sites for which to specialize memcpy() " + "for size 1"), + cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt Stoke("stoke", cl::desc("turn on the stoke analysis"), cl::cat(BoltOptCategory)); From c2962d92676d07270240470a3791056083a09d70 Mon Sep 17 00:00:00 2001 From: Shatian Wang Date: Thu, 27 Feb 2025 19:58:27 -0800 Subject: [PATCH 4/4] fixup! fixup! fixup! [BOLT] Flow conservation scores --- bolt/lib/Passes/ProfileQualityStats.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp index a5d0a621c37ce..78e6412f56ba1 100644 --- a/bolt/lib/Passes/ProfileQualityStats.cpp +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -126,9 +126,8 @@ void printCFGContinuityStats(raw_ostream &OS, const unsigned BBIndex = Queue.front(); const BinaryBasicBlock *BB = Layout.getBlock(BBIndex); Queue.pop(); - for (const auto &Tuple : llvm::zip(BB->successors(), BB->branch_info())) { - const auto *Succ = std::get<0>(Tuple); - const auto &BI = std::get<1>(Tuple); + for (const auto &[Succ, BI] : + llvm::zip(BB->successors(), BB->branch_info())) { const uint64_t Count = BI.Count; if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0 || !Visited.insert(Succ->getLayoutIndex()).second) @@ -272,7 +271,9 @@ void printCFGFlowConservationStats(raw_ostream &OS, std::vector CFGGapsWeightedAvg; std::vector CFGGapsWorst; std::vector CFGGapsWorstAbs; - + // We only consider blocks with execution counts > MinBlockCount when + // reporting the distribution of worst gaps. + const uint16_t MinBlockCount = 500; for (const BinaryFunction *Function : Functions) { if (Function->size() <= 1 || !Function->isSimple()) continue; @@ -305,11 +306,12 @@ void printCFGFlowConservationStats(raw_ostream &OS, Weight = log(Weight); WeightedGapSum += Gap * Weight; WeightSum += Weight; - if (BB.getKnownExecutionCount() > 500 && Gap > WorstGap) { + if (BB.getKnownExecutionCount() > MinBlockCount && Gap > WorstGap) { WorstGap = Gap; BBWorstGap = &BB; } - if (BB.getKnownExecutionCount() > 500 && Max - Min > WorstGapAbs) { + if (BB.getKnownExecutionCount() > MinBlockCount && + Max - Min > WorstGapAbs) { WorstGapAbs = Max - Min; BBWorstGapAbs = &BB; } @@ -351,7 +353,8 @@ void printCFGFlowConservationStats(raw_ostream &OS, if (opts::Verbosity >= 1) { OS << "distribution of weighted CFG flow conservation gaps\n"; printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true); - OS << "Consider only blocks with execution counts > 500:\n" + OS << format("Consider only blocks with execution counts > %zu:\n", + MinBlockCount) << "distribution of worst block flow conservation gap per " "function \n"; printDistribution(OS, CFGGapsWorst, /*Fraction=*/true); @@ -379,9 +382,8 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { continue; for (const BinaryBasicBlock &BB : *Function) { uint64_t TotalOutgoing = 0ULL; - for (const auto &Tuple : llvm::zip(BB.successors(), BB.branch_info())) { - const auto *Succ = std::get<0>(Tuple); - const auto &BI = std::get<1>(Tuple); + for (const auto &[Succ, BI] : + llvm::zip(BB.successors(), BB.branch_info())) { const uint64_t Count = BI.Count; if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) continue;