From 81bc18e431ff247d55c64576d8776c5ab54bf712 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Thu, 2 Oct 2025 14:55:22 +0200 Subject: [PATCH 1/2] [clang][analyzer] Make per-entry-point metric rows uniquely identifiable Also remove the gratuitoius spaces after "," that break strict CSV compliance. As the debug function name does not uniquely identify an entry point, add the main-TU name and the USR values for each entry point snapshot to reduce the likelyhood of collisions between declarations across large projects. While adding a filename to each row increases the file size substantially, the difference in size for the compressed is acceptable. I evaluated it on our set of 200+ opensource C and C++ projects with 3M entry points, and got the following results when adding these two columns: - Raw CSV file increased from 530MB to 1.1GB - Compressed file (XZ) increased from 54 MB to 78 MB -- CPP-7098 Co-authored-by: Balazs Benics --- .../Core/PathSensitive/EntryPointStats.h | 2 +- .../StaticAnalyzer/Core/EntryPointStats.cpp | 31 ++++++++++++++----- .../Frontend/AnalysisConsumer.cpp | 12 ++++++- .../analyzer-stats/entry-point-stats.cpp | 8 +++-- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h index 633fb7aa8f72d..448e40269ca2d 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h @@ -25,7 +25,7 @@ class EntryPointStat { public: llvm::StringLiteral name() const { return Name; } - static void lockRegistry(); + static void lockRegistry(llvm::StringRef CPPFileName); static void takeSnapshot(const Decl *EntryPoint); static void dumpStatsAsCSV(llvm::raw_ostream &OS); diff --git a/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp b/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp index b7f9044f65308..62ae62f2f2154 100644 --- a/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp +++ b/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp @@ -9,7 +9,9 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h" #include "clang/AST/DeclBase.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "clang/Index/USRGeneration.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/FileSystem.h" @@ -38,6 +40,7 @@ struct Registry { }; std::vector Snapshots; + std::string EscapedCPPFileName; }; } // namespace @@ -69,7 +72,7 @@ static void checkStatName(const EntryPointStat *M) { } } -void EntryPointStat::lockRegistry() { +void EntryPointStat::lockRegistry(llvm::StringRef CPPFileName) { auto CmpByNames = [](const EntryPointStat *L, const EntryPointStat *R) { return L->name() < R->name(); }; @@ -78,6 +81,8 @@ void EntryPointStat::lockRegistry() { enumerateStatVectors( [](const auto &Stats) { llvm::for_each(Stats, checkStatName); }); StatsRegistry->IsLocked = true; + llvm::raw_string_ostream OS(StatsRegistry->EscapedCPPFileName); + llvm::printEscapedString(CPPFileName, OS); } [[maybe_unused]] static bool isRegistered(llvm::StringLiteral Name) { @@ -144,15 +149,27 @@ static std::vector getStatNames() { return Ret; } +static std::string getUSR(const Decl *D) { + llvm::SmallVector Buf; + if (index::generateUSRForDecl(D, Buf)) { + assert(false && "This should never fail"); + return AnalysisDeclContext::getFunctionName(D); + } + return llvm::toStringRef(Buf).str(); +} + void Registry::Snapshot::dumpAsCSV(llvm::raw_ostream &OS) const { OS << '"'; + llvm::printEscapedString(getUSR(EntryPoint), OS); + OS << "\",\""; + OS << StatsRegistry->EscapedCPPFileName << "\",\""; llvm::printEscapedString( clang::AnalysisDeclContext::getFunctionName(EntryPoint), OS); - OS << "\", "; + OS << "\","; auto PrintAsBool = [&OS](bool B) { OS << (B ? "true" : "false"); }; - llvm::interleaveComma(BoolStatValues, OS, PrintAsBool); - OS << ((BoolStatValues.empty() || UnsignedStatValues.empty()) ? "" : ", "); - llvm::interleaveComma(UnsignedStatValues, OS); + llvm::interleave(BoolStatValues, OS, PrintAsBool, ","); + OS << ((BoolStatValues.empty() || UnsignedStatValues.empty()) ? "" : ","); + llvm::interleave(UnsignedStatValues, OS, [&OS](unsigned U) { OS << U; }, ","); } static std::vector consumeBoolStats() { @@ -181,8 +198,8 @@ void EntryPointStat::dumpStatsAsCSV(llvm::StringRef FileName) { } void EntryPointStat::dumpStatsAsCSV(llvm::raw_ostream &OS) { - OS << "EntryPoint, "; - llvm::interleaveComma(getStatNames(), OS); + OS << "USR,File,DebugName,"; + llvm::interleave(getStatNames(), OS, [&OS](const auto &a) { OS << a; }, ","); OS << "\n"; std::vector Rows; diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp index 53466e7a75b0f..9e7538eb34600 100644 --- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp @@ -65,6 +65,15 @@ STAT_MAX(MaxCFGSize, "The maximum number of basic blocks in a function."); namespace { +StringRef getMainFileName(const CompilerInvocation &Invocation) { + if (!Invocation.getFrontendOpts().Inputs.empty()) { + const FrontendInputFile &Input = Invocation.getFrontendOpts().Inputs[0]; + return Input.isFile() ? Input.getFile() + : Input.getBuffer().getBufferIdentifier(); + } + return {}; +} + class AnalysisConsumer : public AnalysisASTConsumer, public DynamicRecursiveASTVisitor { enum { @@ -125,7 +134,8 @@ class AnalysisConsumer : public AnalysisASTConsumer, PP(CI.getPreprocessor()), OutDir(outdir), Opts(opts), Plugins(plugins), Injector(std::move(injector)), CTU(CI), MacroExpansions(CI.getLangOpts()) { - EntryPointStat::lockRegistry(); + + EntryPointStat::lockRegistry(getMainFileName(CI.getInvocation())); DigestAnalyzerOptions(); if (Opts.AnalyzerDisplayProgress || Opts.PrintStats || diff --git a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp index 1ff31d114ee99..9cbe04550a8d3 100644 --- a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp +++ b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp @@ -5,7 +5,9 @@ // RUN: %csv2json "%t.csv" | FileCheck --check-prefix=CHECK %s // // CHECK: { -// CHECK-NEXT: "fib(unsigned int)": { +// CHECK-NEXT: "c:@F@fib#i#": { +// CHECK-NEXT: "File": "{{.*}}entry-point-stats.cpp", +// CHECK-NEXT: "DebugName": "fib(unsigned int)", // CHECK-NEXT: "NumBlocks": "{{[0-9]+}}", // CHECK-NEXT: "NumBlocksUnreachable": "{{[0-9]+}}", // CHECK-NEXT: "NumCTUSteps": "{{[0-9]+}}", @@ -40,7 +42,9 @@ // CHECK-NEXT: "MaxValidBugClassSize": "{{[0-9]+}}", // CHECK-NEXT: "PathRunningTime": "{{[0-9]+}}" // CHECK-NEXT: }, -// CHECK-NEXT: "main(int, char **)": { +// CHECK-NEXT: "c:@F@main#I#**C#": { +// CHECK-NEXT: "File": "{{.*}}entry-point-stats.cpp", +// CHECK-NEXT: "DebugName": "main(int, char **)", // CHECK-NEXT: "NumBlocks": "{{[0-9]+}}", // CHECK-NEXT: "NumBlocksUnreachable": "{{[0-9]+}}", // CHECK-NEXT: "NumCTUSteps": "{{[0-9]+}}", From 7118400ab4c28b8d87a9708ebef2e105910616ba Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Fri, 3 Oct 2025 15:23:41 +0200 Subject: [PATCH 2/2] Return "" if main faile can't be named --- clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp index 9e7538eb34600..2c1d20b2c051a 100644 --- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp @@ -71,7 +71,7 @@ StringRef getMainFileName(const CompilerInvocation &Invocation) { return Input.isFile() ? Input.getFile() : Input.getBuffer().getBufferIdentifier(); } - return {}; + return ""; } class AnalysisConsumer : public AnalysisASTConsumer,