Skip to content

Commit cbd2131

Browse files
fix: Skip non-TU main file command objects (#380)
When the compilation database for a codebase is generated using grailbio/bazel-compilation-database, it ends up generating lots of entries for headers and other template files like 'def' files in LLVM. However, those cannot actually be semantically analyzed without a translation unit, so skip them immediately after parsing.
1 parent 2fe721d commit cbd2131

File tree

7 files changed

+135
-12
lines changed

7 files changed

+135
-12
lines changed

indexer/CompilationDatabase.cc

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -565,7 +565,7 @@ compdb::File::openAndExitOnErrors(const StdPath &path,
565565
}
566566

567567
void ResumableParser::initialize(compdb::File compdb, size_t refillCount,
568-
bool inferResourceDir) {
568+
ParseOptions options) {
569569
auto averageJobSize = compdb.sizeInBytes() / compdb.commandCount();
570570
// Some customers have averageJobSize = 150KiB.
571571
// If numWorkers == 300 (very high core count machine),
@@ -580,7 +580,14 @@ void ResumableParser::initialize(compdb::File compdb, size_t refillCount,
580580
rapidjson::FileReadStream(compdb.file, this->jsonStreamBuffer.data(),
581581
this->jsonStreamBuffer.size());
582582
this->reader.IterativeParseInit();
583-
this->inferResourceDir = inferResourceDir;
583+
this->options = options;
584+
std::vector<std::string> extensions;
585+
// Via https://stackoverflow.com/a/3223792/2682729
586+
for (auto ext : {"c", "C", "cc", "cpp", "CPP", "cxx", "c++"}) {
587+
extensions.emplace_back(llvm::Regex::escape(fmt::format(".{}", ext)));
588+
};
589+
this->fileExtensionRegex =
590+
llvm::Regex(fmt::format(".+({})$", fmt::join(extensions, "|")));
584591
}
585592

586593
void ResumableParser::parseMore(std::vector<compdb::CommandObject> &out,
@@ -633,16 +640,23 @@ void ResumableParser::parseMore(std::vector<compdb::CommandObject> &out,
633640
for (auto &cmd : this->handler->commands) {
634641
cmd.index = this->currentIndex;
635642
++this->currentIndex;
643+
if (this->options.skipNonMainFileEntries) {
644+
if (!this->fileExtensionRegex.match(cmd.filePath)) {
645+
++this->stats.skippedNonTuFileExtension;
646+
continue;
647+
}
648+
}
636649
if (checkFilesExist
637650
&& !doesFileExist(cmd.filePath, cmd.workingDirectory)) {
651+
++this->stats.skippedNonExistentTuFile;
638652
continue;
639653
}
640654
out.emplace_back(std::move(cmd));
641655
}
642656
this->handler->commands.clear();
643657
}
644658

645-
if (this->inferResourceDir) {
659+
if (this->options.inferResourceDir) {
646660
for (auto &cmd : out) {
647661
if (cmd.arguments.empty()) {
648662
continue;

indexer/CompilationDatabase.h

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "spdlog/fmt/fmt.h"
1616
#include "spdlog/spdlog.h"
1717

18+
#include "llvm/Support/Regex.h"
19+
1820
#include "indexer/Derive.h"
1921
#include "indexer/FileSystem.h"
2022

@@ -106,16 +108,35 @@ struct ToolchainConfig {
106108
std::vector<std::string> extraArgs;
107109
};
108110

111+
struct ParseOptions {
112+
bool inferResourceDir;
113+
bool skipNonMainFileEntries;
114+
115+
explicit ParseOptions(bool isTesting = false)
116+
: inferResourceDir(!isTesting), skipNonMainFileEntries(!isTesting) {}
117+
118+
ParseOptions(bool inferResourceDir, bool skipNonMainFileEntries)
119+
: inferResourceDir(inferResourceDir),
120+
skipNonMainFileEntries(skipNonMainFileEntries) {}
121+
};
122+
123+
struct ParseStats {
124+
unsigned skippedNonTuFileExtension = 0;
125+
unsigned skippedNonExistentTuFile = 0;
126+
};
127+
109128
class ResumableParser {
110129
std::string jsonStreamBuffer;
111130
std::optional<rapidjson::FileReadStream> compDbStream;
112131
std::optional<CommandObjectHandler> handler;
113132
rapidjson::Reader reader;
114133
size_t currentIndex = 0;
115134

116-
bool inferResourceDir;
135+
ParseOptions options;
117136
absl::flat_hash_set<std::string> emittedErrors;
118137

138+
llvm::Regex fileExtensionRegex;
139+
119140
/// Mapping from compiler/wrapper path to extra information needed
120141
/// to tweak the command object before invoking the driver.
121142
///
@@ -126,15 +147,16 @@ class ResumableParser {
126147
absl::flat_hash_map<std::string, ToolchainConfig> toolchainConfigMap;
127148

128149
public:
150+
ParseStats stats;
151+
129152
ResumableParser() = default;
130153
ResumableParser(const ResumableParser &) = delete;
131154
ResumableParser &operator=(const ResumableParser &) = delete;
132155

133156
/// If \param inferResourceDir is set, then the parser will automatically
134157
/// add extra '-resource-dir' '<path>' arguments to the parsed
135158
/// CompileCommands' CommandLine field.
136-
void initialize(compdb::File compdb, size_t refillCount,
137-
bool inferResourceDir);
159+
void initialize(compdb::File compdb, size_t refillCount, ParseOptions);
138160

139161
// Parses at most refillCount elements (passed during initialization)
140162
// from the compilation database passed during initialization.

indexer/Driver.cc

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -958,6 +958,15 @@ class Driver {
958958
"{:.1f}s, merging: {:.1f}s, num errored TUs: {}).\n",
959959
numTus.first.value, total.value<secs>(), indexing.value<secs>(),
960960
merging.value<secs>(), numTus.second);
961+
auto parseStats = this->compdbParser.stats;
962+
auto totalSkipped = parseStats.skippedNonExistentTuFile
963+
+ parseStats.skippedNonTuFileExtension;
964+
if (totalSkipped != 0) {
965+
fmt::print("Skipped: {} compilation database entries (non main file "
966+
"extension: {}, not found on disk: {}).\n",
967+
totalSkipped, parseStats.skippedNonTuFileExtension,
968+
parseStats.skippedNonExistentTuFile);
969+
}
961970
}
962971

963972
private:
@@ -1194,12 +1203,14 @@ class Driver {
11941203
this->compdbCommandCount = compdbFile.commandCount();
11951204
this->options.numWorkers =
11961205
std::min(this->compdbCommandCount, this->numWorkers());
1197-
spdlog::debug("total {} compilation jobs", this->compdbCommandCount);
1206+
spdlog::debug("total {} command objects in compilation database",
1207+
this->compdbCommandCount);
11981208

11991209
// FIXME(def: resource-dir-extra): If we're passed in a resource dir
12001210
// as an extra argument, we should not pass it here.
1201-
this->compdbParser.initialize(compdbFile, this->refillCount(),
1202-
!this->options.isTesting);
1211+
this->compdbParser.initialize(
1212+
compdbFile, this->refillCount(),
1213+
compdb::ParseOptions(this->options.isTesting));
12031214
return FileGuard(compdbFile.file);
12041215
}
12051216

indexer/Worker.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,8 @@ Worker::Worker(WorkerOptions &&options)
146146
compdb::ValidationOptions{.checkDirectoryPathsAreAbsolute = true});
147147
compdb::ResumableParser parser{};
148148
// See FIXME(ref: resource-dir-extra)
149-
parser.initialize(compdbFile, std::numeric_limits<size_t>::max(), true);
149+
parser.initialize(compdbFile, std::numeric_limits<size_t>::max(),
150+
compdb::ParseOptions{/*isTesting*/ false});
150151
parser.parseMore(this->compileCommands);
151152
std::fclose(compdbFile.file);
152153
break;

test/compdb/skipping-4.snapshot.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
---
2+
- command:
3+
- abc
4+
filePath: a.c
5+
directory: not_skipped
6+
- command:
7+
- abc
8+
filePath: a.cpp
9+
directory: not_skipped
10+
- command:
11+
- abc
12+
filePath: a.cc
13+
directory: not_skipped
14+
...
15+
---
16+
- command:
17+
- abc
18+
filePath: a.cxx
19+
directory: not_skipped
20+
...

test/compdb/skipping.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
[
2+
{
3+
"command": "abc",
4+
"directory": "not_skipped",
5+
"file": "a.c"
6+
},
7+
{
8+
"command": "abc",
9+
"directory": "not_skipped",
10+
"file": "a.cpp"
11+
},
12+
{
13+
"command": "abc",
14+
"directory": "not_skipped",
15+
"file": "a.cc"
16+
},
17+
{
18+
"command": "abc",
19+
"directory": "skipped",
20+
"file": "foo.h"
21+
},
22+
{
23+
"command": "abc",
24+
"directory": "skipped",
25+
"file": "foo.hpp"
26+
},
27+
{
28+
"command": "abc",
29+
"directory": "skipped",
30+
"file": "foo.hxx"
31+
},
32+
{
33+
"command": "abc",
34+
"directory": "not_skipped",
35+
"file": "a.cxx"
36+
},
37+
{
38+
"command": "abc",
39+
"directory": "skipped",
40+
"file": "foo.def"
41+
},
42+
{
43+
"command": "abc",
44+
"directory": "skipped",
45+
"file": "foo.S"
46+
}
47+
]

test/test_main.cc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ struct CompDbTestCase {
6060
std::string jsonFilename;
6161
size_t checkCount;
6262
std::vector<size_t> refillCountsToTry;
63+
compdb::ParseOptions parseOptions;
6364
};
6465

6566
// Use YAML instead of JSON for easy line-based diffs (a la insta in Rust-land)
@@ -158,7 +159,14 @@ TEST_CASE("COMPDB_PARSING") {
158159
scip_clang::compdb::ResumableParser parser;
159160

160161
std::vector<CompDbTestCase> testCases{};
161-
testCases.push_back(CompDbTestCase{"simple.json", 3, {2, 3, 4}});
162+
testCases.push_back(CompDbTestCase{
163+
"simple.json", 3, {2, 3, 4}, compdb::ParseOptions(/*isTesting*/ true)});
164+
testCases.push_back(
165+
CompDbTestCase{"skipping.json",
166+
9,
167+
{4},
168+
compdb::ParseOptions(/*inferResourceDir*/ false,
169+
/*skipNonMainFileEntries*/ true)});
162170

163171
auto dataDir =
164172
std::filesystem::current_path().append("test").append("compdb");
@@ -182,7 +190,7 @@ TEST_CASE("COMPDB_PARSING") {
182190

183191
for (auto refillCount : testCase.refillCountsToTry) {
184192
compdb::ResumableParser parser{};
185-
parser.initialize(compdbFile, refillCount, false);
193+
parser.initialize(compdbFile, refillCount, testCase.parseOptions);
186194
std::vector<std::vector<compdb::CommandObject>> commandGroups;
187195
std::string buffer;
188196
llvm::raw_string_ostream outStr(buffer);

0 commit comments

Comments
 (0)