Skip to content

Commit b6508a1

Browse files
committed
[ASTWriter] Detect more non-affecting FileIDs to reduce duplication of source locations
Currently, any FileID that references a module map file that was required for a compilation is considered as affecting. This misses an important opportunity to reduce the source location space taken by the resulting PCM. In particular, consider the situation where the same module map file is passed multiple times in the dependency chain: ```shell $ clang -fmodule-map-file=foo.modulemap ... -o mod1.pcm $ clang -fmodule-map-file=foo.modulemap -fmodule-file=mod1.pcm ... -o mod2.pcm ... $ clang -fmodule-map-file=foo.modulemap -fmodule-file=mod$((N-1)).pcm ... -o mod$N.pcm ``` Because `foo.modulemap` is read before reading any of the `.pcm` files, we have to create a unique `FileID` for it when creating each module. However, when reading the `.pcm` files, we will reuse the `FileID` loaded from it for the same module map file and the `FileID` we created can never be used again, but we will still not mark it as affecting and it will take the source location space in the output PCM. For a chain of N dependencies, this results in the file taking `N * (size of file)` source location space, which could be significant. For examples, we observer internally that some targets that run out of 2GB of source location space end up wasting up to 20% of that space in module maps as described above. I take extra care to still write the InputFile entries for those files, which has not been done before. It is required for ClangScanDeps to properly function. The change in the output of one of the ClangScanDeps tests is attributed to that: we now add a bit more module maps to the output in some tricky cases. E.g. in the test case the module map file is required and is referenced by another top-level module map, adding it is redundant, but should not be breaking.
1 parent 2e30f8d commit b6508a1

File tree

5 files changed

+126
-11
lines changed

5 files changed

+126
-11
lines changed

clang/include/clang/Serialization/ASTWriter.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,9 @@ class ASTWriter : public ASTDeserializationListener,
491491

492492
/// Mapping from a source location entry to whether it is affecting or not.
493493
llvm::BitVector IsSLocAffecting;
494+
/// Mapping from a source location entry to whether it must be included as
495+
/// input file.
496+
llvm::BitVector IsSLocFileEntryAffecting;
494497

495498
/// Mapping from \c FileID to an index into the FileID adjustment table.
496499
std::vector<FileID> NonAffectingFileIDs;

clang/lib/Serialization/ASTReader.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5862,6 +5862,12 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
58625862
}
58635863

58645864
CurrentModule->Kind = Kind;
5865+
// Note that we may be rewriting an existing location and it is important
5866+
// to keep doing that. In particular, we would like to prefer a
5867+
// `DefinitionLoc` loaded from the module file instead of the location
5868+
// created in the current source manager, because it allows the new
5869+
// location to be marked as "unaffecting" when writing and avoid creating
5870+
// duplicate locations for the same module map file.
58655871
CurrentModule->DefinitionLoc = DefinitionLoc;
58665872
CurrentModule->Signature = F.Signature;
58675873
CurrentModule->IsFromModuleFile = true;

clang/lib/Serialization/ASTWriter.cpp

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "clang/AST/TypeLocVisitor.h"
3939
#include "clang/Basic/Diagnostic.h"
4040
#include "clang/Basic/DiagnosticOptions.h"
41+
#include "clang/Basic/FileEntry.h"
4142
#include "clang/Basic/FileManager.h"
4243
#include "clang/Basic/FileSystemOptions.h"
4344
#include "clang/Basic/IdentifierTable.h"
@@ -81,6 +82,7 @@
8182
#include "llvm/ADT/APSInt.h"
8283
#include "llvm/ADT/ArrayRef.h"
8384
#include "llvm/ADT/DenseMap.h"
85+
#include "llvm/ADT/DenseSet.h"
8486
#include "llvm/ADT/Hashing.h"
8587
#include "llvm/ADT/PointerIntPair.h"
8688
#include "llvm/ADT/STLExtras.h"
@@ -166,18 +168,25 @@ static TypeCode getTypeCodeForTypeClass(Type::TypeClass id) {
166168

167169
namespace {
168170

169-
std::optional<std::set<const FileEntry *>>
171+
struct AffectingModuleMaps {
172+
llvm::DenseSet<FileID> DefinitionFileIDs;
173+
llvm::DenseSet<const FileEntry *> DefinitionFiles;
174+
};
175+
176+
std::optional<AffectingModuleMaps>
170177
GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
171178
if (!PP.getHeaderSearchInfo()
172179
.getHeaderSearchOpts()
173180
.ModulesPruneNonAffectingModuleMaps)
174181
return std::nullopt;
175182

176183
const HeaderSearch &HS = PP.getHeaderSearchInfo();
184+
const SourceManager &SM = PP.getSourceManager();
177185
const ModuleMap &MM = HS.getModuleMap();
178186

179-
std::set<const FileEntry *> ModuleMaps;
180-
std::set<const Module *> ProcessedModules;
187+
llvm::DenseSet<FileID> ModuleMaps;
188+
189+
llvm::DenseSet<const Module *> ProcessedModules;
181190
auto CollectModuleMapsForHierarchy = [&](const Module *M) {
182191
M = M->getTopLevelModule();
183192

@@ -192,13 +201,13 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
192201

193202
// The containing module map is affecting, because it's being pointed
194203
// into by Module::DefinitionLoc.
195-
if (auto FE = MM.getContainingModuleMapFile(Mod))
196-
ModuleMaps.insert(*FE);
204+
if (auto F = MM.getContainingModuleMapFileID(Mod); F.isValid())
205+
ModuleMaps.insert(F);
197206
// For inferred modules, the module map that allowed inferring is not
198207
// related to the virtual containing module map file. It did affect the
199208
// compilation, though.
200-
if (auto FE = MM.getModuleMapFileForUniquing(Mod))
201-
ModuleMaps.insert(*FE);
209+
if (auto F = MM.getModuleMapFileIDForUniquing(Mod); F.isValid())
210+
ModuleMaps.insert(F);
202211

203212
for (auto *SubM : Mod->submodules())
204213
Q.push(SubM);
@@ -268,7 +277,16 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
268277
// just ban module map hierarchies where module map defining a (sub)module X
269278
// includes a module map defining a module that's not a submodule of X.
270279

271-
return ModuleMaps;
280+
llvm::DenseSet<const FileEntry *> ModuleFileEntries;
281+
for (FileID MM : ModuleMaps) {
282+
if (auto *FE = SM.getFileEntryForID(MM))
283+
ModuleFileEntries.insert(FE);
284+
}
285+
286+
AffectingModuleMaps R;
287+
R.DefinitionFileIDs = std::move(ModuleMaps);
288+
R.DefinitionFiles = std::move(ModuleFileEntries);
289+
return std::move(R);
272290
}
273291

274292
class ASTTypeWriter {
@@ -1772,7 +1790,7 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
17721790
continue;
17731791

17741792
// Do not emit input files that do not affect current module.
1775-
if (!IsSLocAffecting[I])
1793+
if (!IsSLocFileEntryAffecting[I])
17761794
continue;
17771795

17781796
InputFileEntry Entry(*Cache->OrigEntry);
@@ -4924,6 +4942,7 @@ void ASTWriter::computeNonAffectingInputFiles() {
49244942
unsigned N = SrcMgr.local_sloc_entry_size();
49254943

49264944
IsSLocAffecting.resize(N, true);
4945+
IsSLocFileEntryAffecting.resize(N, true);
49274946

49284947
if (!WritingModule)
49294948
return;
@@ -4960,10 +4979,12 @@ void ASTWriter::computeNonAffectingInputFiles() {
49604979
continue;
49614980

49624981
// Don't prune module maps that are affecting.
4963-
if (llvm::is_contained(*AffectingModuleMaps, *Cache->OrigEntry))
4982+
if (AffectingModuleMaps->DefinitionFileIDs.contains(FID))
49644983
continue;
49654984

49664985
IsSLocAffecting[I] = false;
4986+
IsSLocFileEntryAffecting[I] =
4987+
AffectingModuleMaps->DefinitionFiles.contains(*Cache->OrigEntry);
49674988

49684989
FileIDAdjustment += 1;
49694990
// Even empty files take up one element in the offset table.

clang/test/ClangScanDeps/modules-extern-submodule.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ module third {}
4343
// CHECK-NEXT: "command-line": [
4444
// CHECK-NEXT: "-cc1",
4545
// CHECK: "-fmodule-map-file=[[PREFIX]]/second/second/module.modulemap"
46-
// CHECK-NOT: "-fmodule-map-file=[[PREFIX]]/second/second/sub.modulemap"
46+
// CHECK-NEXT: "-fmodule-map-file=[[PREFIX]]/second/second/sub.modulemap"
4747
// CHECK-NOT: "-fmodule-map-file=[[PREFIX]]/third/module.modulemap"
4848
// CHECK: "-fmodule-file=second=[[PREFIX]]/cache/{{.*}}/second-{{.*}}.pcm"
4949
// CHECK: ],
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Check that the same module map file passed to -fmodule-map-file *and*
2+
// available from one of the `-fmodule-file` does not allocate extra source
3+
// location space. This optimization is important for using module maps in
4+
// large codebases to avoid running out of source location space.
5+
6+
// RUN: rm -rf %t && mkdir %t
7+
// RUN: split-file %s %t
8+
9+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules -fmodule-map-file=%t/base.map -fmodule-name=base -emit-module %t/base.map -o %t/base.pcm
10+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
11+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod1.map \
12+
// RUN: -fmodule-file=%t/base.pcm \
13+
// RUN: -fmodule-name=mod1 -emit-module %t/mod1.map -o %t/mod1.pcm
14+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
15+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod1.map -fmodule-map-file=%t/mod2.map \
16+
// RUN: -fmodule-file=%t/mod1.pcm \
17+
// RUN: -fmodule-name=mod2 -emit-module %t/mod2.map -o %t/mod2.pcm
18+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
19+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod2.map -fmodule-map-file=%t/mod3.map \
20+
// RUN: -fmodule-file=%t/mod2.pcm \
21+
// RUN: -fmodule-name=mod3 -emit-module %t/mod3.map -o %t/mod3.pcm
22+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
23+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod3.map -fmodule-map-file=%t/mod4.map \
24+
// RUN: -fmodule-file=%t/mod3.pcm \
25+
// RUN: -fmodule-name=mod4 -emit-module %t/mod4.map -o %t/mod4.pcm
26+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod4.map -fmodule-file=%t/mod4.pcm -fsyntax-only -verify %t/check_slocs.cc
27+
28+
//--- base.map
29+
module base { header "vector.h" }
30+
//--- mod1.map
31+
module mod1 { header "mod1.h" }
32+
//--- mod2.map
33+
module mod2 { header "mod2.h" }
34+
//--- mod3.map
35+
module mod3 { header "mod3.h" }
36+
//--- mod4.map
37+
module mod4 { header "mod4.h" }
38+
//--- check_slocs.cc
39+
#include "mod4.h"
40+
#pragma clang __debug sloc_usage // expected-remark {{source manager location address space usage}}
41+
// expected-note@* {{% of available space}}
42+
43+
// Module map files files that were specified on the command line are entered twice (once when parsing command-line, once loaded from the .pcm)
44+
// Those that not specified on the command line must be entered once.
45+
46+
// [email protected]:1 {{file entered 2 times}}
47+
// [email protected]:1 {{file entered 2 times}}
48+
// [email protected]:1 {{file entered 1 time}}
49+
// [email protected]:1 {{file entered 1 time}}
50+
// [email protected]:1 {{file entered 1 time}}
51+
// expected-note@* + {{file entered}}
52+
53+
54+
//--- vector.h
55+
#ifndef VECTOR_H
56+
#define VECTOR_H
57+
#endif
58+
59+
//--- mod1.h
60+
#ifndef MOD1
61+
#define MOD1
62+
#include "vector.h"
63+
int mod1();
64+
#endif
65+
//--- mod2.h
66+
#ifndef MOD2
67+
#define MOD2
68+
#include "vector.h"
69+
#include "mod1.h"
70+
int mod2();
71+
#endif
72+
//--- mod3.h
73+
#ifndef MOD3
74+
#define MOD3
75+
#include "vector.h"
76+
#include "mod2.h"
77+
int mod3();
78+
#endif
79+
//--- mod4.h
80+
#ifndef MOD4
81+
#define MOD4
82+
#include "vector.h"
83+
#include "mod3.h"
84+
int mod4();
85+
#endif

0 commit comments

Comments
 (0)