//===- BPSectionOrdererBase.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "lld/Common/BPSectionOrdererBase.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VirtualFileSystem.h"

#define DEBUG_TYPE "bp-section-orderer"

using namespace llvm;
using namespace lld;
24+
25+ using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;
26+
27+ static SmallVector<std::pair<unsigned , UtilityNodes>> getUnsForCompression (
28+ ArrayRef<const BPSectionBase *> sections,
29+ const DenseMap<const void *, uint64_t > §ionToIdx,
30+ ArrayRef<unsigned > sectionIdxs,
31+ DenseMap<unsigned , SmallVector<unsigned >> *duplicateSectionIdxs,
32+ BPFunctionNode::UtilityNodeT &maxUN) {
33+ TimeTraceScope timeScope (" Build nodes for compression" );
34+
35+ SmallVector<std::pair<unsigned , SmallVector<uint64_t >>> sectionHashes;
36+ sectionHashes.reserve (sectionIdxs.size ());
37+ SmallVector<uint64_t > hashes;
38+
39+ for (unsigned sectionIdx : sectionIdxs) {
40+ const auto *isec = sections[sectionIdx];
41+ isec->getSectionHashes (hashes, sectionToIdx);
42+ sectionHashes.emplace_back (sectionIdx, std::move (hashes));
43+ hashes.clear ();
44+ }
45+
46+ DenseMap<uint64_t , unsigned > hashFrequency;
47+ for (auto &[sectionIdx, hashes] : sectionHashes)
48+ for (auto hash : hashes)
49+ ++hashFrequency[hash];
50+
51+ if (duplicateSectionIdxs) {
52+ // Merge sections that are nearly identical
53+ SmallVector<std::pair<unsigned , SmallVector<uint64_t >>> newSectionHashes;
54+ DenseMap<uint64_t , unsigned > wholeHashToSectionIdx;
55+ for (auto &[sectionIdx, hashes] : sectionHashes) {
56+ uint64_t wholeHash = 0 ;
57+ for (auto hash : hashes)
58+ if (hashFrequency[hash] > 5 )
59+ wholeHash ^= hash;
60+ auto [it, wasInserted] =
61+ wholeHashToSectionIdx.insert (std::make_pair (wholeHash, sectionIdx));
62+ if (wasInserted) {
63+ newSectionHashes.emplace_back (sectionIdx, hashes);
64+ } else {
65+ (*duplicateSectionIdxs)[it->getSecond ()].push_back (sectionIdx);
66+ }
67+ }
68+ sectionHashes = newSectionHashes;
69+
70+ // Recompute hash frequencies
71+ hashFrequency.clear ();
72+ for (auto &[sectionIdx, hashes] : sectionHashes)
73+ for (auto hash : hashes)
74+ ++hashFrequency[hash];
75+ }
76+
77+ // Filter rare and common hashes and assign each a unique utility node that
78+ // doesn't conflict with the trace utility nodes
79+ DenseMap<uint64_t , BPFunctionNode::UtilityNodeT> hashToUN;
80+ for (auto &[hash, frequency] : hashFrequency) {
81+ if (frequency <= 1 || frequency * 2 > sectionHashes.size ())
82+ continue ;
83+ hashToUN[hash] = ++maxUN;
84+ }
85+
86+ SmallVector<std::pair<unsigned , UtilityNodes>> sectionUns;
87+ for (auto &[sectionIdx, hashes] : sectionHashes) {
88+ UtilityNodes uns;
89+ for (auto &hash : hashes) {
90+ auto it = hashToUN.find (hash);
91+ if (it != hashToUN.end ())
92+ uns.push_back (it->second );
93+ }
94+ sectionUns.emplace_back (sectionIdx, uns);
95+ }
96+ return sectionUns;
97+ }
98+
99+ llvm::DenseMap<const BPSectionBase *, size_t >
100+ BPSectionBase::reorderSectionsByBalancedPartitioning (
101+ size_t &highestAvailablePriority, llvm::StringRef profilePath,
102+ bool forFunctionCompression, bool forDataCompression,
103+ bool compressionSortStartupFunctions, bool verbose,
104+ SmallVector<std::unique_ptr<BPSectionBase>> &inputSections) {
105+ TimeTraceScope timeScope (" Setup Balanced Partitioning" );
106+ SmallVector<const BPSectionBase *> sections;
107+ DenseMap<const void *, uint64_t > sectionToIdx;
108+ StringMap<DenseSet<unsigned >> symbolToSectionIdxs;
109+
110+ // Process input sections
111+ for (const auto &isec : inputSections) {
112+ if (!isec->hasValidData ())
113+ continue ;
114+
115+ unsigned sectionIdx = sections.size ();
116+ sectionToIdx.try_emplace (isec->getSection (), sectionIdx);
117+ sections.emplace_back (isec.get ());
118+ for (auto &sym : isec->getSymbols ())
119+ symbolToSectionIdxs[sym->getName ()].insert (sectionIdx);
120+ }
121+ StringMap<DenseSet<unsigned >> rootSymbolToSectionIdxs;
122+ for (auto &entry : symbolToSectionIdxs) {
123+ StringRef name = entry.getKey ();
124+ auto §ionIdxs = entry.getValue ();
125+ name = BPSectionBase::getRootSymbol (name);
126+ rootSymbolToSectionIdxs[name].insert (sectionIdxs.begin (),
127+ sectionIdxs.end ());
128+ if (auto resolvedLinkageName =
129+ sections[*sectionIdxs.begin ()]->getResolvedLinkageName (name))
130+ rootSymbolToSectionIdxs[resolvedLinkageName.value ()].insert (
131+ sectionIdxs.begin (), sectionIdxs.end ());
132+ }
133+
134+ BPFunctionNode::UtilityNodeT maxUN = 0 ;
135+ DenseMap<unsigned , UtilityNodes> startupSectionIdxUNs;
136+ // Used to define the initial order for startup functions.
137+ DenseMap<unsigned , size_t > sectionIdxToTimestamp;
138+ std::unique_ptr<InstrProfReader> reader;
139+ if (!profilePath.empty ()) {
140+ auto fs = vfs::getRealFileSystem ();
141+ auto readerOrErr = InstrProfReader::create (profilePath, *fs);
142+ lld::checkError (readerOrErr.takeError ());
143+
144+ reader = std::move (readerOrErr.get ());
145+ for (auto &entry : *reader) {
146+ // Read all entries
147+ (void )entry;
148+ }
149+ auto &traces = reader->getTemporalProfTraces ();
150+
151+ DenseMap<unsigned , BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
152+ for (size_t traceIdx = 0 ; traceIdx < traces.size (); traceIdx++) {
153+ uint64_t currentSize = 0 , cutoffSize = 1 ;
154+ size_t cutoffTimestamp = 1 ;
155+ auto &trace = traces[traceIdx].FunctionNameRefs ;
156+ for (size_t timestamp = 0 ; timestamp < trace.size (); timestamp++) {
157+ auto [Filename, ParsedFuncName] = getParsedIRPGOName (
158+ reader->getSymtab ().getFuncOrVarName (trace[timestamp]));
159+ ParsedFuncName = BPSectionBase::getRootSymbol (ParsedFuncName);
160+
161+ auto sectionIdxsIt = rootSymbolToSectionIdxs.find (ParsedFuncName);
162+ if (sectionIdxsIt == rootSymbolToSectionIdxs.end ())
163+ continue ;
164+ auto §ionIdxs = sectionIdxsIt->getValue ();
165+ // If the same symbol is found in multiple sections, they might be
166+ // identical, so we arbitrarily use the size from the first section.
167+ currentSize += sections[*sectionIdxs.begin ()]->getSize ();
168+
169+ // Since BalancedPartitioning is sensitive to the initial order, we need
170+ // to explicitly define it to be ordered by earliest timestamp.
171+ for (unsigned sectionIdx : sectionIdxs) {
172+ auto [it, wasInserted] =
173+ sectionIdxToTimestamp.try_emplace (sectionIdx, timestamp);
174+ if (!wasInserted)
175+ it->getSecond () = std::min<size_t >(it->getSecond (), timestamp);
176+ }
177+
178+ if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
179+ ++maxUN;
180+ cutoffSize = 2 * currentSize;
181+ cutoffTimestamp = 2 * cutoffTimestamp;
182+ }
183+ for (unsigned sectionIdx : sectionIdxs)
184+ sectionIdxToFirstUN.try_emplace (sectionIdx, maxUN);
185+ }
186+ for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
187+ for (auto un = firstUN; un <= maxUN; ++un)
188+ startupSectionIdxUNs[sectionIdx].push_back (un);
189+ ++maxUN;
190+ sectionIdxToFirstUN.clear ();
191+ }
192+ }
193+
194+ SmallVector<unsigned > sectionIdxsForFunctionCompression,
195+ sectionIdxsForDataCompression;
196+ for (unsigned sectionIdx = 0 ; sectionIdx < sections.size (); sectionIdx++) {
197+ if (startupSectionIdxUNs.count (sectionIdx))
198+ continue ;
199+ const auto *isec = sections[sectionIdx];
200+ if (isec->isCodeSection ()) {
201+ if (forFunctionCompression)
202+ sectionIdxsForFunctionCompression.push_back (sectionIdx);
203+ } else {
204+ if (forDataCompression)
205+ sectionIdxsForDataCompression.push_back (sectionIdx);
206+ }
207+ }
208+
209+ if (compressionSortStartupFunctions) {
210+ SmallVector<unsigned > startupIdxs;
211+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
212+ startupIdxs.push_back (sectionIdx);
213+ auto unsForStartupFunctionCompression =
214+ getUnsForCompression (sections, sectionToIdx, startupIdxs,
215+ /* duplicateSectionIdxs=*/ nullptr , maxUN);
216+ for (auto &[sectionIdx, compressionUns] :
217+ unsForStartupFunctionCompression) {
218+ auto &uns = startupSectionIdxUNs[sectionIdx];
219+ uns.append (compressionUns);
220+ llvm::sort (uns);
221+ uns.erase (std::unique (uns.begin (), uns.end ()), uns.end ());
222+ }
223+ }
224+
225+ // Map a section index (order directly) to a list of duplicate section indices
226+ // (not ordered directly).
227+ DenseMap<unsigned , SmallVector<unsigned >> duplicateSectionIdxs;
228+ auto unsForFunctionCompression = getUnsForCompression (
229+ sections, sectionToIdx, sectionIdxsForFunctionCompression,
230+ &duplicateSectionIdxs, maxUN);
231+ auto unsForDataCompression = getUnsForCompression (
232+ sections, sectionToIdx, sectionIdxsForDataCompression,
233+ &duplicateSectionIdxs, maxUN);
234+
235+ std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
236+ nodesForDataCompression;
237+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
238+ nodesForStartup.emplace_back (sectionIdx, uns);
239+ for (auto &[sectionIdx, uns] : unsForFunctionCompression)
240+ nodesForFunctionCompression.emplace_back (sectionIdx, uns);
241+ for (auto &[sectionIdx, uns] : unsForDataCompression)
242+ nodesForDataCompression.emplace_back (sectionIdx, uns);
243+
244+ // Use the first timestamp to define the initial order for startup nodes.
245+ llvm::sort (nodesForStartup, [§ionIdxToTimestamp](auto &L, auto &R) {
246+ return std::make_pair (sectionIdxToTimestamp[L.Id ], L.Id ) <
247+ std::make_pair (sectionIdxToTimestamp[R.Id ], R.Id );
248+ });
249+ // Sort compression nodes by their Id (which is the section index) because the
250+ // input linker order tends to be not bad.
251+ llvm::sort (nodesForFunctionCompression,
252+ [](auto &L, auto &R) { return L.Id < R.Id ; });
253+ llvm::sort (nodesForDataCompression,
254+ [](auto &L, auto &R) { return L.Id < R.Id ; });
255+
256+ {
257+ TimeTraceScope timeScope (" Balanced Partitioning" );
258+ BalancedPartitioningConfig config;
259+ BalancedPartitioning bp (config);
260+ bp.run (nodesForStartup);
261+ bp.run (nodesForFunctionCompression);
262+ bp.run (nodesForDataCompression);
263+ }
264+
265+ unsigned numStartupSections = 0 ;
266+ unsigned numCodeCompressionSections = 0 ;
267+ unsigned numDuplicateCodeSections = 0 ;
268+ unsigned numDataCompressionSections = 0 ;
269+ unsigned numDuplicateDataSections = 0 ;
270+ SetVector<const BPSectionBase *> orderedSections;
271+ // Order startup functions,
272+ for (auto &node : nodesForStartup) {
273+ const auto *isec = sections[node.Id ];
274+ if (orderedSections.insert (isec))
275+ ++numStartupSections;
276+ }
277+ // then functions for compression,
278+ for (auto &node : nodesForFunctionCompression) {
279+ const auto *isec = sections[node.Id ];
280+ if (orderedSections.insert (isec))
281+ ++numCodeCompressionSections;
282+
283+ auto It = duplicateSectionIdxs.find (node.Id );
284+ if (It == duplicateSectionIdxs.end ())
285+ continue ;
286+ for (auto dupSecIdx : It->getSecond ()) {
287+ const auto *dupIsec = sections[dupSecIdx];
288+ if (orderedSections.insert (dupIsec))
289+ ++numDuplicateCodeSections;
290+ }
291+ }
292+ // then data for compression.
293+ for (auto &node : nodesForDataCompression) {
294+ const auto *isec = sections[node.Id ];
295+ if (orderedSections.insert (isec))
296+ ++numDataCompressionSections;
297+ auto It = duplicateSectionIdxs.find (node.Id );
298+ if (It == duplicateSectionIdxs.end ())
299+ continue ;
300+ for (auto dupSecIdx : It->getSecond ()) {
301+ const auto *dupIsec = sections[dupSecIdx];
302+ if (orderedSections.insert (dupIsec))
303+ ++numDuplicateDataSections;
304+ }
305+ }
306+
307+ if (verbose) {
308+ unsigned numTotalOrderedSections =
309+ numStartupSections + numCodeCompressionSections +
310+ numDuplicateCodeSections + numDataCompressionSections +
311+ numDuplicateDataSections;
312+ dbgs ()
313+ << " Ordered " << numTotalOrderedSections
314+ << " sections using balanced partitioning:\n Functions for startup: "
315+ << numStartupSections
316+ << " \n Functions for compression: " << numCodeCompressionSections
317+ << " \n Duplicate functions: " << numDuplicateCodeSections
318+ << " \n Data for compression: " << numDataCompressionSections
319+ << " \n Duplicate data: " << numDuplicateDataSections << " \n " ;
320+
321+ if (!profilePath.empty ()) {
322+ // Evaluate this function order for startup
323+ StringMap<std::pair<uint64_t , uint64_t >> symbolToPageNumbers;
324+ const uint64_t pageSize = (1 << 14 );
325+ uint64_t currentAddress = 0 ;
326+ for (const auto *isec : orderedSections) {
327+ for (auto &sym : isec->getSymbols ()) {
328+ uint64_t startAddress = currentAddress + sym->getValue ().value_or (0 );
329+ uint64_t endAddress = startAddress + sym->getSize ().value_or (0 );
330+ uint64_t firstPage = startAddress / pageSize;
331+ // I think the kernel might pull in a few pages when one it touched,
332+ // so it might be more accurate to force lastPage to be aligned by
333+ // 4?
334+ uint64_t lastPage = endAddress / pageSize;
335+ StringRef rootSymbol = sym->getName ();
336+ rootSymbol = BPSectionBase::getRootSymbol (rootSymbol);
337+ symbolToPageNumbers.try_emplace (rootSymbol, firstPage, lastPage);
338+ if (auto resolvedLinkageName =
339+ isec->getResolvedLinkageName (rootSymbol))
340+ symbolToPageNumbers.try_emplace (resolvedLinkageName.value (),
341+ firstPage, lastPage);
342+ }
343+ currentAddress += isec->getSize ();
344+ }
345+
346+ // The area under the curve F where F(t) is the total number of page
347+ // faults at step t.
348+ unsigned area = 0 ;
349+ for (auto &trace : reader->getTemporalProfTraces ()) {
350+ SmallSet<uint64_t , 0 > touchedPages;
351+ for (unsigned step = 0 ; step < trace.FunctionNameRefs .size (); step++) {
352+ auto traceId = trace.FunctionNameRefs [step];
353+ auto [Filename, ParsedFuncName] =
354+ getParsedIRPGOName (reader->getSymtab ().getFuncOrVarName (traceId));
355+ ParsedFuncName = BPSectionBase::getRootSymbol (ParsedFuncName);
356+ auto it = symbolToPageNumbers.find (ParsedFuncName);
357+ if (it != symbolToPageNumbers.end ()) {
358+ auto &[firstPage, lastPage] = it->getValue ();
359+ for (uint64_t i = firstPage; i <= lastPage; i++)
360+ touchedPages.insert (i);
361+ }
362+ area += touchedPages.size ();
363+ }
364+ }
365+ dbgs () << " Total area under the page fault curve: " << (float )area
366+ << " \n " ;
367+ }
368+ }
369+
370+ DenseMap<const BPSectionBase *, size_t > sectionPriorities;
371+ for (const auto *isec : orderedSections)
372+ sectionPriorities[isec] = --highestAvailablePriority;
373+ return sectionPriorities;
374+ }