#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// Utility to taint a pointer by setting its LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers we return in certain cases; they are large
// enough to allow memory-safe counter access, but they don't link subcontexts
// below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
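
// Illustrative sketch (a hypothetical helper, not part of the original change):
// recognizing a scratch context is just testing the tainted bit set above,
// which is what the isScratch check used further below amounts to.
[[maybe_unused]] bool looksLikeScratch(const void *Ctx) {
  return reinterpret_cast<uint64_t>(Ctx) & 1;
}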

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// Arena pages are at least kBuffSize bytes. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
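
// Worked example (illustrative; the numbers follow directly from the constants
// above): a context needing 64 KiB gets the default 1 MiB arena, while one
// needing 3 MiB gets a 6 MiB arena, presumably leaving headroom for later
// allocations from the same arena.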

// Verify the structural integrity of the context.
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}
} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// Init the TLSes.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                  nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

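// Illustrative sketch (hypothetical, not code from this file): roughly the
// sequence instrumented callers are expected to emit around a callsite, so the
// callee's __llvm_ctx_profile_get_context (below) can find its slot. Names
// like CallerCtx and CallsiteIdx are made up for the example.
//
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&Callee);
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
//   Callee(...); // the callee's prologue then calls
//                // __llvm_ctx_profile_get_context(&Callee, Guid, ...)
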
// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
                                       uint32_t NrCounters,
                                       uint32_t NrCallsites,
                                       ContextNode *Next) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void ContextNode::reset() {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < NrCounters; ++I)
    counters()[I] = 0;
  for (uint32_t I = 0; I < NrCallsites; ++I)
    for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
      Next->reset();
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
                                 *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // Fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // Also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the caller
  // provided: if the context is scratch, so is that interior pointer (because
  // all the address calculations use even values - or, more precisely,
  // 8-aligned values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // If the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more for
  // that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, all the seen targets form a linked list
  // here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
                        Ret->callsites_size(), Ret->counters_size());
  Ret->onEntry();
  return Ret;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
                                       NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    Root->FirstNode->onEntry();
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return the scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}
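
// Illustrative sketch (hypothetical, not code from this file): the pairing the
// compiler is expected to emit at a context root's entry and exits; MyRoot,
// MyGuid and the counts are made-up names for the example.
//
//   ContextNode *Ctx = __llvm_ctx_profile_start_context(
//       &MyRoot, MyGuid, /*Counters=*/NumCounters, /*Callsites=*/NumCallsites);
//   ... instrumented body, bumping Ctx->counters() and priming callsites ...
//   __llvm_ctx_profile_release_context(&MyRoot);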

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    Root->FirstNode->reset();
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(
    void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}
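
// Illustrative only (a hypothetical consumer, not part of the original change):
// a minimal Writer callback for __llvm_ctx_profile_fetch that just counts the
// root contexts it is handed; a real consumer would serialize each tree.
[[maybe_unused]] static bool countRootsWriter(
    void *Data, const __ctx_profile::ContextNode &) {
  ++*reinterpret_cast<uint64_t *>(Data);
  return true;
}
// Usage sketch: uint64_t NumRoots = 0;
//               __llvm_ctx_profile_fetch(&NumRoots, countRootsWriter);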

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}