2 changes: 1 addition & 1 deletion compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -27,7 +27,7 @@ endif()
add_compiler_rt_runtime(clang_rt.ctx_profile
STATIC
ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
-  OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
+  OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer
CFLAGS ${EXTRA_FLAGS}
SOURCES ${CTX_PROFILE_SOURCES}
ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
1 change: 1 addition & 0 deletions compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -127,6 +127,7 @@ class ContextNode final {
/// MUTEXDECL takes one parameter, the name of a field that is a mutex.
#define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \
PTRDECL(FunctionData, Next) \
+  VOLATILE_PTRDECL(void, EntryAddress) \
VOLATILE_PTRDECL(ContextRoot, CtxRoot) \
VOLATILE_PTRDECL(ContextNode, FlatCtx) \
MUTEXDECL(Mutex)
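For readers unfamiliar with the X-macro pattern here: the consumer supplies the three `*DECL` macros and `CTXPROF_FUNCTION_DATA` stamps out one field per entry, so the added `VOLATILE_PTRDECL(void, EntryAddress)` becomes a `void *volatile EntryAddress` member of `FunctionData`. A minimal sketch of such an expansion, assuming illustrative underscore-prefixed macro names and an `int` stand-in for the mutex type (the real definition lives in CtxInstrProfiling.h):

```cpp
struct FunctionData; // forward declarations so the sketch is self-contained
struct ContextRoot;
struct ContextNode;

struct FunctionDataSketch {
#define _PTRDECL(T, N) T *N = nullptr;
#define _VOLATILE_PTRDECL(T, N) T *volatile N = nullptr;
#define _MUTEXDECL(N) int N = 0; // stand-in for the runtime's spin mutex
  CTXPROF_FUNCTION_DATA(_PTRDECL, _VOLATILE_PTRDECL, _MUTEXDECL)
#undef _PTRDECL
#undef _VOLATILE_PTRDECL
#undef _MUTEXDECL
};
// The macro list above expands to:
//   FunctionData *Next = nullptr;
//   void *volatile EntryAddress = nullptr; // the field this patch adds
//   ContextRoot *volatile CtxRoot = nullptr;
//   ContextNode *volatile FlatCtx = nullptr;
//   int Mutex = 0;
```

`EntryAddress` is what lets the runtime match a detected root's function start address back to that function's `FunctionData`, which the root autodetector below relies on.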
79 changes: 57 additions & 22 deletions compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
+#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
@@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr;
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};

+__sanitizer::atomic_uintptr_t RootDetector = {};
+RootAutoDetector *getRootDetector() {
+  return reinterpret_cast<RootAutoDetector *>(
+      __sanitizer::atomic_load_relaxed(&RootDetector));
+}

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
@@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
return Ret;
}

-ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
uint32_t NumCounters) {
if (ContextNode *Existing = Data.FlatCtx)
return Existing;
@@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
Data.FlatCtx = Ret;

+  Data.EntryAddress = Callee;
Data.Next = reinterpret_cast<FunctionData *>(
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
while (!__sanitizer::atomic_compare_exchange_strong(
@@ -296,8 +304,9 @@ ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
return TheScratchContext;
}

-ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
-                                 uint32_t NumCounters) {
+ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
+                                 uint32_t NumCounters, uint32_t NumCallsites,
+                                 ContextRoot *CtxRoot) {

// 1) if we are currently collecting a contextual profile, fetch a ContextNode
// in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
Expand All @@ -316,27 +325,32 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
// entered once and never exit. They should be assumed to be entered before
// profiling starts - because profiling should start after the server is up
// and running (which is equivalent to "message pumps are set up").
-  ContextRoot *R = __llvm_ctx_profile_current_context_root;
-  if (!R) {
+  if (!CtxRoot) {
+    if (auto *RAD = getRootDetector())
+      RAD->sample();
+    else if (auto *CR = Data.CtxRoot)
+      return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
return TheScratchContext;
else
return markAsScratch(
-        onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+        onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
}
-  auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
+  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
if (Ins)
-    Iter->second =
-        getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
+    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
+                                   NumCounters, 0);
return markAsScratch(onContextEnter(*Iter->second));
}

ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
GUID Guid, uint32_t NumCounters,
uint32_t NumCallsites) {
+  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
// fast "out" if we're not even doing contextual collection.
-  if (!__llvm_ctx_profile_current_context_root)
-    return getUnhandledContext(*Data, Guid, NumCounters);
+  if (!CtxRoot)
+    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+                               nullptr);

// also fast "out" if the caller is scratch. We can see if it's scratch by
// looking at the interior pointer into the subcontexts vector that the caller
@@ -345,7 +359,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
// precisely, aligned - 8 values)
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
if (!CallsiteContext || isScratch(CallsiteContext))
-    return getUnhandledContext(*Data, Guid, NumCounters);
+    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+                               CtxRoot);

// if the callee isn't the expected one, return scratch.
// Signal handler(s) could have been invoked at any point in the execution.
Expand All @@ -363,7 +378,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
// for that case.
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
if (ExpectedCallee != Callee)
-    return getUnhandledContext(*Data, Guid, NumCounters);
+    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+                               CtxRoot);

auto *Callsite = *CallsiteContext;
// in the case of indirect calls, we will have all seen targets forming a
@@ -388,21 +404,23 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
uint32_t Counters,
uint32_t Callsites) {

return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
Counters, Callsites);
}

void __llvm_ctx_profile_release_context(FunctionData *FData)
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
+  if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
+    return;
IsUnderContext = false;
-  if (__llvm_ctx_profile_current_context_root) {
-    __llvm_ctx_profile_current_context_root = nullptr;
-    assert(FData->CtxRoot);
-    FData->CtxRoot->Taken.Unlock();
-  }
+  assert(FData->CtxRoot);
+  __llvm_ctx_profile_current_context_root = nullptr;
+  FData->CtxRoot->Taken.Unlock();
}

-void __llvm_ctx_profile_start_collection() {
+void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
size_t NumMemUnits = 0;
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
@@ -418,12 +436,28 @@ void __llvm_ctx_profile_start_collection() {
resetContextNode(*Root->FirstUnhandledCalleeNode);
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
}
+  if (AutodetectDuration) {
+    // We leak RD intentionally. Knowing when to free it is tricky: there's a
+    // race condition with functions observing the `RootDetector` as non-null.
+    // This can be addressed, but the alternatives have some added complexity
+    // and it's not (yet) worth it.
+    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
+        RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
+    RD->start();
+  } else {
+    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
+  }
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
-  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
}

bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
__sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+  if (auto *RD = getRootDetector()) {
+    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
+                        "finished well before attempting to fetch a context");
+    RD->join();
+  }

__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);

@@ -448,8 +482,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
const auto *Pos = reinterpret_cast<const FunctionData *>(
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
for (; Pos; Pos = Pos->Next)
-    Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
-                     Pos->FlatCtx->counters_size());
+    if (!Pos->CtxRoot)
+      Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+                       Pos->FlatCtx->counters_size());
Writer.endFlatSection();
return true;
}
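The `markAsScratch` / `isScratch` helpers used throughout this file implement the LSB-tainting utility described near the top of the file: context addresses are assumed at least 2-aligned, so the low bit is free to mark a pointer as "scratch" (not a real context). A standalone sketch of the idea, under the same alignment invariant (the runtime's actual helpers operate on its own context types):

```cpp
#include <cassert>
#include <cstdint>

// Taint a pointer by setting its least significant bit. Valid only because
// the pointee is at least 2-aligned, so the bit is otherwise always zero.
template <typename T> T *taint(T *Ptr) {
  auto Bits = reinterpret_cast<uintptr_t>(Ptr);
  assert((Bits & 1) == 0 && "expected an even (>= 2-aligned) address");
  return reinterpret_cast<T *>(Bits | 1);
}

// A pointer is "scratch" iff its LSB is set.
template <typename T> bool isTainted(const T *Ptr) {
  return (reinterpret_cast<uintptr_t>(Ptr) & 1) != 0;
}

// Recover the usable pointer by clearing the taint bit.
template <typename T> T *untaint(T *Ptr) {
  auto Bits = reinterpret_cast<uintptr_t>(Ptr);
  return reinterpret_cast<T *>(Bits & ~static_cast<uintptr_t>(1));
}
```

Downstream checks such as the `isScratch(CallsiteContext)` test in `__llvm_ctx_profile_get_context` above rely on this same bit.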
2 changes: 1 addition & 1 deletion compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,

/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
-void __llvm_ctx_profile_start_collection();
+void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);

/// Completely free allocated memory.
void __llvm_ctx_profile_free();
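With the default argument, existing callers of `__llvm_ctx_profile_start_collection()` keep the old behavior, while a non-zero duration opts into root autodetection. A hypothetical collection harness, as a sketch: `kAutodetectSeconds` and `workload()` are illustrative, not part of this patch, and namespace qualifiers are elided.

```cpp
#include "CtxInstrProfiling.h"

void workload(); // hypothetical: the program's instrumented work

void collectWithAutodetection(ProfileWriter &Writer) {
  // Non-zero duration: the runtime spawns the RootAutoDetector thread, which
  // samples stacks for this many seconds and then marks the detected roots.
  constexpr unsigned kAutodetectSeconds = 30; // assumed, workload-specific
  __llvm_ctx_profile_start_collection(kAutodetectSeconds);

  workload(); // should run well past the detection window

  // Fetch stops profiling; per the .cpp change above, it warns and joins if
  // the autodetector is somehow still running at this point.
  __llvm_ctx_profile_fetch(Writer);
  __llvm_ctx_profile_free();
}
```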
94 changes: 94 additions & 0 deletions compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
@@ -8,6 +8,7 @@

#include "RootAutoDetector.h"

+#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
#include <assert.h>
@@ -17,6 +18,99 @@
using namespace __ctx_profile;
template <typename T> using Set = DenseMap<T, bool>;

+namespace __sanitizer {
+void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
+                                    bool request_fast, u32 max_depth) {
+  // We can't implement the fast variant. The fast variant ends up invoking an
+  // external allocator, because of pthread_attr_getstack. If this happens
+  // during an allocation of the program being instrumented, a non-reentrant
+  // lock may be taken (this was observed). The allocator called by
+  // pthread_attr_getstack will also try to take that lock.
+  UnwindSlow(pc, max_depth);
+}
+} // namespace __sanitizer
+
+RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
+  GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
+  Parent.AllSamples.PushBack(this);
+}
+
+void RootAutoDetector::start() {
+  atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
+  pthread_create(
+      &WorkerThread, nullptr,
+      +[](void *Ctx) -> void * {
+        RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
+        SleepForSeconds(RAD->WaitSeconds);
+        // To avoid holding the AllSamplesMutex, make a snapshot of all the
+        // thread samples collected so far
+        Vector<PerThreadSamples *> SamplesSnapshot;
+        {
+          GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
+          SamplesSnapshot.Resize(RAD->AllSamples.Size());
+          for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
+            SamplesSnapshot[I] = RAD->AllSamples[I];
+        }
+        DenseMap<uptr, uint64_t> AllRoots;
+        for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
+          GenericScopedLock<SpinMutex> L(&SamplesSnapshot[I]->M);
+          SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
+            auto [FAddr, Count] = KVP;
+            AllRoots[FAddr] += Count;
+            return true;
+          });
+        }
+        // FIXME: as a next step, establish a minimum relative nr of samples
+        // per root that would qualify it as a root.
+        for (auto *FD = reinterpret_cast<FunctionData *>(
+                 atomic_load_relaxed(&RAD->FunctionDataListHead));
+             FD; FD = FD->Next) {
+          if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
+            FD->getOrAllocateContextRoot();
+          }
+        }
+        atomic_store_relaxed(&RAD->Self, 0);
+        return nullptr;
+      },
+      this);
+}
+
+void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
+
+void RootAutoDetector::sample() {
+  // tracking reentry in case we want to re-explore fast stack unwind - which
+  // does potentially re-enter the runtime because it calls the instrumented
+  // allocator because of pthread_attr_getstack. See the notes also on
+  // UnwindImpl above.
+  static thread_local bool Entered = false;
+  static thread_local uint64_t Entries = 0;
+  if (Entered || (++Entries % SampleRate))
+    return;
+  Entered = true;
+  collectStack();
+  Entered = false;
+}
+
+void RootAutoDetector::collectStack() {
+  GET_CALLER_PC_BP;
+  BufferedStackTrace CurrentStack;
+  CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
+  // 2 stack frames would be very unlikely to mean anything, since at least the
+  // compiler-rt frame - which can't be inlined - should be observable, which
+  // counts as 1; we can be even more aggressive with this number.
+  if (CurrentStack.size <= 2)
+    return;
+  static thread_local PerThreadSamples *ThisThreadSamples =
+      new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
+          PerThreadSamples(*this);
+
+  if (!ThisThreadSamples->M.TryLock())
+    return;
+
+  ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
+  ThisThreadSamples->M.Unlock();
+}

uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
// this requires --linkopt=-Wl,--export-dynamic
Dl_info Info;
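The trailing context lines show `PerThreadCallsiteTrie::getFctStartAddr` starting with a `Dl_info` and a note that `-Wl,--export-dynamic` is required: mapping a sampled callsite address back to the entry address of the containing function (to match against `FunctionData::EntryAddress`) relies on `dladdr`, which only sees exported symbols. A self-contained sketch of that mapping, under the same export assumption (`fctStartAddrSketch` is an illustrative name, not the runtime's code):

```cpp
#include <dlfcn.h>
#include <stdint.h>

// Returns the entry address of the function containing CallsiteAddress, or 0
// if no exported symbol covers it.
uintptr_t fctStartAddrSketch(uintptr_t CallsiteAddress) {
  Dl_info Info;
  // dladdr() returns nonzero on success; dli_saddr is the start address of
  // the nearest symbol at or below the queried address.
  if (dladdr(reinterpret_cast<void *>(CallsiteAddress), &Info) != 0 &&
      Info.dli_saddr != nullptr)
    return reinterpret_cast<uintptr_t>(Info.dli_saddr);
  return 0;
}
```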
43 changes: 43 additions & 0 deletions compiler-rt/lib/ctx_profile/RootAutoDetector.h
@@ -12,6 +12,7 @@
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_vector.h"
#include <pthread.h>
#include <sanitizer/common_interface_defs.h>

@@ -53,5 +54,47 @@ class PerThreadCallsiteTrie {
/// thread, together with the number of samples that included them.
DenseMap<uptr, uint64_t> determineRoots() const;
};

+class RootAutoDetector final {
+  // A prime number. We may want to make this configurable at collection start.
+  static const uint64_t SampleRate = 6113;
+  const unsigned WaitSeconds;
+  pthread_t WorkerThread;
+
+  struct PerThreadSamples {
+    PerThreadSamples(RootAutoDetector &Parent);
+
+    PerThreadCallsiteTrie TrieRoot;
+    SpinMutex M;
+  };
+  SpinMutex AllSamplesMutex;
+  SANITIZER_GUARDED_BY(AllSamplesMutex)
+  Vector<PerThreadSamples *> AllSamples;
+  atomic_uintptr_t &FunctionDataListHead;
+  atomic_uintptr_t &Self;
+  void collectStack();
+
+public:
+  RootAutoDetector(atomic_uintptr_t &FunctionDataListHead,
+                   atomic_uintptr_t &Self, unsigned WaitSeconds)
+      : WaitSeconds(WaitSeconds), FunctionDataListHead(FunctionDataListHead),
+        Self(Self) {}
+
+  // Samples the stack at `SampleRate` (rate observed independently on each
+  // thread) in thread local `PerThreadCallsiteTrie`s.
+  void sample();
+
+  // Start a thread waiting `WaitSeconds`, after which it uses the
+  // `PerThreadCallsiteTrie` data observed so far over all threads to determine
+  // roots. Marks those roots by traversing the linked list of FunctionData that
+  // starts at `FunctionDataListHead`, and assigning their `CtxRoot`. Finally,
+  // resets the `Self` atomic, so that other threads don't continue calling
+  // `sample`.
+  void start();
+
+  // join the waiting thread.
+  void join();
+};

} // namespace __ctx_profile
#endif
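One pattern worth calling out in this header: `AllSamples` is a registry of per-thread sample storage, populated by the `PerThreadSamples` constructor (see the .cpp above) from whichever thread first calls `collectStack`, and later snapshotted by the worker thread. A standalone sketch of that lazy per-thread registration, using standard-library types in place of the sanitizer ones and assuming a single long-lived registry (as with the intentionally leaked detector):

```cpp
#include <mutex>
#include <vector>

struct Registry;

struct PerThreadData {
  explicit PerThreadData(Registry &Parent);
  // per-thread sample storage (e.g. a callsite trie) would live here
};

struct Registry {
  std::mutex AllMutex;
  std::vector<PerThreadData *> All; // guarded by AllMutex

  void record() {
    // First call on each thread constructs and registers that thread's
    // storage; the thread_local static makes later calls reuse it with no
    // locking on the fast path.
    static thread_local PerThreadData *Mine = new PerThreadData(*this);
    (void)Mine; // ... insert the current sample into Mine's storage ...
  }
};

PerThreadData::PerThreadData(Registry &Parent) {
  // Registration is the only writer of the shared vector, so a short critical
  // section suffices; the worker thread snapshots under the same lock.
  std::lock_guard<std::mutex> Lock(Parent.AllMutex);
  Parent.All.push_back(this);
}
```

The real code additionally gives each `PerThreadSamples` its own `SpinMutex`, so a sampling thread simply skips a sample (`TryLock` in `collectStack`) if the worker happens to be reading its trie.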