diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index ce491fc7e8bf0..bb606449c61b1 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -2,11 +2,13 @@ add_compiler_rt_component(ctx_profile)
 
 set(CTX_PROFILE_SOURCES
   CtxInstrProfiling.cpp
+  RootAutoDetector.cpp
   )
 
 set(CTX_PROFILE_HEADERS
   CtxInstrContextNode.h
   CtxInstrProfiling.h
+  RootAutoDetector.h
   )
 
 include_directories(..)
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
new file mode 100644
index 0000000000000..483c55c25eefe
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
@@ -0,0 +1,90 @@
+//===- RootAutoDetector.cpp - detect contextual profiling roots -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RootAutoDetector.h"
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
+#include <assert.h> // NOTE(review): header name lost in extraction; confirm
+#include <dlfcn.h>
+#include <pthread.h> // NOTE(review): header name lost in extraction; confirm
+
+using namespace __ctx_profile;
+template <typename T> using Set = DenseMap<T, bool>;
+
+uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
+  // this requires --linkopt=-Wl,--export-dynamic
+  Dl_info Info;
+  if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)
+    return reinterpret_cast<uptr>(Info.dli_saddr);
+  return 0;
+}
+
+void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
+  ++TheTrie.Count;
+  auto *Current = &TheTrie;
+  // the stack is backwards - the first callsite is at the top.
+  for (int I = ST.size - 1; I >= 0; --I) {
+    uptr ChildAddr = ST.trace[I];
+    auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
+    ++Iter->second.Count;
+    Current = &Iter->second;
+  }
+}
+
+DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
+  // Assuming a message pump design, roots are those functions called by the
+  // message pump. The message pump is an infinite loop (for all practical
+  // considerations) fetching data from a queue. The root functions return -
+  // otherwise the message pump doesn't work. This function detects roots as the
+  // first place in the trie (starting from the root) where a function calls 2
+  // or more functions.
+  //
+  // We start with a callsite trie - the nodes are callsites. Different child
+  // nodes may actually correspond to the same function.
+  //
+  // For example: using function(callsite)
+  // f1(csf1_1) -> f2(csf2_1) -> f3
+  //            -> f2(csf2_2) -> f4
+  //
+  // would be represented in our trie as:
+  // csf1_1 -> csf2_1 -> f3
+  //        -> csf2_2 -> f4
+  //
+  // While we can assert the control flow returns to f2, we don't know if it
+  // ever returns to f1. f2 could be the message pump.
+  //
+  // We need to convert our callsite tree into a function tree. We can also,
+  // more economically, just see how many distinct functions there are at a
+  // certain depth. When that count is greater than 1, we got to potential roots
+  // and everything above should be considered as non-roots.
+  DenseMap<uptr, uint64_t> Result;
+  Set<const Trie *> Worklist;
+  Worklist.insert({&TheTrie, {}});
+
+  while (!Worklist.empty()) {
+    Set<const Trie *> NextWorklist;
+    DenseMap<uptr, uint64_t> Candidates;
+    Worklist.forEach([&](const auto &KVP) {
+      auto [Node, _] = KVP;
+      auto SA = getFctStartAddr(Node->CallsiteAddress);
+      Candidates[SA] += Node->Count;
+      Node->Children.forEach([&](auto &ChildKVP) {
+        NextWorklist.insert({&ChildKVP.second, true});
+        return true;
+      });
+      return true;
+    });
+    if (Candidates.size() > 1) {
+      Result.swap(Candidates);
+      break;
+    }
+    Worklist.swap(NextWorklist);
+  }
+  return Result;
+}
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.h b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
new file mode 100644
index 0000000000000..85dd5ef1c32d9
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
@@ -0,0 +1,60 @@
+/*===- RootAutoDetector.h - auto-detect roots for ctxprof -----------------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef CTX_PROFILE_ROOTAUTODETECTOR_H_
+#define CTX_PROFILE_ROOTAUTODETECTOR_H_
+
+#include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include <pthread.h> // NOTE(review): header name lost in extraction; confirm
+#include <sanitizer/common_interface_defs.h> // NOTE(review): confirm name
+
+using namespace __asan;
+using namespace __sanitizer;
+
+namespace __ctx_profile {
+
+/// Capture all the stack traces observed for a specific thread. The "for a
+/// specific thread" part is not enforced, but assumed in determineRoots.
+class PerThreadCallsiteTrie {
+protected:
+  /// A trie. A node is the address of a callsite in a function activation. A
+  /// child is a callsite in the activation made from the callsite
+  /// corresponding to the parent.
+  struct Trie final {
+    const uptr CallsiteAddress;
+    uint64_t Count = 0;
+    DenseMap<uptr, Trie> Children;
+
+    Trie(uptr CallsiteAddress = 0) : CallsiteAddress(CallsiteAddress) {}
+  };
+  Trie TheTrie;
+
+  /// Return the runtime start address of the function that contains the call at
+  /// the runtime address CallsiteAddress. May be overridden for easy testing.
+  virtual uptr getFctStartAddr(uptr CallsiteAddress) const;
+
+public:
+  PerThreadCallsiteTrie(const PerThreadCallsiteTrie &) = delete;
+  PerThreadCallsiteTrie(PerThreadCallsiteTrie &&) = default;
+  PerThreadCallsiteTrie() = default;
+
+  virtual ~PerThreadCallsiteTrie() = default;
+
+  /// Record one stack. ST.trace is walked from its last entry to its first, so
+  /// the last entry becomes a child of the trie root; each visited node's
+  /// Count (the root's included) is incremented.
+  void insertStack(const StackTrace &ST);
+
+  /// Return the runtime address of root functions, as determined for this
+  /// thread, together with the number of samples that included them. The
+  /// result is empty if no trie depth has more than one distinct function.
+  DenseMap<uptr, uint64_t> determineRoots() const;
+};
+} // namespace __ctx_profile
+#endif
diff --git a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
index 012fd7aff7862..0954d5cd34487 100644
--- a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
@@ -22,10 +22,12 @@ append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PR
 file(GLOB CTX_PROFILE_HEADERS ../*.h)
 
 set(CTX_PROFILE_SOURCES
-  ../CtxInstrProfiling.cpp)
+  ../CtxInstrProfiling.cpp
+  ../RootAutoDetector.cpp)
 
 set(CTX_PROFILE_UNITTESTS
   CtxInstrProfilingTest.cpp
+  RootAutoDetectorTest.cpp
   driver.cpp)
 
 include_directories(../../../include)
diff --git a/compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp b/compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp
new file mode 100644
index 0000000000000..8fd5bf004faf7
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp
@@ -0,0 +1,155 @@
+#include "../RootAutoDetector.h"
+#include "sanitizer_common/sanitizer_array_ref.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace __ctx_profile;
+using ::testing::IsEmpty;
+using ::testing::Not;
+using ::testing::SizeIs;
+
+// Utility for describing a preorder traversal. By default it captures the
+// address and count at a callsite node. Implicitly nodes are expected to have 1
+// child. If they have none, we place a Marker::term and if they have more than
+// one, we place a Marker::split(nr_of_children). For example, using a list
+// notation, and letters to denote a pair of address and count:
+// (A (B C) (D (E F))) is a list of markers: A, split(2), B, term, C,
+// term, D, split(2), E, term, F, term
+class Marker {
+  enum class Kind { End, Value, Split };
+  const uptr Value;
+  const uptr Count;
+  const Kind K;
+  Marker(uptr V, uptr C, Kind S) : Value(V), Count(C), K(S) {}
+
+public:
+  Marker(uptr V, uptr C) : Marker(V, C, Kind::Value) {}
+
+  static Marker split(uptr V) { return Marker(V, 0, Kind::Split); }
+  static Marker term() { return Marker(0, 0, Kind::End); }
+
+  bool isSplit() const { return K == Kind::Split; }
+  bool isTerm() const { return K == Kind::End; }
+  bool isVal() const { return K == Kind::Value; }
+
+  bool operator==(const Marker &M) const {
+    return Value == M.Value && Count == M.Count && K == M.K;
+  }
+};
+
+class MockCallsiteTrie final : public PerThreadCallsiteTrie {
+  // Round the callsite address down to a multiple of 100 (0 if below 100).
+  uptr getFctStartAddr(uptr CallsiteAddress) const override {
+    return (CallsiteAddress / 100) * 100;
+  }
+
+  static void popAndCheck(ArrayRef<Marker> &Preorder, Marker M) {
+    ASSERT_THAT(Preorder, Not(IsEmpty()));
+    ASSERT_EQ(Preorder[0], M);
+    Preorder = Preorder.drop_front();
+  }
+
+  static void checkSameImpl(const Trie &T, ArrayRef<Marker> &Preorder) {
+    popAndCheck(Preorder, {T.CallsiteAddress, T.Count});
+
+    if (T.Children.empty()) {
+      popAndCheck(Preorder, Marker::term());
+      return;
+    }
+
+    if (T.Children.size() > 1)
+      popAndCheck(Preorder, Marker::split(T.Children.size()));
+
+    T.Children.forEach([&](const auto &KVP) {
+      checkSameImpl(KVP.second, Preorder);
+      return true;
+    });
+  }
+
+public:
+  void checkSame(ArrayRef<Marker> Preorder) const {
+    checkSameImpl(TheTrie, Preorder);
+    ASSERT_THAT(Preorder, IsEmpty());
+  }
+};
+
+TEST(PerThreadCallsiteTrieTest, Insert) {
+  MockCallsiteTrie R;
+  uptr Stack1[]{4, 3, 2, 1};
+  R.insertStack(StackTrace(Stack1, 4));
+  R.checkSame(ArrayRef<Marker>(
+      {{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}, Marker::term()}));
+
+  uptr Stack2[]{5, 4, 3, 2, 1};
+  R.insertStack(StackTrace(Stack2, 5));
+  R.checkSame(ArrayRef<Marker>(
+      {{0, 2}, {1, 2}, {2, 2}, {3, 2}, {4, 2}, {5, 1}, Marker::term()}));
+
+  uptr Stack3[]{6, 3, 2, 1};
+  R.insertStack(StackTrace(Stack3, 4));
+  R.checkSame(ArrayRef<Marker>({{0, 3},
+                                {1, 3},
+                                {2, 3},
+                                {3, 3},
+                                Marker::split(2),
+                                {4, 2},
+                                {5, 1},
+                                Marker::term(),
+                                {6, 1},
+                                Marker::term()}));
+  uptr Stack4[]{7, 2, 1};
+  R.insertStack(StackTrace(Stack4, 3));
+  R.checkSame(ArrayRef<Marker>({{0, 4},
+                                {1, 4},
+                                {2, 4},
+                                Marker::split(2),
+                                {7, 1},
+                                Marker::term(),
+                                {3, 3},
+                                Marker::split(2),
+                                {4, 2},
+                                {5, 1},
+                                Marker::term(),
+                                {6, 1},
+                                Marker::term()}));
+}
+
+TEST(PerThreadCallsiteTrieTest, DetectRoots) {
+  MockCallsiteTrie T;
+
+  uptr Stack1[]{501, 302, 202, 102};
+  uptr Stack2[]{601, 402, 203, 102};
+  T.insertStack({Stack1, 4});
+  T.insertStack({Stack2, 4});
+
+  auto R = T.determineRoots();
+  EXPECT_THAT(R, SizeIs(2U));
+  EXPECT_TRUE(R.contains(300));
+  EXPECT_TRUE(R.contains(400));
+}
+
+TEST(PerThreadCallsiteTrieTest, DetectRootsNoBranches) {
+  MockCallsiteTrie T;
+
+  uptr Stack1[]{501, 302, 202, 102};
+  T.insertStack({Stack1, 4});
+
+  auto R = T.determineRoots();
+  EXPECT_THAT(R, IsEmpty());
+}
+
+TEST(PerThreadCallsiteTrieTest, DetectRootsUnknownFct) {
+  MockCallsiteTrie T;
+
+  uptr Stack1[]{501, 302, 202, 102};
+  // The MockCallsiteTrie address resolver rounds down to a multiple of 100, so
+  // 40 will be mapped to 0.
+  uptr Stack2[]{601, 40, 203, 102};
+  T.insertStack({Stack1, 4});
+  T.insertStack({Stack2, 4});
+
+  auto R = T.determineRoots();
+  ASSERT_THAT(R, SizeIs(2U));
+  EXPECT_TRUE(R.contains(300));
+  EXPECT_TRUE(R.contains(0));
+}