From c22cc933abd0cef3d08fa72a21e2f67b5ea48ce1 Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Wed, 26 Nov 2025 14:32:51 -0800
Subject: [PATCH 1/3] [scudo] Refactor initialization of TSDs.

Instead of taking a lock and then checking/modifying the Initialized
variable, make it an atomic. Doing this, we can remove one of the
mutexes in shared TSDs, and avoid potential lock contention in both
shared and exclusive TSDs when multiple threads perform allocation
operations at the same time.

Add two new tests that verify no crashes occur when multiple threads
allocate at the same time.
---
 .../scudo/standalone/tests/combined_test.cpp  | 90 +++++++++++++++++++
 .../lib/scudo/standalone/tsd_exclusive.h      | 19 ++--
 compiler-rt/lib/scudo/standalone/tsd_shared.h | 50 ++++++-----
 3 files changed, 129 insertions(+), 30 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 1d4208b6a2aa0..50e7445c4b4e4 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -18,6 +18,7 @@
 #include "size_class_map.h"
 
 #include <algorithm>
+#include <atomic>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
@@ -1396,6 +1397,7 @@ TEST(ScudoCombinedTest, FullUsableSizeMTE) {
   VerifyExactUsableSize(*Allocator);
   VerifyIterateOverUsableSize(*Allocator);
 }
+
 // Verify that no special quarantine blocks appear in iterateOverChunks.
 TEST(ScudoCombinedTest, QuarantineIterateOverChunks) {
   using AllocatorT = TestAllocator;
@@ -1426,3 +1428,91 @@ TEST(ScudoCombinedTest, QuarantineIterateOverChunks) {
         << std::hex << Base << " Size " << std::dec << Size;
   }
 }
+
+struct InitSizeClassConfig {
+  static const scudo::uptr NumBits = 1;
+  static const scudo::uptr MinSizeLog = 10;
+  static const scudo::uptr MidSizeLog = 10;
+  static const scudo::uptr MaxSizeLog = 13;
+  static const scudo::u16 MaxNumCachedHint = 8;
+  static const scudo::uptr MaxBytesCachedLog = 12;
+  static const scudo::uptr SizeDelta = 0;
+};
+
+struct TestInitSizeConfig {
+  static const bool MaySupportMemoryTagging = false;
+  static const bool QuarantineDisabled = true;
+
+  struct Primary {
+    // In order to properly test the usable size, this Primary config has
+    // four real size classes: 1024, 2048, 4096, 8192.
+    using SizeClassMap = scudo::FixedSizeClassMap<InitSizeClassConfig>;
+    static const scudo::uptr RegionSizeLog = 21U;
+    static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
+    static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX;
+    typedef scudo::uptr CompactPtrT;
+    static const scudo::uptr CompactPtrScale = 0;
+    static const bool EnableRandomOffset = true;
+    static const scudo::uptr MapSizeIncrement = 1UL << 18;
+    static const scudo::uptr GroupSizeLog = 18;
+  };
+  template <typename Config>
+  using PrimaryT = scudo::SizeClassAllocator64<Config>;
+
+  struct Secondary {
+    template <typename Config>
+    using CacheT = scudo::MapAllocatorNoCache<Config>;
+  };
+
+  template <typename Config> using SecondaryT = scudo::MapAllocator<Config>;
+};
+
+struct TestInitSizeTSDSharedConfig : public TestInitSizeConfig {
+  template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 8U, 4U>;
+};
+
+struct TestInitSizeTSDExclusiveConfig : public TestInitSizeConfig {
+  template <class A> using TSDRegistryT = scudo::TSDRegistryExT<A>;
+};
+
+template <class AllocatorT>
+void RunStress() {
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  // This test is designed to have many threads try to initialize the TSD at
+  // the same time. Make sure this doesn't crash.
+  std::atomic_bool StartRunning = false;
+  std::vector<std::thread*> threads;
+  for (size_t I = 0; I < 16; I++) {
+    threads.emplace_back(new std::thread([&Allocator, &StartRunning] (){
+      while (!StartRunning.load());
+
+      void *Ptr = Allocator->allocate(10, Origin);
+      EXPECT_TRUE(Ptr != nullptr);
+      // Make sure this value is not optimized away.
+      asm volatile("" : : "r,m"(Ptr) : "memory");
+      Allocator->deallocate(Ptr, Origin);
+    }));
+  }
+
+  StartRunning = true;
+
+  for (auto* thread : threads) {
+    thread->join();
+    delete thread;
+  }
+}
+
+TEST(ScudoCombinedTest, StressThreadInitTSDShared) {
+  using AllocatorT = scudo::Allocator<TestInitSizeTSDSharedConfig>;
+  // Run the stress test a few times.
+  for (size_t I = 0; I < 10; I++)
+    RunStress<AllocatorT>();
+}
+
+TEST(ScudoCombinedTest, StressThreadInitTSDExclusive) {
+  using AllocatorT = scudo::Allocator<TestInitSizeTSDExclusiveConfig>;
+  // Run the stress test a few times.
+  for (size_t I = 0; I < 10; I++)
+    RunStress<AllocatorT>();
+}
diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
index a58ba6505089f..75921f2be3ffe 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
@@ -52,17 +52,20 @@ template <class Allocator> struct TSDRegistryExT {
     bool UnlockRequired;
   };
 
-  void init(Allocator *Instance) REQUIRES(Mutex) {
-    DCHECK(!Initialized);
+  void init(Allocator *Instance) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+    // If more than one thread is initializing at the exact same moment, the
+    // threads that lose don't need to do anything.
+    if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0))
+      return;
     Instance->init();
     CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread), 0);
     FallbackTSD.init(Instance);
-    Initialized = true;
+    atomic_store_relaxed(&Initialized, 1);
   }
 
-  void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) {
-    ScopedLock L(Mutex);
-    if (LIKELY(Initialized))
+  void initOnceMaybe(Allocator *Instance) {
+    if (LIKELY(atomic_load_relaxed(&Initialized) != 0))
       return;
     init(Instance); // Sets Initialized.
   }
@@ -81,7 +84,7 @@ template <class Allocator> struct TSDRegistryExT {
     FallbackTSD = {};
     State = {};
     ScopedLock L(Mutex);
-    Initialized = false;
+    atomic_store_relaxed(&Initialized, 0);
   }
 
   void drainCaches(Allocator *Instance) {
@@ -158,7 +161,7 @@ template <class Allocator> struct TSDRegistryExT {
   }
 
   pthread_key_t PThreadKey = {};
-  bool Initialized GUARDED_BY(Mutex) = false;
+  atomic_u8 Initialized = {};
   atomic_u8 Disabled = {};
   TSD<Allocator> FallbackTSD;
   HybridMutex Mutex;
diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h
index 404e984e1f5e9..425a028c955aa 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@@ -47,20 +47,24 @@ struct TSDRegistrySharedT {
     TSD<Allocator> *CurrentTSD;
   };
 
-  void init(Allocator *Instance) REQUIRES(Mutex) {
-    DCHECK(!Initialized);
+  void init(Allocator *Instance) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+    // If more than one thread is initializing at the exact same moment, the
+    // threads that lose don't need to do anything.
+    if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0))
+      return;
+
     Instance->init();
     for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].init(Instance);
     const u32 NumberOfCPUs = getNumberOfCPUs();
     setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount
                                        : Min(NumberOfCPUs, DefaultTSDCount));
-    Initialized = true;
+    atomic_store_relaxed(&Initialized, 1);
   }
 
-  void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) {
-    ScopedLock L(Mutex);
-    if (LIKELY(Initialized))
+  void initOnceMaybe(Allocator *Instance) {
+    if (LIKELY(atomic_load_relaxed(&Initialized) != 0))
       return;
     init(Instance); // Sets Initialized.
   }
@@ -72,11 +76,11 @@ struct TSDRegistrySharedT {
     }
     setCurrentTSD(nullptr);
     ScopedLock L(Mutex);
-    Initialized = false;
+    atomic_store_relaxed(&Initialized, 0);
  }
 
   void drainCaches(Allocator *Instance) {
-    ScopedLock L(MutexTSDs);
+    ScopedLock L(Mutex);
     for (uptr I = 0; I < NumberOfTSDs; ++I) {
       TSDs[I].lock();
       Instance->drainCache(&TSDs[I]);
@@ -93,7 +97,6 @@ struct TSDRegistrySharedT {
 
   void disable() NO_THREAD_SAFETY_ANALYSIS {
     Mutex.lock();
-    MutexTSDs.lock();
     for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].lock();
   }
@@ -101,13 +104,14 @@ struct TSDRegistrySharedT {
   void enable() NO_THREAD_SAFETY_ANALYSIS {
     for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--)
       TSDs[I].unlock();
-    MutexTSDs.unlock();
     Mutex.unlock();
   }
 
   bool setOption(Option O, sptr Value) {
-    if (O == Option::MaxTSDsCount)
+    if (O == Option::MaxTSDsCount) {
+      ScopedLock L(Mutex);
       return setNumberOfTSDs(static_cast<u32>(Value));
+    }
     if (O == Option::ThreadDisableMemInit)
       setDisableMemInit(Value);
     // Not supported by the TSD Registry, but not an error either.
@@ -116,8 +120,8 @@ struct TSDRegistrySharedT {
 
   bool getDisableMemInit() const { return *getTlsPtr() & 1; }
 
-  void getStats(ScopedString *Str) EXCLUDES(MutexTSDs) {
-    ScopedLock L(MutexTSDs);
+  void getStats(ScopedString *Str) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
     Str->append("Stats: SharedTSDs: %u available; total %u\n", NumberOfTSDs,
                 TSDsArraySize);
@@ -171,8 +175,7 @@ struct TSDRegistrySharedT {
     return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL);
   }
 
-  bool setNumberOfTSDs(u32 N) EXCLUDES(MutexTSDs) {
-    ScopedLock L(MutexTSDs);
+  bool setNumberOfTSDs(u32 N) REQUIRES(Mutex) {
     if (N < NumberOfTSDs)
       return false;
     if (N > TSDsArraySize)
@@ -213,14 +216,14 @@ struct TSDRegistrySharedT {
   // TSDs is an array of locks which is not supported for marking thread-safety
   // capability.
   NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD)
-      EXCLUDES(MutexTSDs) {
+      EXCLUDES(Mutex) {
     // Use the Precedence of the current TSD as our random seed. Since we are
     // in the slow path, it means that tryLock failed, and as a result it's
     // very likely that said Precedence is non-zero.
     const u32 R = static_cast<u32>(CurrentTSD->getPrecedence());
     u32 N, Inc;
     {
-      ScopedLock L(MutexTSDs);
+      ScopedLock L(Mutex);
       N = NumberOfTSDs;
       DCHECK_NE(NumberOfCoPrimes, 0U);
       Inc = CoPrimes[R % NumberOfCoPrimes];
@@ -257,12 +260,15 @@ struct TSDRegistrySharedT {
   }
 
   atomic_u32 CurrentIndex = {};
-  u32 NumberOfTSDs GUARDED_BY(MutexTSDs) = 0;
-  u32 NumberOfCoPrimes GUARDED_BY(MutexTSDs) = 0;
-  u32 CoPrimes[TSDsArraySize] GUARDED_BY(MutexTSDs) = {};
-  bool Initialized GUARDED_BY(Mutex) = false;
+  u32 NumberOfTSDs GUARDED_BY(Mutex) = 0;
+  u32 NumberOfCoPrimes GUARDED_BY(Mutex) = 0;
+  u32 CoPrimes[TSDsArraySize] GUARDED_BY(Mutex) = {};
+  atomic_u8 Initialized = {};
+  // Used for global initialization and for TSD access. The global
+  // initialization path acquires this lock only once in normal
+  // operation, so also using the lock for TSD access should not
+  // introduce any extra contention.
   HybridMutex Mutex;
-  HybridMutex MutexTSDs;
   TSD<Allocator> TSDs[TSDsArraySize];
 };
 

From e73ac183681359e35350e77f4f5fbbbe77933f3c Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Wed, 26 Nov 2025 14:41:10 -0800
Subject: [PATCH 2/3] clang-format.

---
 .../lib/scudo/standalone/tests/combined_test.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 50e7445c4b4e4..ef3df12754ed2 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -1475,17 +1475,17 @@ struct TestInitSizeTSDExclusiveConfig : public TestInitSizeConfig {
   template <class A> using TSDRegistryT = scudo::TSDRegistryExT<A>;
 };
 
-template <class AllocatorT>
-void RunStress() {
+template <class AllocatorT> void RunStress() {
   auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
 
   // This test is designed to have many threads try to initialize the TSD at
   // the same time. Make sure this doesn't crash.
   std::atomic_bool StartRunning = false;
-  std::vector<std::thread*> threads;
+  std::vector<std::thread *> threads;
   for (size_t I = 0; I < 16; I++) {
-    threads.emplace_back(new std::thread([&Allocator, &StartRunning] (){
-      while (!StartRunning.load());
+    threads.emplace_back(new std::thread([&Allocator, &StartRunning]() {
+      while (!StartRunning.load())
+        ;
 
       void *Ptr = Allocator->allocate(10, Origin);
       EXPECT_TRUE(Ptr != nullptr);
@@ -1497,7 +1497,7 @@ void RunStress() {
 
   StartRunning = true;
 
-  for (auto* thread : threads) {
+  for (auto *thread : threads) {
     thread->join();
     delete thread;
   }

From 5fadee294dadc735df00967ac56d7d596e0ef997 Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Mon, 8 Dec 2025 14:58:50 -0800
Subject: [PATCH 3/3] Remove copy-and-pasted comment.

---
 compiler-rt/lib/scudo/standalone/tests/combined_test.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index ef3df12754ed2..b70b9c9269fed 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -1444,8 +1444,6 @@ struct TestInitSizeConfig {
   static const bool QuarantineDisabled = true;
 
   struct Primary {
-    // In order to properly test the usable size, this Primary config has
-    // four real size classes: 1024, 2048, 4096, 8192.
     using SizeClassMap = scudo::FixedSizeClassMap<InitSizeClassConfig>;
     static const scudo::uptr RegionSizeLog = 21U;
     static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
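
Note on the pattern these patches adopt: both registries now use a
double-checked initialization scheme. initOnceMaybe() performs a lock-free
atomic check on the fast path, and init() takes Mutex and re-checks the flag
under the lock, so threads that lose the race simply return. Below is a
minimal standalone sketch of the same idea using std::atomic and std::mutex
in place of scudo's atomic_u8 and HybridMutex; the Registry and Mu names are
illustrative only, and unlike the patches (which use relaxed ordering
throughout), this generic version uses acquire/release so that the
initialized state is safely published to fast-path readers.

#include <atomic>
#include <mutex>

class Registry {
public:
  // Fast path: once initialized, this is a single atomic load and no lock.
  void initOnceMaybe() {
    if (Initialized.load(std::memory_order_acquire))
      return;
    init(); // Slow path, serialized by Mu.
  }

private:
  void init() {
    std::lock_guard<std::mutex> L(Mu);
    // Re-check under the lock: threads that lose the initialization race
    // see the flag already set and don't need to do anything.
    if (Initialized.load(std::memory_order_relaxed))
      return;
    // ... one-time setup goes here ...
    Initialized.store(true, std::memory_order_release);
  }

  std::atomic<bool> Initialized{false};
  std::mutex Mu;
};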