From 7bf6476f29acda05f819163c9b2f504d3651751e Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Mon, 15 Dec 2025 16:21:31 -0800 Subject: [PATCH 1/8] Refactor lock upgrade logic --- src/coreclr/vm/syncblk.cpp | 51 +++++++++++++++++++++++++++----------- src/coreclr/vm/syncblk.h | 2 ++ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/src/coreclr/vm/syncblk.cpp b/src/coreclr/vm/syncblk.cpp index 00021d92f87d65..a8c114c9f7ea7c 100644 --- a/src/coreclr/vm/syncblk.cpp +++ b/src/coreclr/vm/syncblk.cpp @@ -1735,7 +1735,30 @@ OBJECTHANDLE SyncBlock::GetOrCreateLock(OBJECTREF lockObj) // We'll likely need to put this lock object into the sync block. // Create the handle here. - OBJECTHANDLEHolder lockHandle = GetAppDomain()->CreateHandle(lockObj); + OBJECTHANDLE lockHandle = GetAppDomain()->CreateHandle(lockObj); + + if (TryUpgradeThinLockToFullLock(lockHandle)) + { + // Our lock instance is the one in the sync block now. + return lockHandle; + } + + // Another thread beat us to the upgrade. + // Delete our GC handle. + DestroyHandle(lockHandle); + + return VolatileLoad(&m_Lock); +} + +bool SyncBlock::TryUpgradeThinLockToFullLock(OBJECTHANDLE lockHandle) +{ + CONTRACTL + { + GC_TRIGGERS; + THROWS; + MODE_COOPERATIVE; + } + CONTRACTL_END; // Switch to preemptive so we can grab the spin-lock. // Use the NO_DTOR version so we don't do a coop->preemptive->coop transition on return. @@ -1747,10 +1770,9 @@ OBJECTHANDLE SyncBlock::GetOrCreateLock(OBJECTREF lockObj) GCX_PREEMP_NO_DTOR_END(); // Check again now that we hold the spin-lock - existingLock = m_Lock; - if (existingLock != (OBJECTHANDLE)NULL) + if (m_Lock != (OBJECTHANDLE)NULL) { - return existingLock; + return false; } // We need to create a new lock @@ -1761,32 +1783,33 @@ OBJECTHANDLE SyncBlock::GetOrCreateLock(OBJECTREF lockObj) if (thinLock != 0) { - // We have thin-lock info that needs to be transferred to the lock object. + OBJECTREF lockObj = ObjectFromHandle(lockHandle); + GCPROTECT_BEGIN(lockObj); + DWORD lockThreadId = thinLock & SBLK_MASK_LOCK_THREADID; DWORD recursionLevel = (thinLock & SBLK_MASK_LOCK_RECLEVEL) >> SBLK_RECLEVEL_SHIFT; _ASSERTE(lockThreadId != 0); PREPARE_NONVIRTUAL_CALLSITE(METHOD__LOCK__INITIALIZE_FOR_MONITOR); DECLARE_ARGHOLDER_ARRAY(args, 3); - args[ARGNUM_0] = OBJECTREF_TO_ARGHOLDER(ObjectFromHandle(lockHandle)); + args[ARGNUM_0] = OBJECTREF_TO_ARGHOLDER(lockObj); args[ARGNUM_1] = DWORD_TO_ARGHOLDER(lockThreadId); args[ARGNUM_2] = DWORD_TO_ARGHOLDER(recursionLevel); CALL_MANAGED_METHOD_NORET(args); - } - VolatileStore(&m_Lock, lockHandle.GetValue()); - - // Our lock instance is in the sync block now. - // Don't release it. - lockHandle.SuppressRelease(); + GCPROTECT_END(); + } - // Also, clear the thin lock info. + VolatileStore(&m_Lock, lockHandle); + // Clear the thin lock info. // It won't be used any more, but it will look out of date. // Only clear the relevant bits, as the spin-lock bit is used to lock this method. // That bit will be reset upon return. m_thinLock.StoreWithoutBarrier(m_thinLock.LoadWithoutBarrier() & ~((SBLK_MASK_LOCK_THREADID) | (SBLK_MASK_LOCK_RECLEVEL))); - return lockHandle; + // Our lock instance is in the sync block now. + // Don't release it. 
+ return true; } #endif // !DACCESS_COMPILE diff --git a/src/coreclr/vm/syncblk.h b/src/coreclr/vm/syncblk.h index ec7f8567ac6c9b..19cf57f7b6f9cd 100644 --- a/src/coreclr/vm/syncblk.h +++ b/src/coreclr/vm/syncblk.h @@ -592,6 +592,8 @@ class SyncBlock private: void InitializeThinLock(DWORD recursionLevel, DWORD threadId); + bool TryUpgradeThinLockToFullLock(OBJECTHANDLE lockHandle); + friend struct ::cdac_data; }; From 01bb0c026bff3ba23b6804d265781a662154bab1 Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Tue, 16 Dec 2025 15:20:46 -0800 Subject: [PATCH 2/8] Move all object header manipulation back into native code. --- .../src/System/Threading/Monitor.CoreCLR.cs | 16 +- .../System/Threading/ObjectHeader.CoreCLR.cs | 202 +----------------- src/coreclr/vm/comsynchronizable.cpp | 8 + src/coreclr/vm/comsynchronizable.h | 1 + src/coreclr/vm/ecalllist.h | 3 +- src/coreclr/vm/syncblk.h | 5 + src/coreclr/vm/syncblk.inl | 149 ++++++++++++- 7 files changed, 176 insertions(+), 208 deletions(-) diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs index ab5c33950ab010..4bc9af62c467ca 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs @@ -56,7 +56,8 @@ static Lock GetLockObjectFallback(object obj) #region Public Enter/Exit methods public static void Enter(object obj) { - ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); + ArgumentNullException.ThrowIfNull(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); if (result == ObjectHeader.HeaderLockResult.Success) return; @@ -65,7 +66,8 @@ public static void Enter(object obj) public static bool TryEnter(object obj) { - ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); + ArgumentNullException.ThrowIfNull(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); if (result == ObjectHeader.HeaderLockResult.Success) return true; @@ -78,8 +80,9 @@ public static bool TryEnter(object obj) public static bool TryEnter(object obj, int millisecondsTimeout) { ArgumentOutOfRangeException.ThrowIfLessThan(millisecondsTimeout, -1); + ArgumentNullException.ThrowIfNull(obj); - ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); if (result == ObjectHeader.HeaderLockResult.Success) return true; @@ -105,10 +108,9 @@ public static void Exit(object obj) GetLockObject(obj).Exit(); } - // Marked no-inlining to prevent recursive inlining of IsAcquired. 
- [MethodImpl(MethodImplOptions.NoInlining)] public static bool IsEntered(object obj) { + ArgumentNullException.ThrowIfNull(obj); ObjectHeader.HeaderLockResult result = ObjectHeader.IsAcquired(obj); if (result == ObjectHeader.HeaderLockResult.Success) return true; @@ -122,7 +124,8 @@ public static bool IsEntered(object obj) internal static void SynchronizedMethodEnter(object obj, ref bool lockTaken) { - ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); + ArgumentNullException.ThrowIfNull(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); if (result == ObjectHeader.HeaderLockResult.Success) { lockTaken = true; @@ -135,6 +138,7 @@ internal static void SynchronizedMethodEnter(object obj, ref bool lockTaken) internal static void SynchronizedMethodExit(object obj, ref bool lockTaken) { + Debug.Assert(obj is not null); // Inlined Monitor.Exit with a few tweaks if (!lockTaken) return; diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs index 0b978c2e000765..90bbb4c60fa657 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs @@ -18,36 +18,6 @@ namespace System.Threading /// internal static class ObjectHeader { - // The following three header bits are reserved for the GC engine: - // BIT_SBLK_UNUSED = 0x80000000 - // BIT_SBLK_FINALIZER_RUN = 0x40000000 - // BIT_SBLK_GC_RESERVE = 0x20000000 - // - // All other bits may be used to store runtime data: hash code, sync entry index, etc. - // Here we use the same bit layout as in CLR: if bit 26 (BIT_SBLK_IS_HASHCODE) is set, - // all the lower bits 0..25 store the hash code, otherwise they store either the sync - // entry index (indicated by BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) or thin lock data. - private const int IS_HASHCODE_BIT_NUMBER = 26; - private const int IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER = 27; - private const int BIT_SBLK_IS_HASHCODE = 1 << IS_HASHCODE_BIT_NUMBER; - internal const int MASK_HASHCODE_INDEX = BIT_SBLK_IS_HASHCODE - 1; - private const int BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX = 1 << IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER; - - // This lock is only taken when we need to modify the index value in m_SyncBlockValue. - // It should not be taken if the object already has a real syncblock index. - // In this managed side, we skip the fast path while this spinlock is in use. - // We'll sync up in the slow path. 
- private const int BIT_SBLK_SPIN_LOCK = 0x10000000; - - // if BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX is clear, the rest of the header dword is laid out as follows: - // - lower sixteen bits (bits 0 thru 15) is thread id used for the thin locks - // value is zero if no thread is holding the lock - // - following six bits (bits 16 thru 21) is recursion level used for the thin locks - // value is zero if lock is not taken or only taken once by the same thread - private const int SBLK_MASK_LOCK_THREADID = 0x0000FFFF; // special value of 0 + 65535 thread ids - private const int SBLK_MASK_LOCK_RECLEVEL = 0x003F0000; // 64 recursion levels - private const int SBLK_LOCK_RECLEVEL_INC = 0x00010000; // each level is this much higher than the previous one - // These must match the values in syncblk.h public enum HeaderLockResult { @@ -57,178 +27,12 @@ public enum HeaderLockResult }; [MethodImpl(MethodImplOptions.InternalCall)] - private static extern HeaderLockResult AcquireInternal(object obj); + public static extern HeaderLockResult Acquire(object obj); [MethodImpl(MethodImplOptions.InternalCall)] public static extern HeaderLockResult Release(object obj); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int* GetHeaderPtr(byte* ppObjectData) - { - // The header is the 4 bytes before a pointer-sized chunk before the object data pointer. - return (int*)(ppObjectData - sizeof(void*) - sizeof(int)); - } - - // - // A few words about spinning choices: - // - // Most locks are not contentious. In fact most locks provide exclusive access safety, but in reality are used by - // one thread. And then there are locks that do see multiple threads, but the accesses are short and not overlapping. - // Thin lock is an optimization for such scenarios. - // - // If we see a thin lock held by other thread for longer than ~5 microseconds, we will "inflate" the lock - // and let the adaptive spinning in the fat Lock sort it out whether we have a contentious lock or a long-held lock. - // - // Another thing to consider is that SpinWait(1) is calibrated to about 35-50 nanoseconds. - // It can take much longer only if nop/pause takes much longer, which it should not, as that would be getting - // close to the RAM latency. - // - // Considering that taking and releasing the lock takes 2 CAS instructions + some overhead, we can estimate shortest - // time the lock can be held to be in hundreds of nanoseconds. Thus it is unlikely to see more than - // 8-10 threads contending for the lock without inflating it. Therefore we can expect to acquire a thin lock in - // under 16 tries. - // - // As for the backoff strategy we have two choices: - // Exponential back-off with a lmit: - // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . - // - // Linear back-off - // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, . . . . - // - // In this case these strategies are close in terms of average and worst case latency, so we will prefer linear - // back-off as it favors micro-contention scenario, which we expect. - // - - // Try acquiring the thin-lock - public static unsafe HeaderLockResult TryAcquireThinLock(object obj) - { - ArgumentNullException.ThrowIfNull(obj); - - HeaderLockResult result = AcquireInternal(obj); - if (result == HeaderLockResult.Failure) - { - return TryAcquireThinLockSpin(obj); - } - return result; - } - - private static unsafe HeaderLockResult TryAcquireThinLockSpin(object obj) - { - int currentThreadID = ManagedThreadId.Current; - - // does thread ID fit? 
- if (currentThreadID > SBLK_MASK_LOCK_THREADID) - return HeaderLockResult.UseSlowPath; - - int retries = Lock.IsSingleProcessor ? 0 : 16; - - // retry when the lock is owned by somebody else. - // this loop will spinwait between iterations. - for (int i = 0; i <= retries; i++) - { - fixed (byte* pObjectData = &obj.GetRawData()) - { - int* pHeader = GetHeaderPtr(pObjectData); - - // rare retries when lock is not owned by somebody else. - // these do not count as iterations and do not spinwait. - while (true) - { - int oldBits = *pHeader; - - // If has a hash code, syncblock, or is in the process of upgrading, - // we cannot use a thin-lock. - if ((oldBits & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_SPIN_LOCK)) != 0) - { - // Need to use a thick-lock. - return HeaderLockResult.UseSlowPath; - } - // If we already own the lock, try incrementing recursion level. - else if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID) - { - // try incrementing recursion level, check for overflow - int newBits = oldBits + SBLK_LOCK_RECLEVEL_INC; - if ((newBits & SBLK_MASK_LOCK_RECLEVEL) != 0) - { - if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) - { - return HeaderLockResult.Success; - } - - // rare contention on owned lock, - // perhaps hashcode was installed or finalization bits were touched. - // we still own the lock though and may be able to increment, try again - continue; - } - else - { - // overflow, need to transition to a fat Lock - return HeaderLockResult.UseSlowPath; - } - } - // If no one owns the lock, try acquiring it. - else if ((oldBits & SBLK_MASK_LOCK_THREADID) == 0) - { - int newBits = oldBits | currentThreadID; - if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) - { - return HeaderLockResult.Success; - } - - // rare contention on lock. - // Try again in case the finalization bits were touched. - continue; - } - else - { - // Owned by somebody else. Now we spinwait and retry. - break; - } - } - } - - if (retries != 0) - { - // spin a bit before retrying (1 spinwait is roughly 35 nsec) - // the object is not pinned here - Thread.SpinWait(i); - } - } - - // owned by somebody else - return HeaderLockResult.Failure; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe HeaderLockResult IsAcquired(object obj) - { - ArgumentNullException.ThrowIfNull(obj); - - fixed (byte* pObjectData = &obj.GetRawData()) - { - int* pHeader = GetHeaderPtr(pObjectData); - - // Ignore the spinlock here. - // Either we'll read the thin-lock data in the header or we'll have a sync block. - // In either case, the two will be consistent. - int oldBits = *pHeader; - - // If has a hash code or syncblock, we cannot determine the lock state from the header - // use the slow path. - if ((oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) != 0) - { - return HeaderLockResult.UseSlowPath; - } - - // if we own the lock - if ((oldBits & SBLK_MASK_LOCK_THREADID) == ManagedThreadId.Current) - { - return HeaderLockResult.Success; - } - - // someone else owns or no one. 
- return HeaderLockResult.Failure; - } - } + [MethodImpl(MethodImplOptions.InternalCall)] + public static extern HeaderLockResult IsAcquired(object obj); } } diff --git a/src/coreclr/vm/comsynchronizable.cpp b/src/coreclr/vm/comsynchronizable.cpp index 64be7831043167..59fadb6a3d3106 100644 --- a/src/coreclr/vm/comsynchronizable.cpp +++ b/src/coreclr/vm/comsynchronizable.cpp @@ -909,3 +909,11 @@ FCIMPL1(ObjHeader::HeaderLockResult, ObjHeader_ReleaseThinLock, Object* obj) return obj->GetHeader()->ReleaseHeaderThinLock(GetThread()->GetThreadId()); } FCIMPLEND + +FCIMPL1(ObjHeader::HeaderLockResult, ObjHeader_IsThinLockOwnedByThread, Object* obj) +{ + FCALL_CONTRACT; + + return obj->GetHeader()->IsHeaderThinLockOwnedByThread(GetThread()->GetThreadId()); +} +FCIMPLEND diff --git a/src/coreclr/vm/comsynchronizable.h b/src/coreclr/vm/comsynchronizable.h index ca6f3fb5be1619..f5b04a0de2638a 100644 --- a/src/coreclr/vm/comsynchronizable.h +++ b/src/coreclr/vm/comsynchronizable.h @@ -78,5 +78,6 @@ FCDECL1(OBJECTHANDLE, Monitor_GetLockHandleIfExists, Object* obj); FCDECL1(ObjHeader::HeaderLockResult, ObjHeader_AcquireThinLock, Object* obj); FCDECL1(ObjHeader::HeaderLockResult, ObjHeader_ReleaseThinLock, Object* obj); +FCDECL1(ObjHeader::HeaderLockResult, ObjHeader_IsThinLockOwnedByThread, Object* obj); #endif // _COMSYNCHRONIZABLE_H diff --git a/src/coreclr/vm/ecalllist.h b/src/coreclr/vm/ecalllist.h index 61828016c80987..68891d00463e4d 100644 --- a/src/coreclr/vm/ecalllist.h +++ b/src/coreclr/vm/ecalllist.h @@ -256,8 +256,9 @@ FCFuncStart(gThreadFuncs) FCFuncEnd() FCFuncStart(gObjectHeaderFuncs) - FCFuncElement("AcquireInternal", ObjHeader_AcquireThinLock) + FCFuncElement("Acquire", ObjHeader_AcquireThinLock) FCFuncElement("Release", ObjHeader_ReleaseThinLock) + FCFuncElement("IsAcquired", ObjHeader_IsThinLockOwnedByThread) FCFuncEnd() FCFuncStart(gMonitorFuncs) diff --git a/src/coreclr/vm/syncblk.h b/src/coreclr/vm/syncblk.h index 19cf57f7b6f9cd..1a985e44f21962 100644 --- a/src/coreclr/vm/syncblk.h +++ b/src/coreclr/vm/syncblk.h @@ -958,7 +958,12 @@ class ObjHeader HeaderLockResult ReleaseHeaderThinLock(DWORD tid); + HeaderLockResult IsHeaderThinLockOwnedByThread(DWORD tid); + friend struct ::cdac_data; + + private: + HeaderLockResult AcquireHeaderThinLockWithSpin(DWORD tid); }; template<> diff --git a/src/coreclr/vm/syncblk.inl b/src/coreclr/vm/syncblk.inl index 7131f2f7754805..23609622f1db10 100644 --- a/src/coreclr/vm/syncblk.inl +++ b/src/coreclr/vm/syncblk.inl @@ -74,8 +74,125 @@ FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLock(DWORD t return HeaderLockResult::Success; } - // Use the slow path instead of spinning. The compare-exchange above would not fail often, and it's not worth forcing the - // spin loop that typically follows the call to this function to check the recursive case, so just bail to the slow path. + // We failed to acquire the lock in one shot. + // If we have only have one processor, don't waste time spinning. + if (g_SystemInfo.dwNumberOfProcessors == 1) + { + return HeaderLockResult::UseSlowPath; + } + + return AcquireHeaderThinLockWithSpin(tid); +} + +inline ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLockWithSpin(DWORD tid) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_COOPERATIVE; + PRECONDITION(tid <= SBLK_MASK_LOCK_THREADID); + } CONTRACTL_END; + + // + // A few words about spinning choices: + // + // Most locks are not contentious. In fact most locks provide exclusive access safety, but in reality are used by + // one thread. 
And then there are locks that do see multiple threads, but the accesses are short and not overlapping. + // Thin lock is an optimization for such scenarios. + // + // If we see a thin lock held by other thread for longer than ~5 microseconds, we will "inflate" the lock + // and let the adaptive spinning in the fat Lock sort it out whether we have a contentious lock or a long-held lock. + // + // Another thing to consider is that SpinWait(1) is calibrated to about 35-50 nanoseconds. + // It can take much longer only if nop/pause takes much longer, which it should not, as that would be getting + // close to the RAM latency. + // + // Considering that taking and releasing the lock takes 2 CAS instructions + some overhead, we can estimate shortest + // time the lock can be held to be in hundreds of nanoseconds. Thus it is unlikely to see more than + // 8-10 threads contending for the lock without inflating it. Therefore we can expect to acquire a thin lock in + // under 16 tries. + // + // As for the backoff strategy we have two choices: + // Exponential back-off with a lmit: + // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . + // + // Linear back-off + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, . . . . + // + // In this case these strategies are close in terms of average and worst case latency, so we will prefer linear + // back-off as it favors micro-contention scenario, which we expect. + // + + for (int i = 0; i < 16; ++i) + { + LONG oldValue = m_SyncBlockValue.LoadWithoutBarrier(); + + if ((oldValue & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + + BIT_SBLK_SPIN_LOCK + + SBLK_MASK_LOCK_THREADID + + SBLK_MASK_LOCK_RECLEVEL)) == 0) + { + + LONG newValue = oldValue | tid; +#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) + if (FastInterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) +#else + if (InterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) +#endif + { + return HeaderLockResult::Success; + } + + return HeaderLockResult::Failure; + } + + if (oldValue & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) + { + return HeaderLockResult::UseSlowPath; + } + + // The header is transitioning - use the slow path + if (oldValue & BIT_SBLK_SPIN_LOCK) + { + return HeaderLockResult::UseSlowPath; + } + + // Here we know we have the "thin lock" layout, but the lock is not free. + // It could still be the recursion case - compare the thread id to check + if (tid != (DWORD)(oldValue & SBLK_MASK_LOCK_THREADID)) + { + // Someone else owns the lock. + // Try again. + YieldProcessorNormalized(i); + continue; + } + + // Ok, the thread id matches, it's the recursion case. + // Bump up the recursion level and check for overflow + LONG newValue = oldValue + SBLK_LOCK_RECLEVEL_INC; + + if ((newValue & SBLK_MASK_LOCK_RECLEVEL) == 0) + { + return HeaderLockResult::UseSlowPath; + } + +#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) + if (FastInterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) +#else + if (InterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) +#endif + { + return HeaderLockResult::Success; + } + + // Something touched one of the other bits in the header (like the finalizer bits). + // Try again. + YieldProcessorNormalized(i); + } + + // We failed to acquire the lock after spinning. + // Use the slow path to wait. 
return HeaderLockResult::UseSlowPath; } @@ -140,4 +257,32 @@ FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::ReleaseHeaderThinLock(DWORD t return HeaderLockResult::UseSlowPath; } +FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::IsHeaderThinLockOwnedByThread(DWORD tid) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + MODE_COOPERATIVE; + } CONTRACTL_END; + + DWORD syncBlockValue = m_SyncBlockValue.LoadWithoutBarrier(); + + // We ignore the header spinlock here. + // Either we'll read the thin-lock data in the header or we'll have a sync block. + // In either case, the two will be consistent. + if ((syncBlockValue & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0) + { + if ((syncBlockValue & SBLK_MASK_LOCK_THREADID) != tid) + { + // This thread does not own the lock. + return HeaderLockResult::Failure; + } + + return HeaderLockResult::Success; + } + + // If has a hash code or syncblock, we cannot determine the lock state from the header. + return HeaderLockResult::UseSlowPath; +} + #endif // _SYNCBLK_INL_ From 183af372fefc05ecb167da887ca64280be1d01f8 Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Wed, 17 Dec 2025 13:17:28 -0800 Subject: [PATCH 3/8] Copilot PR feedback --- src/coreclr/vm/syncblk.inl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/coreclr/vm/syncblk.inl b/src/coreclr/vm/syncblk.inl index 23609622f1db10..1741476441f9cb 100644 --- a/src/coreclr/vm/syncblk.inl +++ b/src/coreclr/vm/syncblk.inl @@ -75,7 +75,7 @@ FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLock(DWORD t } // We failed to acquire the lock in one shot. - // If we have only have one processor, don't waste time spinning. + // If we only have one processor, don't waste time spinning. if (g_SystemInfo.dwNumberOfProcessors == 1) { return HeaderLockResult::UseSlowPath; @@ -114,7 +114,7 @@ inline ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLockWithSpin(DWOR // under 16 tries. // // As for the backoff strategy we have two choices: - // Exponential back-off with a lmit: + // Exponential back-off with a limit: // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . // // Linear back-off @@ -133,7 +133,6 @@ inline ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLockWithSpin(DWOR SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL)) == 0) { - LONG newValue = oldValue | tid; #if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) if (FastInterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) @@ -144,7 +143,10 @@ inline ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLockWithSpin(DWOR return HeaderLockResult::Success; } - return HeaderLockResult::Failure; + // Someone else just beat us to the lock. + // Try again. + YieldProcessorNormalized(i); + continue; } if (oldValue & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) From 4bc6c3a7ba409747c56006bea3250582f0c7c57b Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Wed, 17 Dec 2025 14:56:37 -0800 Subject: [PATCH 4/8] Remove unnecessary GC frame --- src/coreclr/vm/syncblk.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/coreclr/vm/syncblk.cpp b/src/coreclr/vm/syncblk.cpp index a8c114c9f7ea7c..1e24c6a1b54f0c 100644 --- a/src/coreclr/vm/syncblk.cpp +++ b/src/coreclr/vm/syncblk.cpp @@ -1783,21 +1783,19 @@ bool SyncBlock::TryUpgradeThinLockToFullLock(OBJECTHANDLE lockHandle) if (thinLock != 0) { - // We have thin-lock info that needs to be transferred to the lock object. 
- OBJECTREF lockObj = ObjectFromHandle(lockHandle); - GCPROTECT_BEGIN(lockObj); - DWORD lockThreadId = thinLock & SBLK_MASK_LOCK_THREADID; DWORD recursionLevel = (thinLock & SBLK_MASK_LOCK_RECLEVEL) >> SBLK_RECLEVEL_SHIFT; _ASSERTE(lockThreadId != 0); PREPARE_NONVIRTUAL_CALLSITE(METHOD__LOCK__INITIALIZE_FOR_MONITOR); + + // We have thin-lock info that needs to be transferred to the lock object. + OBJECTREF lockObj = ObjectFromHandle(lockHandle); + DECLARE_ARGHOLDER_ARRAY(args, 3); args[ARGNUM_0] = OBJECTREF_TO_ARGHOLDER(lockObj); args[ARGNUM_1] = DWORD_TO_ARGHOLDER(lockThreadId); args[ARGNUM_2] = DWORD_TO_ARGHOLDER(recursionLevel); CALL_MANAGED_METHOD_NORET(args); - - GCPROTECT_END(); } VolatileStore(&m_Lock, lockHandle); From c4ab2dd6d07bad279abccfc64cea5aad3eb04569 Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Tue, 23 Dec 2025 11:22:10 -0800 Subject: [PATCH 5/8] Go back to a managed implementation of the spinning and IsAcquired checks with a pointer-based Interlocked.CompareExchange --- .../System/Threading/Interlocked.CoreCLR.cs | 21 ++ .../src/System/Threading/Monitor.CoreCLR.cs | 16 +- .../System/Threading/ObjectHeader.CoreCLR.cs | 202 +++++++++++++++++- src/coreclr/vm/comsynchronizable.cpp | 8 - src/coreclr/vm/comsynchronizable.h | 1 - src/coreclr/vm/ecalllist.h | 4 +- src/coreclr/vm/syncblk.h | 5 - src/coreclr/vm/syncblk.inl | 152 +------------ 8 files changed, 230 insertions(+), 179 deletions(-) diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs index 4a62578672c75c..3da108deb837a3 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs @@ -127,6 +127,27 @@ public static int CompareExchange(ref int location1, int value, int comparand) [MethodImpl(MethodImplOptions.InternalCall)] private static extern int CompareExchange32(ref int location1, int value, int comparand); + // This is used internally in cases where having a managed + // ref to the location is unsafe (Ex: it is the syncblock of a pinned object). + // The intrinsic expansion for this overload is exactly the same as for the `ref int` + // variant and will go on the same path since expansion is triggered by the name and + // return type of the method. + // The important part is avoiding `ref *location` in the unexpanded scenario, like + // in a case when compiling the Debug flavor of the app. + [Intrinsic] + internal static unsafe int CompareExchange(int* location1, int value, int comparand) + { +#if TARGET_X86 || TARGET_AMD64 || TARGET_ARM64 || TARGET_RISCV64 + return CompareExchange(location1, value, comparand); // Must expand intrinsic +#else + Debug.Assert(location1 != null); + return RuntimeImports.InterlockedCompareExchange(location1, value, comparand); +#endif + } + + [MethodImpl(MethodImplOptions.InternalCall)] + private static unsafe extern int CompareExchange32Pointer(int* location1, int value, int comparand); + /// Compares two 64-bit signed integers for equality and, if they are equal, replaces the first value. /// The destination, whose value is compared with and possibly replaced. /// The value that replaces the destination value if the comparison results in equality. 
diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs index 4bc9af62c467ca..ab5c33950ab010 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Monitor.CoreCLR.cs @@ -56,8 +56,7 @@ static Lock GetLockObjectFallback(object obj) #region Public Enter/Exit methods public static void Enter(object obj) { - ArgumentNullException.ThrowIfNull(obj); - ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); if (result == ObjectHeader.HeaderLockResult.Success) return; @@ -66,8 +65,7 @@ public static void Enter(object obj) public static bool TryEnter(object obj) { - ArgumentNullException.ThrowIfNull(obj); - ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); if (result == ObjectHeader.HeaderLockResult.Success) return true; @@ -80,9 +78,8 @@ public static bool TryEnter(object obj) public static bool TryEnter(object obj, int millisecondsTimeout) { ArgumentOutOfRangeException.ThrowIfLessThan(millisecondsTimeout, -1); - ArgumentNullException.ThrowIfNull(obj); - ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); if (result == ObjectHeader.HeaderLockResult.Success) return true; @@ -108,9 +105,10 @@ public static void Exit(object obj) GetLockObject(obj).Exit(); } + // Marked no-inlining to prevent recursive inlining of IsAcquired. + [MethodImpl(MethodImplOptions.NoInlining)] public static bool IsEntered(object obj) { - ArgumentNullException.ThrowIfNull(obj); ObjectHeader.HeaderLockResult result = ObjectHeader.IsAcquired(obj); if (result == ObjectHeader.HeaderLockResult.Success) return true; @@ -124,8 +122,7 @@ public static bool IsEntered(object obj) internal static void SynchronizedMethodEnter(object obj, ref bool lockTaken) { - ArgumentNullException.ThrowIfNull(obj); - ObjectHeader.HeaderLockResult result = ObjectHeader.Acquire(obj); + ObjectHeader.HeaderLockResult result = ObjectHeader.TryAcquireThinLock(obj); if (result == ObjectHeader.HeaderLockResult.Success) { lockTaken = true; @@ -138,7 +135,6 @@ internal static void SynchronizedMethodEnter(object obj, ref bool lockTaken) internal static void SynchronizedMethodExit(object obj, ref bool lockTaken) { - Debug.Assert(obj is not null); // Inlined Monitor.Exit with a few tweaks if (!lockTaken) return; diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs index 90bbb4c60fa657..7a91dffbc677ae 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/ObjectHeader.CoreCLR.cs @@ -18,6 +18,36 @@ namespace System.Threading /// internal static class ObjectHeader { + // The following three header bits are reserved for the GC engine: + // BIT_SBLK_UNUSED = 0x80000000 + // BIT_SBLK_FINALIZER_RUN = 0x40000000 + // BIT_SBLK_GC_RESERVE = 0x20000000 + // + // All other bits may be used to store runtime data: hash code, sync entry index, etc. 
+ // Here we use the same bit layout as in CLR: if bit 26 (BIT_SBLK_IS_HASHCODE) is set, + // all the lower bits 0..25 store the hash code, otherwise they store either the sync + // entry index (indicated by BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) or thin lock data. + private const int IS_HASHCODE_BIT_NUMBER = 26; + private const int IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER = 27; + private const int BIT_SBLK_IS_HASHCODE = 1 << IS_HASHCODE_BIT_NUMBER; + internal const int MASK_HASHCODE_INDEX = BIT_SBLK_IS_HASHCODE - 1; + private const int BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX = 1 << IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER; + + // This lock is only taken when we need to modify the index value in m_SyncBlockValue. + // It should not be taken if the object already has a real syncblock index. + // In this managed side, we skip the fast path while this spinlock is in use. + // We'll sync up in the slow path. + private const int BIT_SBLK_SPIN_LOCK = 0x10000000; + + // if BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX is clear, the rest of the header dword is laid out as follows: + // - lower sixteen bits (bits 0 thru 15) is thread id used for the thin locks + // value is zero if no thread is holding the lock + // - following six bits (bits 16 thru 21) is recursion level used for the thin locks + // value is zero if lock is not taken or only taken once by the same thread + private const int SBLK_MASK_LOCK_THREADID = 0x0000FFFF; // special value of 0 + 65535 thread ids + private const int SBLK_MASK_LOCK_RECLEVEL = 0x003F0000; // 64 recursion levels + private const int SBLK_LOCK_RECLEVEL_INC = 0x00010000; // each level is this much higher than the previous one + // These must match the values in syncblk.h public enum HeaderLockResult { @@ -27,12 +57,178 @@ public enum HeaderLockResult }; [MethodImpl(MethodImplOptions.InternalCall)] - public static extern HeaderLockResult Acquire(object obj); + private static extern HeaderLockResult AcquireInternal(object obj); [MethodImpl(MethodImplOptions.InternalCall)] public static extern HeaderLockResult Release(object obj); - [MethodImpl(MethodImplOptions.InternalCall)] - public static extern HeaderLockResult IsAcquired(object obj); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe int* GetHeaderPtr(byte* ppObjectData) + { + // The header is the 4 bytes before a pointer-sized chunk before the object data pointer. + return (int*)(ppObjectData - sizeof(void*) - sizeof(int)); + } + + // + // A few words about spinning choices: + // + // Most locks are not contentious. In fact most locks provide exclusive access safety, but in reality are used by + // one thread. And then there are locks that do see multiple threads, but the accesses are short and not overlapping. + // Thin lock is an optimization for such scenarios. + // + // If we see a thin lock held by other thread for longer than ~5 microseconds, we will "inflate" the lock + // and let the adaptive spinning in the fat Lock sort it out whether we have a contentious lock or a long-held lock. + // + // Another thing to consider is that SpinWait(1) is calibrated to about 35-50 nanoseconds. + // It can take much longer only if nop/pause takes much longer, which it should not, as that would be getting + // close to the RAM latency. + // + // Considering that taking and releasing the lock takes 2 CAS instructions + some overhead, we can estimate shortest + // time the lock can be held to be in hundreds of nanoseconds. Thus it is unlikely to see more than + // 8-10 threads contending for the lock without inflating it. 
Therefore we can expect to acquire a thin lock in + // under 16 tries. + // + // As for the backoff strategy we have two choices: + // Exponential back-off with a lmit: + // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . + // + // Linear back-off + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, . . . . + // + // In this case these strategies are close in terms of average and worst case latency, so we will prefer linear + // back-off as it favors micro-contention scenario, which we expect. + // + + // Try acquiring the thin-lock + public static unsafe HeaderLockResult TryAcquireThinLock(object obj) + { + ArgumentNullException.ThrowIfNull(obj); + + HeaderLockResult result = AcquireInternal(obj); + if (result == HeaderLockResult.Failure) + { + return TryAcquireThinLockSpin(obj); + } + return result; + } + + private static unsafe HeaderLockResult TryAcquireThinLockSpin(object obj) + { + int currentThreadID = ManagedThreadId.Current; + + // does thread ID fit? + if (currentThreadID > SBLK_MASK_LOCK_THREADID) + return HeaderLockResult.UseSlowPath; + + int retries = Lock.IsSingleProcessor ? 0 : 16; + + // retry when the lock is owned by somebody else. + // this loop will spinwait between iterations. + for (int i = 0; i <= retries; i++) + { + fixed (byte* pObjectData = &obj.GetRawData()) + { + int* pHeader = GetHeaderPtr(pObjectData); + + // rare retries when lock is not owned by somebody else. + // these do not count as iterations and do not spinwait. + while (true) + { + int oldBits = *pHeader; + + // If has a hash code, syncblock, or is in the process of upgrading, + // we cannot use a thin-lock. + if ((oldBits & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_SPIN_LOCK)) != 0) + { + // Need to use a thick-lock. + return HeaderLockResult.UseSlowPath; + } + // If we already own the lock, try incrementing recursion level. + else if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID) + { + // try incrementing recursion level, check for overflow + int newBits = oldBits + SBLK_LOCK_RECLEVEL_INC; + if ((newBits & SBLK_MASK_LOCK_RECLEVEL) != 0) + { + if (Interlocked.CompareExchange(pHeader, newBits, oldBits) == oldBits) + { + return HeaderLockResult.Success; + } + + // rare contention on owned lock, + // perhaps hashcode was installed or finalization bits were touched. + // we still own the lock though and may be able to increment, try again + continue; + } + else + { + // overflow, need to transition to a fat Lock + return HeaderLockResult.UseSlowPath; + } + } + // If no one owns the lock, try acquiring it. + else if ((oldBits & SBLK_MASK_LOCK_THREADID) == 0) + { + int newBits = oldBits | currentThreadID; + if (Interlocked.CompareExchange(pHeader, newBits, oldBits) == oldBits) + { + return HeaderLockResult.Success; + } + + // rare contention on lock. + // Try again in case the finalization bits were touched. + continue; + } + else + { + // Owned by somebody else. Now we spinwait and retry. + break; + } + } + } + + if (retries != 0) + { + // spin a bit before retrying (1 spinwait is roughly 35 nsec) + // the object is not pinned here + Thread.SpinWait(i); + } + } + + // owned by somebody else + return HeaderLockResult.Failure; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe HeaderLockResult IsAcquired(object obj) + { + ArgumentNullException.ThrowIfNull(obj); + + fixed (byte* pObjectData = &obj.GetRawData()) + { + int* pHeader = GetHeaderPtr(pObjectData); + + // Ignore the spinlock here. + // Either we'll read the thin-lock data in the header or we'll have a sync block. 
+ // In either case, the two will be consistent. + int oldBits = *pHeader; + + // If has a hash code or syncblock, we cannot determine the lock state from the header + // use the slow path. + if ((oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) != 0) + { + return HeaderLockResult.UseSlowPath; + } + + // if we own the lock + if ((oldBits & SBLK_MASK_LOCK_THREADID) == ManagedThreadId.Current) + { + return HeaderLockResult.Success; + } + + // someone else owns or no one. + return HeaderLockResult.Failure; + } + } } } diff --git a/src/coreclr/vm/comsynchronizable.cpp b/src/coreclr/vm/comsynchronizable.cpp index 29a026be9d1334..643373a5207b2b 100644 --- a/src/coreclr/vm/comsynchronizable.cpp +++ b/src/coreclr/vm/comsynchronizable.cpp @@ -862,14 +862,6 @@ FCIMPL1(ObjHeader::HeaderLockResult, ObjHeader_ReleaseThinLock, Object* obj) } FCIMPLEND -FCIMPL1(ObjHeader::HeaderLockResult, ObjHeader_IsThinLockOwnedByThread, Object* obj) -{ - FCALL_CONTRACT; - - return obj->GetHeader()->IsHeaderThinLockOwnedByThread(GetThread()->GetThreadId()); -} -FCIMPLEND - extern "C" INT32 QCALLTYPE ThreadNative_ReentrantWaitAny(BOOL alertable, INT32 timeout, INT32 count, HANDLE *handles) { QCALL_CONTRACT; diff --git a/src/coreclr/vm/comsynchronizable.h b/src/coreclr/vm/comsynchronizable.h index 42529ffe866182..16b1fe6d898b7a 100644 --- a/src/coreclr/vm/comsynchronizable.h +++ b/src/coreclr/vm/comsynchronizable.h @@ -86,6 +86,5 @@ FCDECL1(OBJECTHANDLE, Monitor_GetLockHandleIfExists, Object* obj); FCDECL1(ObjHeader::HeaderLockResult, ObjHeader_AcquireThinLock, Object* obj); FCDECL1(ObjHeader::HeaderLockResult, ObjHeader_ReleaseThinLock, Object* obj); -FCDECL1(ObjHeader::HeaderLockResult, ObjHeader_IsThinLockOwnedByThread, Object* obj); #endif // _COMSYNCHRONIZABLE_H diff --git a/src/coreclr/vm/ecalllist.h b/src/coreclr/vm/ecalllist.h index 68891d00463e4d..e03240ef6d0e61 100644 --- a/src/coreclr/vm/ecalllist.h +++ b/src/coreclr/vm/ecalllist.h @@ -256,9 +256,8 @@ FCFuncStart(gThreadFuncs) FCFuncEnd() FCFuncStart(gObjectHeaderFuncs) - FCFuncElement("Acquire", ObjHeader_AcquireThinLock) + FCFuncElement("AcquireInternal", ObjHeader_AcquireThinLock) FCFuncElement("Release", ObjHeader_ReleaseThinLock) - FCFuncElement("IsAcquired", ObjHeader_IsThinLockOwnedByThread) FCFuncEnd() FCFuncStart(gMonitorFuncs) @@ -324,6 +323,7 @@ FCFuncStart(gInterlockedFuncs) FCFuncElement("Exchange64", COMInterlocked::Exchange64) FCFuncElement("ExchangeObject", COMInterlocked::ExchangeObject) FCFuncElement("CompareExchange32", COMInterlocked::CompareExchange32) + FCFuncElement("CompareExchange32Pointer", COMInterlocked::CompareExchange32) FCFuncElement("CompareExchange64", COMInterlocked::CompareExchange64) FCFuncElement("CompareExchangeObject", COMInterlocked::CompareExchangeObject) FCFuncElement("ExchangeAdd32", COMInterlocked::ExchangeAdd32) diff --git a/src/coreclr/vm/syncblk.h b/src/coreclr/vm/syncblk.h index 1a985e44f21962..19cf57f7b6f9cd 100644 --- a/src/coreclr/vm/syncblk.h +++ b/src/coreclr/vm/syncblk.h @@ -958,12 +958,7 @@ class ObjHeader HeaderLockResult ReleaseHeaderThinLock(DWORD tid); - HeaderLockResult IsHeaderThinLockOwnedByThread(DWORD tid); - friend struct ::cdac_data; - - private: - HeaderLockResult AcquireHeaderThinLockWithSpin(DWORD tid); }; template<> diff --git a/src/coreclr/vm/syncblk.inl b/src/coreclr/vm/syncblk.inl index 1741476441f9cb..245c16fdaa66a8 100644 --- a/src/coreclr/vm/syncblk.inl +++ b/src/coreclr/vm/syncblk.inl @@ -74,128 +74,8 @@ FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLock(DWORD t 
return HeaderLockResult::Success; } - // We failed to acquire the lock in one shot. - // If we only have one processor, don't waste time spinning. - if (g_SystemInfo.dwNumberOfProcessors == 1) - { - return HeaderLockResult::UseSlowPath; - } - - return AcquireHeaderThinLockWithSpin(tid); -} - -inline ObjHeader::HeaderLockResult ObjHeader::AcquireHeaderThinLockWithSpin(DWORD tid) -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_COOPERATIVE; - PRECONDITION(tid <= SBLK_MASK_LOCK_THREADID); - } CONTRACTL_END; - - // - // A few words about spinning choices: - // - // Most locks are not contentious. In fact most locks provide exclusive access safety, but in reality are used by - // one thread. And then there are locks that do see multiple threads, but the accesses are short and not overlapping. - // Thin lock is an optimization for such scenarios. - // - // If we see a thin lock held by other thread for longer than ~5 microseconds, we will "inflate" the lock - // and let the adaptive spinning in the fat Lock sort it out whether we have a contentious lock or a long-held lock. - // - // Another thing to consider is that SpinWait(1) is calibrated to about 35-50 nanoseconds. - // It can take much longer only if nop/pause takes much longer, which it should not, as that would be getting - // close to the RAM latency. - // - // Considering that taking and releasing the lock takes 2 CAS instructions + some overhead, we can estimate shortest - // time the lock can be held to be in hundreds of nanoseconds. Thus it is unlikely to see more than - // 8-10 threads contending for the lock without inflating it. Therefore we can expect to acquire a thin lock in - // under 16 tries. - // - // As for the backoff strategy we have two choices: - // Exponential back-off with a limit: - // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . - // - // Linear back-off - // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, . . . . - // - // In this case these strategies are close in terms of average and worst case latency, so we will prefer linear - // back-off as it favors micro-contention scenario, which we expect. - // - - for (int i = 0; i < 16; ++i) - { - LONG oldValue = m_SyncBlockValue.LoadWithoutBarrier(); - - if ((oldValue & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + - BIT_SBLK_SPIN_LOCK + - SBLK_MASK_LOCK_THREADID + - SBLK_MASK_LOCK_RECLEVEL)) == 0) - { - LONG newValue = oldValue | tid; -#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) - if (FastInterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) -#else - if (InterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) -#endif - { - return HeaderLockResult::Success; - } - - // Someone else just beat us to the lock. - // Try again. - YieldProcessorNormalized(i); - continue; - } - - if (oldValue & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) - { - return HeaderLockResult::UseSlowPath; - } - - // The header is transitioning - use the slow path - if (oldValue & BIT_SBLK_SPIN_LOCK) - { - return HeaderLockResult::UseSlowPath; - } - - // Here we know we have the "thin lock" layout, but the lock is not free. - // It could still be the recursion case - compare the thread id to check - if (tid != (DWORD)(oldValue & SBLK_MASK_LOCK_THREADID)) - { - // Someone else owns the lock. - // Try again. - YieldProcessorNormalized(i); - continue; - } - - // Ok, the thread id matches, it's the recursion case. 
- // Bump up the recursion level and check for overflow - LONG newValue = oldValue + SBLK_LOCK_RECLEVEL_INC; - - if ((newValue & SBLK_MASK_LOCK_RECLEVEL) == 0) - { - return HeaderLockResult::UseSlowPath; - } - -#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) - if (FastInterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) -#else - if (InterlockedCompareExchangeAcquire((LONG*)&m_SyncBlockValue, newValue, oldValue) == oldValue) -#endif - { - return HeaderLockResult::Success; - } - - // Something touched one of the other bits in the header (like the finalizer bits). - // Try again. - YieldProcessorNormalized(i); - } - - // We failed to acquire the lock after spinning. - // Use the slow path to wait. - return HeaderLockResult::UseSlowPath; + // We failed to acquire because someone touched other bits in the header. + return HeaderLockResult::Failure; } // Helper encapsulating the core logic for releasing monitor. Returns what kind of @@ -259,32 +139,4 @@ FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::ReleaseHeaderThinLock(DWORD t return HeaderLockResult::UseSlowPath; } -FORCEINLINE ObjHeader::HeaderLockResult ObjHeader::IsHeaderThinLockOwnedByThread(DWORD tid) -{ - CONTRACTL { - NOTHROW; - GC_NOTRIGGER; - MODE_COOPERATIVE; - } CONTRACTL_END; - - DWORD syncBlockValue = m_SyncBlockValue.LoadWithoutBarrier(); - - // We ignore the header spinlock here. - // Either we'll read the thin-lock data in the header or we'll have a sync block. - // In either case, the two will be consistent. - if ((syncBlockValue & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0) - { - if ((syncBlockValue & SBLK_MASK_LOCK_THREADID) != tid) - { - // This thread does not own the lock. - return HeaderLockResult::Failure; - } - - return HeaderLockResult::Success; - } - - // If has a hash code or syncblock, we cannot determine the lock state from the header. - return HeaderLockResult::UseSlowPath; -} - #endif // _SYNCBLK_INL_ From d98cfa18caf85d2168f33cd40e18b1bd11f9eac2 Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Thu, 25 Dec 2025 15:53:49 -0800 Subject: [PATCH 6/8] Adjust comments and fixup build --- .../src/System/Threading/Interlocked.CoreCLR.cs | 10 +++++----- .../src/System/Threading/Interlocked.cs | 7 +++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs index 3da108deb837a3..7ae24770d07801 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -128,20 +129,19 @@ public static int CompareExchange(ref int location1, int value, int comparand) private static extern int CompareExchange32(ref int location1, int value, int comparand); // This is used internally in cases where having a managed - // ref to the location is unsafe (Ex: it is the syncblock of a pinned object). + // ref to the location is unsafe (Ex: it is the object header of a pinned object). 
// The intrinsic expansion for this overload is exactly the same as for the `ref int` // variant and will go on the same path since expansion is triggered by the name and // return type of the method. - // The important part is avoiding `ref *location` in the unexpanded scenario, like - // in a case when compiling the Debug flavor of the app. + // The important part is avoiding `ref *location` that is reported as byref to the GC. [Intrinsic] internal static unsafe int CompareExchange(int* location1, int value, int comparand) { -#if TARGET_X86 || TARGET_AMD64 || TARGET_ARM64 || TARGET_RISCV64 +#if TARGET_X86 || TARGET_64BIT return CompareExchange(location1, value, comparand); // Must expand intrinsic #else Debug.Assert(location1 != null); - return RuntimeImports.InterlockedCompareExchange(location1, value, comparand); + return CompareExchange32Pointer(location1, value, comparand); #endif } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs index 7596f676c901e8..ca9ea313eb8780 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs @@ -24,13 +24,12 @@ public static int CompareExchange(ref int location1, int value, int comparand) #endif } - // This is used internally by NativeAOT runtime in cases where having a managed - // ref to the location is unsafe (Ex: it is the syncblock of a pinned object). + // This is used internally in cases where having a managed + // ref to the location is unsafe (Ex: it is the object header of a pinned object). // The intrinsic expansion for this overload is exactly the same as for the `ref int` // variant and will go on the same path since expansion is triggered by the name and // return type of the method. - // The important part is avoiding `ref *location` in the unexpanded scenario, like - // in a case when compiling the Debug flavor of the app. + // The important part is avoiding `ref *location` that is reported as byref to the GC. [Intrinsic] internal static unsafe int CompareExchange(int* location1, int value, int comparand) { From d26ade63c60602352e851ad7801f47bef0490b58 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Thu, 25 Dec 2025 18:44:17 -0800 Subject: [PATCH 7/8] Apply suggestions from code review --- .../src/System/Threading/Interlocked.CoreCLR.cs | 4 ++-- .../src/System/Threading/Interlocked.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs index 7ae24770d07801..d4ab6346f83769 100644 --- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs +++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs @@ -133,11 +133,11 @@ public static int CompareExchange(ref int location1, int value, int comparand) // The intrinsic expansion for this overload is exactly the same as for the `ref int` // variant and will go on the same path since expansion is triggered by the name and // return type of the method. - // The important part is avoiding `ref *location` that is reported as byref to the GC. + // The important part is avoiding `ref *location` that is reported as byref to the GC. 
[Intrinsic] internal static unsafe int CompareExchange(int* location1, int value, int comparand) { -#if TARGET_X86 || TARGET_64BIT +#if TARGET_X86 || TARGET_AMD64 || TARGET_ARM64 || TARGET_RISCV64 return CompareExchange(location1, value, comparand); // Must expand intrinsic #else Debug.Assert(location1 != null); diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs index ca9ea313eb8780..5ee8576357a1aa 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Interlocked.cs @@ -29,7 +29,7 @@ public static int CompareExchange(ref int location1, int value, int comparand) // The intrinsic expansion for this overload is exactly the same as for the `ref int` // variant and will go on the same path since expansion is triggered by the name and // return type of the method. - // The important part is avoiding `ref *location` that is reported as byref to the GC. + // The important part is avoiding `ref *location` that is reported as byref to the GC. [Intrinsic] internal static unsafe int CompareExchange(int* location1, int value, int comparand) { From 59df8b7e12ba5285fb96aefd48304c15eb314411 Mon Sep 17 00:00:00 2001 From: Jeremy Koritzinsky Date: Fri, 26 Dec 2025 23:36:53 +0000 Subject: [PATCH 8/8] Use the holder type --- src/coreclr/vm/syncblk.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/coreclr/vm/syncblk.cpp b/src/coreclr/vm/syncblk.cpp index 1e24c6a1b54f0c..c210092c685fdb 100644 --- a/src/coreclr/vm/syncblk.cpp +++ b/src/coreclr/vm/syncblk.cpp @@ -1735,18 +1735,14 @@ OBJECTHANDLE SyncBlock::GetOrCreateLock(OBJECTREF lockObj) // We'll likely need to put this lock object into the sync block. // Create the handle here. - OBJECTHANDLE lockHandle = GetAppDomain()->CreateHandle(lockObj); + OBJECTHANDLEHolder lockHandle = GetAppDomain()->CreateHandle(lockObj); - if (TryUpgradeThinLockToFullLock(lockHandle)) + if (TryUpgradeThinLockToFullLock(lockHandle.GetValue())) { // Our lock instance is the one in the sync block now. - return lockHandle; + return lockHandle.Extract(); } - // Another thread beat us to the upgrade. - // Delete our GC handle. - DestroyHandle(lockHandle); - return VolatileLoad(&m_Lock); }
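
To make the thin-lock header layout used throughout these patches easier to follow, below is a minimal, standalone C# sketch of the fast-path acquire over a raw 32-bit header word. It is illustrative only: it operates on a plain int rather than the runtime's real object header, the type and method names (ThinLockSketch, TryAcquire) are invented for the example, and the bit constants simply restate the values documented in ObjectHeader.CoreCLR.cs above. The ref-int Interlocked.CompareExchange stands in for the pointer-based overload added in patch 5; the real code needs that overload precisely because a managed byref into the header of a (possibly pinned) object must not be reported to the GC.

using System.Threading;

// Simplified model of the thin-lock bit layout described in the patches.
// Hypothetical names; this is not the runtime's ObjectHeader implementation.
internal static class ThinLockSketch
{
    private const int BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX = 1 << 27; // hash code or sync block index present
    private const int BIT_SBLK_SPIN_LOCK = 0x10000000;            // header transition in progress
    private const int SBLK_MASK_LOCK_THREADID = 0x0000FFFF;       // bits 0..15: owning thread id
    private const int SBLK_MASK_LOCK_RECLEVEL = 0x003F0000;       // bits 16..21: recursion level
    private const int SBLK_LOCK_RECLEVEL_INC = 0x00010000;        // one recursion step

    // Try to take the thin lock for the given thread id.
    // Returns true on success; false means the caller must fall back to a full (fat) lock
    // or, on contention, spin-wait and retry before inflating.
    public static bool TryAcquire(ref int header, int threadId)
    {
        if (threadId > SBLK_MASK_LOCK_THREADID)
            return false; // thread id does not fit in the thin-lock bits

        while (true)
        {
            int oldBits = Volatile.Read(ref header);

            // Hash code, sync block index, or an in-progress upgrade: thin lock unusable.
            if ((oldBits & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_SPIN_LOCK)) != 0)
                return false;

            if ((oldBits & SBLK_MASK_LOCK_THREADID) == 0)
            {
                // Unowned: try to claim it, preserving the GC/finalizer bits.
                if (Interlocked.CompareExchange(ref header, oldBits | threadId, oldBits) == oldBits)
                    return true;
                continue; // some other bit changed under us; retry without spinning
            }

            if ((oldBits & SBLK_MASK_LOCK_THREADID) == threadId)
            {
                // Recursive acquire: bump the recursion level and check for overflow.
                int newBits = oldBits + SBLK_LOCK_RECLEVEL_INC;
                if ((newBits & SBLK_MASK_LOCK_RECLEVEL) == 0)
                    return false; // recursion counter overflow -> inflate to a full lock
                if (Interlocked.CompareExchange(ref header, newBits, oldBits) == oldBits)
                    return true;
                continue;
            }

            // Owned by another thread. The patches handle this with a bounded retry loop
            // using linear back-off (Thread.SpinWait(i) / YieldProcessorNormalized(i))
            // before giving up and inflating; this sketch just reports contention.
            return false;
        }
    }
}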