Skip to content

Commit ef0a8df

Browse files
committed
[core] Implement RecurseCountsThreadLocal to be used in gCoreMutex
`RecurseCountsThreadLocal` assumes that at most two instances are ever created per process. This allows it to keep its per-thread counters in a fixed-size thread-local array indexed by instance id, so it *should* be as fast as or faster than `RecurseCountsTBBUnique`, removing Core's dependence on TBB without compromising performance.
1 parent ddc834a commit ef0a8df

File tree

5 files changed

+80
-44
lines changed

5 files changed

+80
-44
lines changed

core/thread/src/TRWMutexImp.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ template class TRWMutexImp<TMutex, ROOT::Internal::UniqueLockRecurseCount>;
115115
template class TRWMutexImp<ROOT::TSpinMutex, ROOT::Internal::UniqueLockRecurseCount>;
116116

117117
#ifdef R__HAS_TBB
118-
template class TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsTBB>;
118+
template class TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsThreadLocal>;
119119
template class TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsTBBUnique>;
120120
#endif
121121

core/thread/src/TReentrantRWLock.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ template class TReentrantRWLock<TMutex, ROOT::Internal::UniqueLockRecurseCount>;
415415
template class TReentrantRWLock<std::mutex, ROOT::Internal::UniqueLockRecurseCount>;
416416

417417
#ifdef R__HAS_TBB
418-
template class TReentrantRWLock<std::mutex, ROOT::Internal::RecurseCountsTBB>;
418+
template class TReentrantRWLock<std::mutex, ROOT::Internal::RecurseCountsThreadLocal>;
419419
template class TReentrantRWLock<std::mutex, ROOT::Internal::RecurseCountsTBBUnique>;
420420
#endif
421421
}

core/thread/src/TReentrantRWLock.hxx

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <atomic>
2121
#include <condition_variable>
2222
#include <thread>
23+
#include <stdexcept>
2324
#include <unordered_map>
2425

2526
#ifdef R__HAS_TBB
@@ -152,20 +153,40 @@ struct RecurseCounts {
152153

153154
};
154155

155-
#ifdef R__HAS_TBB
156-
struct RecurseCountsTBB {
156+
// This class is similar to RecurseCountsTBBUnique, but it doesn't use
157+
// different TLS keys for each instance of this class - just like
158+
// tbb::enumerable_thread_specific would do with tbb::ets_no_key.
159+
//
160+
// Instead of implementing the logic with TBB, this class implements it in
161+
// standard C++, using a fixed-size array of thread-local statics for the
162+
// data. This allows for fast lookups, just as if using different TLS keys per
163+
// instance (the alternative would have been the slower std::unordered_map).
164+
//
165+
// We can make this optimization because we know that only two instances of
166+
// this class will ever be created: one for gCoreMutex, and one for testing.
167+
//
168+
// Like this, we can reach the performance of tbb::enumerable_thread_specific
169+
// with tbb::ets_key_per_instance (as implemented in RecurseCountsTBBUnique),
170+
// but without depending on TBB.
171+
struct RecurseCountsThreadLocal {
172+
173+
RecurseCountsThreadLocal() : fId{nextId()} {}
174+
157175
using Hint_t = TVirtualRWMutex::Hint_t;
158176

159177
struct LocalCounts {
160178
size_t fReadersCount = 0;
161179
bool fIsWriter = false;
162180
};
163-
tbb::enumerable_thread_specific<LocalCounts> fLocalCounts;
164181
size_t fWriteRecurse = 0; ///<! Number of re-entry in the lock by the same thread.
165182

166183
using local_t = LocalCounts *;
167184

168-
local_t GetLocal() { return &fLocalCounts.local(); }
185+
local_t GetLocal() {
186+
// O(1) lookup with minimal overhead thanks to std::array
187+
static thread_local std::array<LocalCounts, nMaxInstances> locals;
188+
return &locals[fId];
189+
}
169190

170191
Hint_t *IncrementReadCount(local_t &local)
171192
{
@@ -210,8 +231,27 @@ struct RecurseCountsTBB {
210231
void ResetIsWriter(local_t &local) { local->fIsWriter = false; }
211232

212233
size_t &GetLocalReadersCount(local_t &local) { return local->fReadersCount; }
234+
235+
private:
236+
// Only two instances are allowed to be created: one for gCoreMutex, and
237+
// one for testing in testRWLock.cxx.
238+
static constexpr std::size_t nMaxInstances = 2;
239+
240+
static size_t nextId()
241+
{
242+
static std::atomic<size_t> counter{0};
243+
size_t cnt = counter++;
244+
if (cnt >= nMaxInstances) {
245+
throw std::runtime_error(
246+
"Maximum number of ROOT::Internal::RecurseCountsThreadLocal instances reached!");
247+
}
248+
return cnt;
249+
}
250+
251+
size_t fId = 0;
213252
};
214253

254+
#ifdef R__HAS_TBB
215255
struct RecurseCountsTBBUnique {
216256
using Hint_t = TVirtualRWMutex::Hint_t;
217257

core/thread/src/TThread.cxx

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -347,11 +347,7 @@ void TThread::Init()
347347
if (!ROOT::gCoreMutex) {
348348
// To avoid deadlocks caused by shared library opening and/or static initialization
349349
// taking the same lock as 'tls_get_addr_tail', we cannot use UniqueLockRecurseCount.
350-
#ifdef R__HAS_TBB
351-
ROOT::gCoreMutex = new ROOT::TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsTBBUnique>();
352-
#else
353-
ROOT::gCoreMutex = new ROOT::TRWMutexImp<std::mutex, ROOT::Internal::RecurseCounts>();
354-
#endif
350+
ROOT::gCoreMutex = new ROOT::TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsThreadLocal>();
355351
}
356352
gInterpreterMutex = ROOT::gCoreMutex;
357353
gROOTMutex = gInterpreterMutex;

core/thread/test/testRWLock.cxx

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -250,15 +250,15 @@ auto gMutex = new TMutex(kTRUE);
250250
auto gRWMutex = new TRWMutexImp<TMutex>();
251251
auto gRWMutexSpin = new TRWMutexImp<ROOT::TSpinMutex>();
252252
auto gRWMutexStd = new TRWMutexImp<std::mutex>();
253+
auto gRWMutexStdThreadLocal = new TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsThreadLocal>();
253254
#ifdef R__HAS_TBB
254-
auto gRWMutexStdTBB = new TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsTBB>();
255255
auto gRWMutexStdTBBUnique = new TRWMutexImp<std::mutex, ROOT::Internal::RecurseCountsTBBUnique>();
256256
#endif
257257
auto gReentrantRWMutex = new ROOT::TReentrantRWLock<TMutex>();
258258
auto gReentrantRWMutexSM = new ROOT::TReentrantRWLock<ROOT::TSpinMutex>();
259259
auto gReentrantRWMutexStd = new ROOT::TReentrantRWLock<std::mutex>();
260+
auto gReentrantRWMutexStdThreadLocal = new ROOT::TReentrantRWLock<std::mutex, ROOT::Internal::RecurseCountsThreadLocal>();
260261
#ifdef R__HAS_TBB
261-
auto gReentrantRWMutexStdTBB = new ROOT::TReentrantRWLock<std::mutex, ROOT::Internal::RecurseCountsTBB>();
262262
auto gReentrantRWMutexStdTBBUnique = new ROOT::TReentrantRWLock<std::mutex, ROOT::Internal::RecurseCountsTBBUnique>();
263263
#endif
264264
auto gSpinMutex = new ROOT::TSpinMutex();
@@ -335,17 +335,17 @@ TEST(RWLock, WriteStdDirectUnLock)
335335
testWriteUnLock(gReentrantRWMutexStd, gRepetition, gWriteHint);
336336
}
337337

338-
#ifdef R__HAS_TBB
339-
TEST(RWLock, WriteStdTBBDirectLock)
338+
TEST(RWLock, WriteStdThreadLocalDirectLock)
340339
{
341-
gWriteHint = testWriteLock(gReentrantRWMutexStdTBB, gRepetition);
340+
gWriteHint = testWriteLock(gReentrantRWMutexStdThreadLocal, gRepetition);
342341
}
343342

344-
TEST(RWLock, WriteStdTBBDirectUnLock)
343+
TEST(RWLock, WriteStdThreadLocalDirectUnLock)
345344
{
346-
testWriteUnLock(gReentrantRWMutexStdTBB, gRepetition, gWriteHint);
345+
testWriteUnLock(gReentrantRWMutexStdThreadLocal, gRepetition, gWriteHint);
347346
}
348347

348+
#ifdef R__HAS_TBB
349349
TEST(RWLock, WriteStdTBBUniqueDirectLock)
350350
{
351351
gWriteHint = testWriteLock(gReentrantRWMutexStdTBBUnique, gRepetition);
@@ -387,17 +387,17 @@ TEST(RWLock, ReadUnLockStdDirect)
387387
testReadUnLock(gReentrantRWMutexStd, gRepetition, gReadHint);
388388
}
389389

390-
#ifdef R__HAS_TBB
391-
TEST(RWLock, ReadLockStdTBBDirect)
390+
TEST(RWLock, ReadLockStdThreadLocalDirect)
392391
{
393-
gReadHint = testReadLock(gReentrantRWMutexStdTBB, gRepetition);
392+
gReadHint = testReadLock(gReentrantRWMutexStdThreadLocal, gRepetition);
394393
}
395394

396-
TEST(RWLock, ReadUnLockStdTBBDirect)
395+
TEST(RWLock, ReadUnLockStdThreadLocalDirect)
397396
{
398-
testReadUnLock(gReentrantRWMutexStdTBB, gRepetition, gReadHint);
397+
testReadUnLock(gReentrantRWMutexStdThreadLocal, gRepetition, gReadHint);
399398
}
400399

400+
#ifdef R__HAS_TBB
401401
TEST(RWLock, ReadLockStdTBBUniqueDirect)
402402
{
403403
gReadHint = testReadLock(gReentrantRWMutexStdTBBUnique, gRepetition);
@@ -494,12 +494,12 @@ TEST(RWLock, ReentrantStd)
494494
Reentrant(*gReentrantRWMutexStd);
495495
}
496496

497-
#ifdef R__HAS_TBB
498-
TEST(RWLock, ReentrantStdTBB)
497+
TEST(RWLock, ReentrantStdThreadLocal)
499498
{
500-
Reentrant(*gReentrantRWMutexStdTBB);
499+
Reentrant(*gReentrantRWMutexStdThreadLocal);
501500
}
502501

502+
#ifdef R__HAS_TBB
503503
TEST(RWLock, ReentrantStdTBBUnique)
504504
{
505505
Reentrant(*gReentrantRWMutexStdTBBUnique);
@@ -531,12 +531,12 @@ TEST(RWLock, ResetRestoreStd)
531531
ResetRestore(*gReentrantRWMutexStd);
532532
}
533533

534-
#ifdef R__HAS_TBB
535-
TEST(RWLock, ResetRestoreStdTBB)
534+
TEST(RWLock, ResetRestoreStdThreadLocal)
536535
{
537-
ResetRestore(*gReentrantRWMutexStdTBB);
536+
ResetRestore(*gReentrantRWMutexStdThreadLocal);
538537
}
539538

539+
#ifdef R__HAS_TBB
540540
TEST(RWLock, ResetRestoreStdTBBUnique)
541541
{
542542
ResetRestore(*gReentrantRWMutexStdTBBUnique);
@@ -579,12 +579,12 @@ TEST(RWLock, concurrentResetRestoreStd)
579579
concurrentResetRestore(gRWMutexStd, 2, gRepetition / 10000);
580580
}
581581

582-
#ifdef R__HAS_TBB
583-
TEST(RWLock, concurrentResetRestoreStdTBB)
582+
TEST(RWLock, concurrentResetRestoreStdThreadLocal)
584583
{
585-
concurrentResetRestore(gRWMutexStdTBB, 2, gRepetition / 10000);
584+
concurrentResetRestore(gRWMutexStdThreadLocal, 2, gRepetition / 10000);
586585
}
587586

587+
#ifdef R__HAS_TBB
588588
TEST(RWLock, concurrentResetRestoreStdTBBUnique)
589589
{
590590
concurrentResetRestore(gRWMutexStdTBBUnique, 2, gRepetition / 10000);
@@ -629,12 +629,12 @@ TEST(RWLock, concurrentReadsAndWritesStd)
629629
concurrentReadsAndWrites(gRWMutexStd, 1, 2, gRepetition / 10000);
630630
}
631631

632-
#ifdef R__HAS_TBB
633-
TEST(RWLock, concurrentReadsAndWritesStdTBB)
632+
TEST(RWLock, concurrentReadsAndWritesStdThreadLocal)
634633
{
635-
concurrentReadsAndWrites(gRWMutexStdTBB, 1, 2, gRepetition / 10000);
634+
concurrentReadsAndWrites(gRWMutexStdThreadLocal, 1, 2, gRepetition / 10000);
636635
}
637636

637+
#ifdef R__HAS_TBB
638638
TEST(RWLock, concurrentReadsAndWritesStdTBBUnique)
639639
{
640640
concurrentReadsAndWrites(gRWMutexStdTBBUnique, 1, 2, gRepetition / 10000);
@@ -651,12 +651,12 @@ TEST(RWLock, LargeconcurrentReadsAndWritesStd)
651651
concurrentReadsAndWrites(gRWMutex, 10, 20, gRepetition / 10000);
652652
}
653653

654-
#ifdef R__HAS_TBB
655-
TEST(RWLock, LargeconcurrentReadsAndWritesStdTBB)
654+
TEST(RWLock, LargeconcurrentReadsAndWritesStdThreadLocal)
656655
{
657-
concurrentReadsAndWrites(gRWMutexStdTBB, 10, 20, gRepetition / 10000);
656+
concurrentReadsAndWrites(gRWMutexStdThreadLocal, 10, 20, gRepetition / 10000);
658657
}
659658

659+
#ifdef R__HAS_TBB
660660
TEST(RWLock, LargeconcurrentReadsAndWritesStdTBBUnique)
661661
{
662662
concurrentReadsAndWrites(gRWMutexStdTBBUnique, 10, 20, gRepetition / 10000);
@@ -678,12 +678,12 @@ TEST(RWLock, VeryLargeconcurrentReadsAndWritesStd)
678678
concurrentReadsAndWrites(gRWMutexStd, 10, 200, gRepetition / 10000);
679679
}
680680

681-
#ifdef R__HAS_TBB
682-
TEST(RWLock, VeryLargeconcurrentReadsAndWritesStdTBB)
681+
TEST(RWLock, VeryLargeconcurrentReadsAndWritesStdThreadLocal)
683682
{
684-
concurrentReadsAndWrites(gRWMutexStdTBB, 10, 200, gRepetition / 10000);
683+
concurrentReadsAndWrites(gRWMutexStdThreadLocal, 10, 200, gRepetition / 10000);
685684
}
686685

686+
#ifdef R__HAS_TBB
687687
TEST(RWLock, VeryLargeconcurrentReadsAndWritesStdTBBUnique)
688688
{
689689
concurrentReadsAndWrites(gRWMutexStdTBBUnique, 10, 200, gRepetition / 10000);
@@ -706,12 +706,12 @@ TEST(RWLock, VeryLargeconcurrentReadsStd)
706706
concurrentReadsAndWrites(gRWMutexStd, 0, 200, gRepetition / 10000);
707707
}
708708

709-
#ifdef R__HAS_TBB
710-
TEST(RWLock, VeryLargeconcurrentReadsStdTBB)
709+
TEST(RWLock, VeryLargeconcurrentReadsStdThreadLocal)
711710
{
712-
concurrentReadsAndWrites(gRWMutexStdTBB, 0, 200, gRepetition / 10000);
711+
concurrentReadsAndWrites(gRWMutexStdThreadLocal, 0, 200, gRepetition / 10000);
713712
}
714713

714+
#ifdef R__HAS_TBB
715715
TEST(RWLock, VeryLargeconcurrentReadsStdTBBUnique)
716716
{
717717
concurrentReadsAndWrites(gRWMutexStdTBBUnique, 0, 200, gRepetition / 10000);

0 commit comments

Comments
 (0)