diff --git a/.github/workflows/root-ci-config/buildconfig/global.txt b/.github/workflows/root-ci-config/buildconfig/global.txt index 8120346124569..0b758433efe44 100644 --- a/.github/workflows/root-ci-config/buildconfig/global.txt +++ b/.github/workflows/root-ci-config/buildconfig/global.txt @@ -10,6 +10,7 @@ mathmore=ON minuit2_omp=ON roofit_multiprocess=ON roottest=ON +tbb=ON test_distrdf_dask=ON test_distrdf_pyspark=ON testing=ON diff --git a/core/thread/src/TRWMutexImp.cxx b/core/thread/src/TRWMutexImp.cxx index 174a00789d4e5..81a2bd88685f0 100644 --- a/core/thread/src/TRWMutexImp.cxx +++ b/core/thread/src/TRWMutexImp.cxx @@ -115,7 +115,7 @@ template class TRWMutexImp; template class TRWMutexImp; #ifdef R__HAS_TBB -template class TRWMutexImp; +template class TRWMutexImp; template class TRWMutexImp; #endif diff --git a/core/thread/src/TReentrantRWLock.cxx b/core/thread/src/TReentrantRWLock.cxx index e18ec0e466c07..e1edeec42827e 100644 --- a/core/thread/src/TReentrantRWLock.cxx +++ b/core/thread/src/TReentrantRWLock.cxx @@ -415,7 +415,7 @@ template class TReentrantRWLock; template class TReentrantRWLock; #ifdef R__HAS_TBB -template class TReentrantRWLock; +template class TReentrantRWLock; template class TReentrantRWLock; #endif } diff --git a/core/thread/src/TReentrantRWLock.hxx b/core/thread/src/TReentrantRWLock.hxx index 91140c0be730a..53f48de1a7d58 100644 --- a/core/thread/src/TReentrantRWLock.hxx +++ b/core/thread/src/TReentrantRWLock.hxx @@ -20,6 +20,7 @@ #include #include #include +#include #include #ifdef R__HAS_TBB @@ -152,20 +153,40 @@ struct RecurseCounts { }; -#ifdef R__HAS_TBB -struct RecurseCountsTBB { +// This class is similar to RecurseCountsTBBUnique, but it doesn't use +// different TLS keys for each instance of this class - just like +// tbb::enumerable_thread_specific would do with tbb::ets_no_key. +// +// Instead of implementing the logic with TBB, this class implements it in +// standard C++, using a fixed-sized array of thread local statics for the +// data. This allows for fast lookups, just as if using different TLS keys per +// instance (the alternative would have been the slower std::unordered_map). +// +// We can make this optimization because we know that only two instances of +// this class will ever be created: one for gCoreMutex, and one for testing. +// +// Like this, we can reach the performance of tbb::enumerable_thread_specific +// with tbb::ets_key_per_instance (as implemented in RecurseCountsTBBUnique), +// but without depending on TBB. +struct RecurseCountsThreadLocal { + + RecurseCountsThreadLocal() : fId{nextId()} {} + using Hint_t = TVirtualRWMutex::Hint_t; struct LocalCounts { size_t fReadersCount = 0; bool fIsWriter = false; }; - tbb::enumerable_thread_specific fLocalCounts; size_t fWriteRecurse = 0; /// locals; + return &locals[fId]; + } Hint_t *IncrementReadCount(local_t &local) { @@ -210,8 +231,27 @@ struct RecurseCountsTBB { void ResetIsWriter(local_t &local) { local->fIsWriter = false; } size_t &GetLocalReadersCount(local_t &local) { return local->fReadersCount; } + +private: + // Only two instances are allowed to be created: one for gCoreMutex, and + // one for testing in testRWLock.cxx. + static constexpr std::size_t nMaxInstances = 2; + + static size_t nextId() + { + static std::atomic counter{0}; + size_t cnt = counter++; + if (cnt >= nMaxInstances) { + throw std::runtime_error( + "Maximum number of ROOT::Internal::RecurseCountsThreadLocal instances reached!"); + } + return cnt; + } + + size_t fId = 0; }; +#ifdef R__HAS_TBB struct RecurseCountsTBBUnique { using Hint_t = TVirtualRWMutex::Hint_t; diff --git a/core/thread/src/TThread.cxx b/core/thread/src/TThread.cxx index da3a4a07db065..c3bbb784c546d 100644 --- a/core/thread/src/TThread.cxx +++ b/core/thread/src/TThread.cxx @@ -347,11 +347,7 @@ void TThread::Init() if (!ROOT::gCoreMutex) { // To avoid dead locks, caused by shared library opening and/or static initialization // taking the same lock as 'tls_get_addr_tail', we can not use UniqueLockRecurseCount. -#ifdef R__HAS_TBB - ROOT::gCoreMutex = new ROOT::TRWMutexImp(); -#else - ROOT::gCoreMutex = new ROOT::TRWMutexImp(); -#endif + ROOT::gCoreMutex = new ROOT::TRWMutexImp(); } gInterpreterMutex = ROOT::gCoreMutex; gROOTMutex = gInterpreterMutex; diff --git a/core/thread/test/testRWLock.cxx b/core/thread/test/testRWLock.cxx index 7e2eb80814846..50bcef7573ee3 100644 --- a/core/thread/test/testRWLock.cxx +++ b/core/thread/test/testRWLock.cxx @@ -250,15 +250,15 @@ auto gMutex = new TMutex(kTRUE); auto gRWMutex = new TRWMutexImp(); auto gRWMutexSpin = new TRWMutexImp(); auto gRWMutexStd = new TRWMutexImp(); +auto gRWMutexStdThreadLocal = new TRWMutexImp(); #ifdef R__HAS_TBB -auto gRWMutexStdTBB = new TRWMutexImp(); auto gRWMutexStdTBBUnique = new TRWMutexImp(); #endif auto gReentrantRWMutex = new ROOT::TReentrantRWLock(); auto gReentrantRWMutexSM = new ROOT::TReentrantRWLock(); auto gReentrantRWMutexStd = new ROOT::TReentrantRWLock(); +auto gReentrantRWMutexStdThreadLocal = new ROOT::TReentrantRWLock(); #ifdef R__HAS_TBB -auto gReentrantRWMutexStdTBB = new ROOT::TReentrantRWLock(); auto gReentrantRWMutexStdTBBUnique = new ROOT::TReentrantRWLock(); #endif auto gSpinMutex = new ROOT::TSpinMutex(); @@ -335,17 +335,17 @@ TEST(RWLock, WriteStdDirectUnLock) testWriteUnLock(gReentrantRWMutexStd, gRepetition, gWriteHint); } -#ifdef R__HAS_TBB -TEST(RWLock, WriteStdTBBDirectLock) +TEST(RWLock, WriteStdThreadLocalDirectLock) { - gWriteHint = testWriteLock(gReentrantRWMutexStdTBB, gRepetition); + gWriteHint = testWriteLock(gReentrantRWMutexStdThreadLocal, gRepetition); } -TEST(RWLock, WriteStdTBBDirectUnLock) +TEST(RWLock, WriteStdThreadLocalDirectUnLock) { - testWriteUnLock(gReentrantRWMutexStdTBB, gRepetition, gWriteHint); + testWriteUnLock(gReentrantRWMutexStdThreadLocal, gRepetition, gWriteHint); } +#ifdef R__HAS_TBB TEST(RWLock, WriteStdTBBUniqueDirectLock) { gWriteHint = testWriteLock(gReentrantRWMutexStdTBBUnique, gRepetition); @@ -387,17 +387,17 @@ TEST(RWLock, ReadUnLockStdDirect) testReadUnLock(gReentrantRWMutexStd, gRepetition, gReadHint); } -#ifdef R__HAS_TBB -TEST(RWLock, ReadLockStdTBBDirect) +TEST(RWLock, ReadLockStdThreadLocalDirect) { - gReadHint = testReadLock(gReentrantRWMutexStdTBB, gRepetition); + gReadHint = testReadLock(gReentrantRWMutexStdThreadLocal, gRepetition); } -TEST(RWLock, ReadUnLockStdTBBDirect) +TEST(RWLock, ReadUnLockStdThreadLocalDirect) { - testReadUnLock(gReentrantRWMutexStdTBB, gRepetition, gReadHint); + testReadUnLock(gReentrantRWMutexStdThreadLocal, gRepetition, gReadHint); } +#ifdef R__HAS_TBB TEST(RWLock, ReadLockStdTBBUniqueDirect) { gReadHint = testReadLock(gReentrantRWMutexStdTBBUnique, gRepetition); @@ -494,12 +494,12 @@ TEST(RWLock, ReentrantStd) Reentrant(*gReentrantRWMutexStd); } -#ifdef R__HAS_TBB -TEST(RWLock, ReentrantStdTBB) +TEST(RWLock, ReentrantStdThreadLocal) { - Reentrant(*gReentrantRWMutexStdTBB); + Reentrant(*gReentrantRWMutexStdThreadLocal); } +#ifdef R__HAS_TBB TEST(RWLock, ReentrantStdTBBUnique) { Reentrant(*gReentrantRWMutexStdTBBUnique); @@ -531,12 +531,12 @@ TEST(RWLock, ResetRestoreStd) ResetRestore(*gReentrantRWMutexStd); } -#ifdef R__HAS_TBB -TEST(RWLock, ResetRestoreStdTBB) +TEST(RWLock, ResetRestoreStdThreadLocal) { - ResetRestore(*gReentrantRWMutexStdTBB); + ResetRestore(*gReentrantRWMutexStdThreadLocal); } +#ifdef R__HAS_TBB TEST(RWLock, ResetRestoreStdTBBUnique) { ResetRestore(*gReentrantRWMutexStdTBBUnique); @@ -579,12 +579,12 @@ TEST(RWLock, concurrentResetRestoreStd) concurrentResetRestore(gRWMutexStd, 2, gRepetition / 10000); } -#ifdef R__HAS_TBB -TEST(RWLock, concurrentResetRestoreStdTBB) +TEST(RWLock, concurrentResetRestoreStdThreadLocal) { - concurrentResetRestore(gRWMutexStdTBB, 2, gRepetition / 10000); + concurrentResetRestore(gRWMutexStdThreadLocal, 2, gRepetition / 10000); } +#ifdef R__HAS_TBB TEST(RWLock, concurrentResetRestoreStdTBBUnique) { concurrentResetRestore(gRWMutexStdTBBUnique, 2, gRepetition / 10000); @@ -629,12 +629,12 @@ TEST(RWLock, concurrentReadsAndWritesStd) concurrentReadsAndWrites(gRWMutexStd, 1, 2, gRepetition / 10000); } -#ifdef R__HAS_TBB -TEST(RWLock, concurrentReadsAndWritesStdTBB) +TEST(RWLock, concurrentReadsAndWritesStdThreadLocal) { - concurrentReadsAndWrites(gRWMutexStdTBB, 1, 2, gRepetition / 10000); + concurrentReadsAndWrites(gRWMutexStdThreadLocal, 1, 2, gRepetition / 10000); } +#ifdef R__HAS_TBB TEST(RWLock, concurrentReadsAndWritesStdTBBUnique) { concurrentReadsAndWrites(gRWMutexStdTBBUnique, 1, 2, gRepetition / 10000); @@ -651,12 +651,12 @@ TEST(RWLock, LargeconcurrentReadsAndWritesStd) concurrentReadsAndWrites(gRWMutex, 10, 20, gRepetition / 10000); } -#ifdef R__HAS_TBB -TEST(RWLock, LargeconcurrentReadsAndWritesStdTBB) +TEST(RWLock, LargeconcurrentReadsAndWritesStdThreadLocal) { - concurrentReadsAndWrites(gRWMutexStdTBB, 10, 20, gRepetition / 10000); + concurrentReadsAndWrites(gRWMutexStdThreadLocal, 10, 20, gRepetition / 10000); } +#ifdef R__HAS_TBB TEST(RWLock, LargeconcurrentReadsAndWritesStdTBBUnique) { concurrentReadsAndWrites(gRWMutexStdTBBUnique, 10, 20, gRepetition / 10000); @@ -678,12 +678,12 @@ TEST(RWLock, VeryLargeconcurrentReadsAndWritesStd) concurrentReadsAndWrites(gRWMutexStd, 10, 200, gRepetition / 10000); } -#ifdef R__HAS_TBB -TEST(RWLock, VeryLargeconcurrentReadsAndWritesStdTBB) +TEST(RWLock, VeryLargeconcurrentReadsAndWritesStdThreadLocal) { - concurrentReadsAndWrites(gRWMutexStdTBB, 10, 200, gRepetition / 10000); + concurrentReadsAndWrites(gRWMutexStdThreadLocal, 10, 200, gRepetition / 10000); } +#ifdef R__HAS_TBB TEST(RWLock, VeryLargeconcurrentReadsAndWritesStdTBBUnique) { concurrentReadsAndWrites(gRWMutexStdTBBUnique, 10, 200, gRepetition / 10000); @@ -706,12 +706,12 @@ TEST(RWLock, VeryLargeconcurrentReadsStd) concurrentReadsAndWrites(gRWMutexStd, 0, 200, gRepetition / 10000); } -#ifdef R__HAS_TBB -TEST(RWLock, VeryLargeconcurrentReadsStdTBB) +TEST(RWLock, VeryLargeconcurrentReadsStdThreadLocal) { - concurrentReadsAndWrites(gRWMutexStdTBB, 0, 200, gRepetition / 10000); + concurrentReadsAndWrites(gRWMutexStdThreadLocal, 0, 200, gRepetition / 10000); } +#ifdef R__HAS_TBB TEST(RWLock, VeryLargeconcurrentReadsStdTBBUnique) { concurrentReadsAndWrites(gRWMutexStdTBBUnique, 0, 200, gRepetition / 10000);