Skip to content

Commit d8a842f

Browse files
committed
MB-54279: Pause / Resume: Unlock vb_mutexes from locking thread
As part of EPBucket::prepareForPause(), all of the vb_mutexes are lock()ed - and left locked: a) To ensure that any in-flight VBucket writes have completed and b) To inhibit any new writes from occurring. Then, when the EPBucket is resumed, all the vb_mutexes are unlock()ed which allows VBucket writes to resume. However, EPBucket::prepareForResume() is not called on the same thread which called prepareForPause() - prepareForPause() runs on a background NonIO thread whereas resume runs synchronously in the front-end thread. As such, we are incorrectly unlocking a mutex from a different thread than the one which locked it - which is Undefined Behaviour - from cppreference.com[1]: void unlock(); Unlocks the mutex. The mutex must be locked by the current thread of execution, otherwise, the behavior is undefined. This is helpfully reported by ThreadSanitizer: WARNING: ThreadSanitizer: unlock of an unlocked mutex (or by a wrong thread) (pid=58528) #0 pthread_mutex_unlock <null> (libtsan.so.0+0x3bf9a) #1 __gthread_mutex_unlock(pthread_mutex_t*) c++/10.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h:779 (memcached+0x5b594f) #2 std::mutex::unlock() c++/10.2.0/bits/std_mutex.h:118 (memcached+0x602555) #3 EPBucket::prepareForResume() kv_engine/engines/ep/src/ep_bucket.cc:2575 (memcached+0x84b94f) #4 EventuallyPersistentEngine::resume() kv_engine/engines/ep/src/ep_engine.cc:7002 (memcached+0x7d52d3) ... Fix by changing how we achieve inhibition of future VBucket writes: - Introduce an EPBucket::paused flag which is set in prepareForPause after all vb_mutexes have been acquired (and hence all in-flight VBucket writes have finished), but then unlock all vb_mutexes before returning from prepareForPause(). - When attempting to acquire a locked VBucket, check the new paused flag before attempting to acquire the vb_mutex - if paused is set then block / return early (for try() variant).
This keeps the required pause behaviour but avoids keeping vb_mutexes locked and having to later unlock (on a different thread). [1]: https://en.cppreference.com/w/cpp/thread/mutex Change-Id: I062583951a101a866866b79dfd6329672bb4ff42 Reviewed-on: https://review.couchbase.org/c/kv_engine/+/182099 Tested-by: Build Bot <[email protected]> Reviewed-by: Jim Walker <[email protected]>
1 parent 09969cd commit d8a842f

File tree

4 files changed

+41
-8
lines changed

4 files changed

+41
-8
lines changed

engines/ep/src/ep_bucket.cc

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,12 @@ void EPBucket::initializeShards() {
324324
}
325325

326326
void EPBucket::deinitialize() {
327+
// If Bucket is currently paused; need to resume to allow flushers
328+
// etc to complete.
329+
if (paused) {
330+
prepareForResume();
331+
}
332+
327333
stopFlusher();
328334

329335
allVbucketsDeinitialize();
@@ -2498,11 +2504,16 @@ cb::engine_errc EPBucket::prepareForPause() {
24982504
// acquire _all_ of them here, which means any of the above in-flight
24992505
// operations will have to complete before we continue - and that no
25002506
// more can begin.
2507+
// Once all preparations have been completed; we set EPBucket::paused
2508+
// to true and unlock the mutexes - any future attempts to lock them
2509+
// will be blocked until EPBucket::paused is cleared
2510+
// (via prepareForResume()).
25012511
EP_LOG_DEBUG_RAW(
25022512
"EPBucket::prepareForPause: waiting for in-flight Flusher, "
25032513
"Rollback, DeleteVB tasks to complete");
2514+
std::vector<std::unique_lock<std::mutex>> vb_locks;
25042515
for (auto& mutex : vb_mutexes) {
2505-
mutex.lock();
2516+
vb_locks.emplace_back(mutex);
25062517
}
25072518

25082519
// b) Compaction - This only requires that the appropriate vb_mutexes is
@@ -2551,7 +2562,15 @@ cb::engine_errc EPBucket::prepareForPause() {
25512562
}
25522563
allSuccess &= success;
25532564
});
2554-
return allSuccess ? cb::engine_errc::success : cb::engine_errc::failed;
2565+
2566+
if (allSuccess) {
2567+
// Successfully prepared for pausing; set paused flag to true before
2568+
// we unlock all the vb_mutexes; that will inhibit anyone from acquiring
2569+
// the mutexes again until paused is set to false.
2570+
paused.store(true);
2571+
return cb::engine_errc::success;
2572+
}
2573+
return cb::engine_errc::failed;
25552574
}
25562575

25572576
cb::engine_errc EPBucket::prepareForResume() {
@@ -2567,17 +2586,16 @@ cb::engine_errc EPBucket::prepareForResume() {
25672586
});
25682587

25692588
// 2. Resume ep-engine operations.
2570-
// a) Unblock disk writing operations from ep-engine.
2589+
// a) Clear EPBucket::paused so disk writing operations can
2590+
// resume.
25712591
EP_LOG_DEBUG_RAW(
2572-
"EPBucket::prepareForPause: unblocking all Flusher, "
2592+
"EPBucket::prepareForResume: unblocking all Flusher, "
25732593
"Rollback, DeleteVB tasks.");
2574-
for (auto& mutex : vb_mutexes) {
2575-
mutex.unlock();
2576-
}
2594+
paused.store(false);
25772595

25782596
// b) Reset compaction concurrency
25792597
EP_LOG_DEBUG_RAW(
2580-
"EPBucket::prepareForPause: resuming all Compaction tasks");
2598+
"EPBucket::prepareForResume: resuming all Compaction tasks");
25812599
compactionSemaphore->release();
25822600
updateCompactionConcurrency();
25832601

engines/ep/src/kv_bucket.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ KVBucket::KVBucket(EventuallyPersistentEngine& theEngine)
291291
itemCompressorTask(nullptr),
292292
itemFreqDecayerTask(nullptr),
293293
vb_mutexes(engine.getConfiguration().getMaxVbuckets()),
294+
paused(false),
294295
backfillMemoryThreshold(0.95),
295296
lastTransTimePerItem(0),
296297
collectionsManager(std::make_shared<Collections::Manager>()),

engines/ep/src/kv_bucket.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@ class KVBucket : public KVBucketIface {
254254
* alongside a shared pointer to the requested VBucket.
255255
*/
256256
LockedVBucketPtr getLockedVBucket(Vbid vbid) {
257+
// While the Bucket is paused, no Locked VBuckets can be acquired.
258+
while (paused.load()) {
259+
std::this_thread::yield();
260+
}
257261
std::unique_lock<std::mutex> lock(vb_mutexes[vbid.get()]);
258262
return {vbMap.getBucket(vbid), std::move(lock)};
259263
}
@@ -270,6 +274,10 @@ class KVBucket : public KVBucketIface {
270274
* successfully acquired a locked VBucket.
271275
*/
272276
LockedVBucketPtr getLockedVBucket(Vbid vbid, std::try_to_lock_t) {
277+
// While the Bucket is paused, no Locked VBuckets can be acquired.
278+
if (paused.load()) {
279+
return {};
280+
}
273281
std::unique_lock<std::mutex> lock(vb_mutexes[vbid.get()],
274282
std::try_to_lock);
275283
if (!lock) {
@@ -1178,6 +1186,10 @@ class KVBucket : public KVBucketIface {
11781186
* Used by flush operations: flushVB, deleteVB, compactVB, snapshotVB */
11791187
std::vector<std::mutex> vb_mutexes;
11801188

1189+
/// Is this Bucket currently paused? If true, inhibits any of the vb_mutexes
1190+
/// from being acquired.
1191+
std::atomic_bool paused;
1192+
11811193
std::mutex vbsetMutex;
11821194
double backfillMemoryThreshold;
11831195

engines/ep/src/locked_vbucket_ptr.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
*/
3030
class LockedVBucketPtr {
3131
public:
32+
LockedVBucketPtr() = default;
33+
3234
LockedVBucketPtr(VBucketPtr vb, std::unique_lock<std::mutex>&& lock)
3335
: vb(std::move(vb)), lock(std::move(lock)) {
3436
}

0 commit comments

Comments
 (0)