Skip to content

Commit 3ec11d6

Browse files
committed
[BP] MB-59601: Fix data race in CheckpointManager::takeAndResetCursors
The method did not take a queueLock and could mutate the CheckpointManager
while it is being accessed, e.g. in CheckpointManager::getListOfCursorsToDrop.

CheckpointMemRecoveryTask calls getListOfCursorsToDrop, which iterates
CM::cursors. A concurrent RollbackTask can result in resetting the vbucket
and calling CM::takeAndResetCursors, which, among other things, mutates
CM::cursors.

WARNING: ThreadSanitizer: data race (pid=60355)
  Write of size 8 at 0x00010d1a5e68 by main thread (mutexes: write M0, write M1, write M2):
    #0 CheckpointManager::takeAndResetCursors(CheckpointManager&) checkpoint_manager.cc:1856 (ep-engine_ep_unit_tests:arm64+0x1003795b4)
    #1 KVBucket::resetVBucket_UNLOCKED(LockedVBucketPtr&, std::__1::unique_lock<std::__1::mutex>&) kv_bucket.cc:1271 (ep-engine_ep_unit_tests:arm64+0x1001da918)
    #2 KVBucket::rollback(Vbid, unsigned long long) kv_bucket.cc:2671 (ep-engine_ep_unit_tests:arm64+0x1001e8404)
    #3 CheckpointRemoverTest_MB59601_Test::TestBody() checkpoint_remover_test.cc:513 (ep-engine_ep_unit_tests:arm64+0x10054117c)
    #4 virtual thunk to CheckpointRemoverTest_MB59601_Test::TestBody() checkpoint_remover_test.cc (ep-engine_ep_unit_tests:arm64+0x100541448)
    #5 void testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) gtest.cc:2643 (ep-engine_ep_unit_tests:arm64+0x10195a8e0)
    #6 <null> <null> (0x000186e390e0)

  Previous read of size 8 at 0x00010d1a5e68 by thread T1 (mutexes: write M3):
    #0 CheckpointManager::getListOfCursorsToDrop() checkpoint_manager.cc:802 (ep-engine_ep_unit_tests:arm64+0x100372bdc)
    #1 CheckpointMemRecoveryTask::attemptCursorDropping() checkpoint_remover.cc:174 (ep-engine_ep_unit_tests:arm64+0x10037c710)
    #2 CheckpointMemRecoveryTask::runInner() checkpoint_remover.cc:291 (ep-engine_ep_unit_tests:arm64+0x10037d068)
    #3 NotifiableTask::run() notifiable_task.cc:18 (ep-engine_ep_unit_tests:arm64+0x101934ed8)
    #4 void* std::__1::__thread_proxy[abi:v160006]<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, CheckpointRemoverTest_MB59601_Test::TestBody()::$_3::operator()() const::'lambda0'()>>(void*) thread:299 (ep-engine_ep_unit_tests:arm64+0x1005661f0)

Change-Id: I7fe1ed1f6ebca811a5dfca6c2e69d04bfa91b2b8
Reviewed-on: https://review.couchbase.org/c/kv_engine/+/203991
Tested-by: Pavlos Georgiou <[email protected]>
Reviewed-by: Vesko Karaganev <[email protected]>
Reviewed-by: Paolo Cocchi <[email protected]>
Well-Formed: Restriction Checker
1 parent c8d9921 commit 3ec11d6

File tree

3 files changed

+90
-5
lines changed

3 files changed

+90
-5
lines changed

engines/ep/src/checkpoint_manager.cc

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,8 @@ std::vector<Cursor> CheckpointManager::getListOfCursorsToDrop() {
796796
? *cursors.at(backupPCursorName)
797797
: *persistenceCursor;
798798

799+
getListOfCursorsToDropHook();
800+
799801
for (const auto& pair : cursors) {
800802
const auto cursor = pair.second;
801803
// Note: Strict condition here.
@@ -1836,13 +1838,23 @@ void CheckpointManager::addStats(const AddStatFn& add_stat,
18361838
}
18371839

18381840
void CheckpointManager::takeAndResetCursors(CheckpointManager& other) {
1839-
pCursor = other.pCursor;
1840-
persistenceCursor = pCursor.lock().get();
1841-
for (auto& cursor : other.cursors) {
1842-
cursors[cursor.second->getName()] = cursor.second;
1841+
other.takeAndResetCursorsHook();
1842+
1843+
Cursor otherPCursor;
1844+
cursor_index otherCursors;
1845+
{
1846+
std::lock_guard<std::mutex> otherLH(other.queueLock);
1847+
otherPCursor = other.pCursor;
1848+
otherCursors = std::move(other.cursors);
1849+
other.cursors.clear();
18431850
}
1844-
other.cursors.clear();
18451851

1852+
std::lock_guard<std::mutex> lh(queueLock);
1853+
pCursor = std::move(otherPCursor);
1854+
persistenceCursor = pCursor.lock().get();
1855+
for (auto& cursor : otherCursors) {
1856+
cursors[cursor.second->getName()] = std::move(cursor.second);
1857+
}
18461858
resetCursors();
18471859
}
18481860

engines/ep/src/checkpoint_manager.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,14 @@ class CheckpointManager {
616616
// (and not yet re-acquired) the CM::lock. Introduced in MB-56644.
617617
TestingHook<> expelHook;
618618

619+
/// Testing hook called at the start of CM::takeAndResetCursors.
620+
/// Introduced in MB-59601.
621+
TestingHook<> takeAndResetCursorsHook;
622+
623+
/// Testing hook called just before iterating CM::cursors in
624+
/// CM::getListOfCursorsToDrop. Introduced in MB-59601.
625+
TestingHook<> getListOfCursorsToDropHook;
626+
619627
protected:
620628
/**
621629
* Checks if eager checkpoint removal is enabled, then checks if the

engines/ep/tests/module_tests/checkpoint_remover_test.cc

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "collections/vbucket_manifest_handles.h"
2323
#include "dcp/response.h"
2424
#include "test_helpers.h"
25+
#include "thread_gate.h"
2526
#include "vbucket.h"
2627

2728
void CheckpointRemoverTest::SetUp() {
@@ -449,6 +450,70 @@ TEST_P(CheckpointRemoverTest, MemRecoveryByCheckpointCreation) {
449450
EXPECT_EQ(0, store->getRequiredCheckpointMemoryReduction());
450451
}
451452

453+
// Without the fix, there is a data race in
454+
// CheckpointManager::takeAndResetCursors which did not take a queueLock,
455+
// and could mutate the CheckpointManager while it is being accessed,
456+
// e.g. in CheckpointManager::getListOfCursorsToDrop.
457+
TEST_P(CheckpointRemoverTest, MB59601) {
458+
if (!isPersistent()) {
459+
GTEST_SKIP();
460+
}
461+
462+
setVBucketStateAndRunPersistTask(vbid, vbucket_state_active);
463+
auto& config = engine->getConfiguration();
464+
config.setChkExpelEnabled(false);
465+
config.setMaxSize(100UL * 1024 * 1024);
466+
// Disable mem-based checkpoint creation in this test, otherwise we'd end up
467+
// doing straight CheckpointRemoval rather than ItemExpel/CursorDrop
468+
config.setCheckpointMaxSize(std::numeric_limits<size_t>::max());
469+
const auto chkptMemRecoveryLimit =
470+
config.getMaxSize() * store->getCheckpointMemoryRatio() *
471+
store->getCheckpointMemoryRecoveryUpperMark();
472+
auto& stats = engine->getEpStats();
473+
stats.mem_low_wat.store(1);
474+
475+
int numItems = 0;
476+
const std::string value(1024 * 1024, 'x');
477+
while (stats.getCheckpointManagerEstimatedMemUsage() <
478+
chkptMemRecoveryLimit) {
479+
auto docKey = "key_" + std::to_string(++numItems);
480+
store_item(vbid, makeStoredDocKey(docKey), value);
481+
}
482+
flushVBucketToDiskIfPersistent(vbid, numItems);
483+
484+
// VB needs to be replica to rollback
485+
store->setVBucketState(vbid, vbucket_state_replica);
486+
487+
EXPECT_GT(stats.getNumCheckpoints(), 0);
488+
EXPECT_GT(store->getRequiredCheckpointMemoryReduction(), 0);
489+
490+
/// Synchronises just before accessing and mutating CM::cursors
491+
ThreadGate tg(2);
492+
std::thread bgThread;
493+
494+
auto& oldManager = *store->getVBucket(vbid)->checkpointManager;
495+
oldManager.takeAndResetCursorsHook = [this, &tg, &bgThread]() {
496+
// Note: takeAndResetCursorsHook is executed *after* the new VBucket
497+
// has already been created
498+
499+
auto& newManager = *store->getVBucket(vbid)->checkpointManager;
500+
newManager.getListOfCursorsToDropHook = [&tg]() { tg.threadUp(); };
501+
bgThread = std::thread([this]() {
502+
auto remover = std::make_shared<CheckpointMemRecoveryTask>(
503+
engine.get(),
504+
engine->getEpStats(),
505+
engine->getConfiguration().getChkRemoverStime(),
506+
0);
507+
remover->run();
508+
});
509+
510+
tg.threadUp();
511+
};
512+
513+
store->rollback(vbid, 0);
514+
bgThread.join();
515+
}
516+
452517
// Test written for MB-36366. With the fix removed this test failed because
453518
// post expel, we continued onto cursor dropping.
454519
// MB-36447 - unreliable test, disabling for now

0 commit comments

Comments
 (0)