Skip to content

Commit cfae3f9

Browse files
BenHuddlestondaverigby
authored andcommitted
MB-42918: Do not replace committed item with pending at add
Currently when we do an add we can replace certain committed items with pending ones. One of these cases is a when we have an unpersisted delete. If we replace this with a pending StoredValue then we allow a subsequent get to find no committed StoredValue in the HashTable and potentially do a BgFetch. This BgFetch may return the alive document back to the HashTable/client which it should not do. Fix this by creating a new StoredValue if we would have replace a committed one with a prepared one. Change-Id: I85bb2b249afa8894c594d9d4eaed056a7fe7e833 Reviewed-on: http://review.couchbase.org/c/kv_engine/+/140592 Reviewed-by: Dave Rigby <[email protected]> Tested-by: Build Bot <[email protected]>
1 parent cce6303 commit cfae3f9

File tree

2 files changed

+71
-14
lines changed

2 files changed

+71
-14
lines changed

engines/ep/src/vbucket.cc

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3033,19 +3033,8 @@ void VBucket::deletedOnDiskCbk(const Item& queuedItem, bool deleted) {
30333033
* be done to ensure that the item count is accurate in the case of full
30343034
* eviction. We should only decrement the counter for committed (via
30353035
* mutation or commit) items as we only increment for these.
3036-
* Note this is done irrespective of if a StoredValue was found in the
3037-
* HashTable - the SV may have already been replaced (e.g. with
3038-
* a pending SyncWrite). We "know" the delete happened on disk thus
3039-
* should decrement the total number of items.
3040-
*
3041-
* @todo: About "the SV may have already been replaced (e.g. with
3042-
* a pending SyncWrite)" above.. Need to check if that is true. If it
3043-
* is, that shouldn't happen as we may end up with this at FullEviction:
3044-
* - deleted key queued for persistence and removed from the HT
3045-
* - deletion not persisted yet
3046-
* - frontend read bg-fecthes a previous alive state (if any) from disk
30473036
*/
3048-
if (queuedItem.isCommitted()) {
3037+
if (v && queuedItem.isCommitted()) {
30493038
decrNumTotalItems();
30503039
++opsDelete;
30513040
}
@@ -3470,7 +3459,16 @@ std::pair<AddStatus, std::optional<VBNotifyCtx>> VBucket::processAdd(
34703459

34713460
std::pair<AddStatus, VBNotifyCtx> rv = {AddStatus::Success, {}};
34723461

3473-
if (v) {
3462+
// We cannot replace a committed SV with a pending one. If we were to do
3463+
// so then a delete that has not yet been persisted could be replaced
3464+
// with a prepare. A subsequent get could trigger a bg fetch that may
3465+
// return the old (not deleted) document from disk if it runs before the
3466+
// flusher. As such, we must keep the unpersisted delete and add a new
3467+
// prepare for the SyncWrite. Any get will see the unpersisted delete
3468+
// and return KEYNOENT.
3469+
auto replacingCommittedItemWithPending =
3470+
v && v == htRes.committed && itm.isPending();
3471+
if (v && !replacingCommittedItemWithPending) {
34743472
if (v->isTempInitialItem() && eviction == EvictionPolicy::Full &&
34753473
maybeKeyExists) {
34763474
// Need to figure out if an item exists on disk
@@ -3494,7 +3492,8 @@ std::pair<AddStatus, std::optional<VBNotifyCtx>> VBucket::processAdd(
34943492
std::tie(v, std::ignore, rv.second) =
34953493
updateStoredValue(htRes.getHBL(), *v, itm, queueItmCtx);
34963494
} else {
3497-
if (itm.getBySeqno() != StoredValue::state_temp_init) {
3495+
if (itm.getBySeqno() != StoredValue::state_temp_init &&
3496+
!replacingCommittedItemWithPending) {
34983497
if (eviction == EvictionPolicy::Full && maybeKeyExists) {
34993498
return {AddStatus::AddTmpAndBgFetch, VBNotifyCtx()};
35003499
}
@@ -3514,6 +3513,8 @@ std::pair<AddStatus, std::optional<VBNotifyCtx>> VBucket::processAdd(
35143513

35153514
if (v->isTempItem()) {
35163515
rv.first = AddStatus::BgFetch;
3516+
} else if (replacingCommittedItemWithPending) {
3517+
rv.first = AddStatus::UnDel;
35173518
}
35183519
}
35193520

engines/ep/tests/module_tests/evp_store_test.cc

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,62 @@ TEST_P(EPBucketFullEvictionTest, ExpiryFindsPrepareWithSameCas) {
974974
}
975975
}
976976

977+
TEST_P(EPBucketFullEvictionTest, UnDelWithPrepare) {
978+
setVBucketStateAndRunPersistTask(
979+
vbid,
980+
vbucket_state_active,
981+
{{"topology", nlohmann::json::array({{"active", "replica"}})}});
982+
983+
// 1) Store, delete, and persist the item. We need to do this to ensure that
984+
// the bloom filter tells us to go to disk when we try gets if there is
985+
// no delete in the HashTable.
986+
auto key = makeStoredDocKey("key");
987+
storeAndDeleteItem(vbid, key, "value");
988+
auto vb = store->getVBucket(vbid);
989+
EXPECT_EQ(0, vb->getNumItems());
990+
991+
// 1) Store the new item and persist
992+
store_item(vbid, key, "value");
993+
flushVBucketToDiskIfPersistent(vbid, 1);
994+
EXPECT_EQ(1, vb->getNumItems());
995+
996+
// 2) Store the new item but don't persist it yet. We want to test what
997+
// happens when it's dirty.
998+
delete_item(vbid, key);
999+
1000+
// 1 because we haven't persisted the delete yet
1001+
EXPECT_EQ(1, vb->getNumItems());
1002+
1003+
// 3) Get now returns not found
1004+
auto options = static_cast<get_options_t>(
1005+
QUEUE_BG_FETCH | HONOR_STATES | TRACK_REFERENCE | DELETE_TEMP |
1006+
HIDE_LOCKED_CAS | TRACK_STATISTICS);
1007+
auto gv = getInternal(key, vbid, cookie, ForGetReplicaOp::No, options);
1008+
EXPECT_EQ(ENGINE_KEY_ENOENT, gv.getStatus());
1009+
1010+
// 4) Add prepare
1011+
auto prepare = makePendingItem(key, "value");
1012+
EXPECT_EQ(ENGINE_SYNC_WRITE_PENDING, addItem(*prepare, cookie));
1013+
1014+
// 1 because we haven't persisted the delete yet
1015+
EXPECT_EQ(1, vb->getNumItems());
1016+
1017+
// 5) Check that the HashTable state is now correct
1018+
{
1019+
auto htRes = vb->ht.findForUpdate(key);
1020+
ASSERT_TRUE(htRes.committed);
1021+
EXPECT_TRUE(htRes.committed->isDeleted());
1022+
EXPECT_TRUE(htRes.pending);
1023+
}
1024+
1025+
// @TODO RDB: Rocks item counting is broken and overcounts assuming
1026+
// everything is a new item
1027+
if (!isRocksDB()) {
1028+
flushVBucketToDiskIfPersistent(vbid, 2);
1029+
EXPECT_EQ(0, vb->getNumItems());
1030+
}
1031+
}
1032+
9771033
/**
9781034
* Verify that when getIf is used it only fetches the metdata from disk for
9791035
* the filter, and not the complete document.

0 commit comments

Comments
 (0)