Skip to content

Commit 94ca2cf

Browse files
Ravi Nagarjun AkellaRavi Nagarjun Akella
authored andcommitted
Fix the case where there are empty nodes during recovery
1 parent d78a92e commit 94ca2cf

File tree

6 files changed

+57
-97
lines changed

6 files changed

+57
-97
lines changed

src/include/homestore/btree/detail/btree_node.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
310310
auto prevKey = get_nth_key< K >(i - 1, false);
311311
auto curKey = get_nth_key< K >(i, false);
312312
if (prevKey.compare(curKey) >= 0) {
313-
DEBUG_ASSERT(false, "Order check failed at entry={}", i);
313+
DEBUG_ASSERT(false, "Order check failed at entry={}, btree id={}", i, node_id());
314314
return false;
315315
}
316316
}

src/include/homestore/index/index_table.hpp

Lines changed: 30 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -618,19 +618,24 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
618618
K last_child_last_key;
619619
K last_child_neighbor_key;
620620
BtreeNodePtr cur_child = child_node;
621+
auto sibling_first_child = true_sibling_first_child(parent_node);
622+
LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child);
621623

622624
// We find the last child node by starting from the leftmost child and traversing through the
623625
// next_bnode links until we reach the end or find a sibling first child.
624626
bool found_child = false;
625-
auto sibling_first_child = true_sibling_first_child(parent_node);
626-
LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child);
627627
while (cur_child != nullptr) {
628628
LOGTRACEMOD(wbcache, "Processing child node [{}]", cur_child->to_string());
629629
if (!cur_child->is_node_deleted() && cur_child->total_entries() > 0) {
630630
last_child_last_key = cur_child->get_last_key< K >();
631631
found_child = true;
632632
}
633633

634+
if (child_node->total_entries() == 0 && orig_child_infos.contains(child_node->node_id())) {
635+
last_child_last_key = orig_child_infos[child_node->node_id()];
636+
found_child = true;
637+
}
638+
634639
next_cur_child = nullptr;
635640
if (cur_child->next_bnode() == empty_bnodeid ||
636641
read_node_impl(cur_child->next_bnode(), next_cur_child) != btree_status_t::success) {
@@ -685,7 +690,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
685690
LOGTRACEMOD(wbcache,
686691
"No undeleted child found for parent_node [{}], keep normal repair (regular recovery)",
687692
parent_node->to_string());
688-
next_cur_child = nullptr;
689693
}
690694
}
691695
}
@@ -700,6 +704,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
700704
// Walk across all child nodes until it gets the last_parent_key and keep fixing them.
701705
auto cur_parent = parent_node;
702706
BtreeNodeList new_parent_nodes;
707+
BtreeNodeList child_nodes_to_free;
703708
do {
704709
if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
705710
LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", child_node->to_string());
@@ -760,61 +765,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
760765
LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(),
761766
child_node->to_string(), child_last_key.to_string());
762767

763-
// Check if we are beyond the last child node.
764-
//
765-
// There can be cases where the child level merge is successfully persisted but the parent level is
766-
// not. In this case, you may have your rightmost child node with last key greater than the
767-
// last_parent_key. That's why here we have to check if the child node is one of the original child
768-
// nodes first.
769-
if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) {
770-
LOGTRACEMOD(
771-
wbcache,
772-
"Child node [{}] is not one of the original child nodes, so we need to check if it is beyond the "
773-
"last parent key {}",
774-
child_node->to_string(), last_parent_key.to_string());
775-
if (child_last_key.compare(last_parent_key) > 0) {
776-
// We have reached a child beyond this parent, we can stop now
777-
// TODO this case if child last key is less than last parent key to update the parent node.
778-
// this case can potentially break the btree for put and remove op.
779-
break;
780-
}
781-
if (child_node->total_entries() == 0) {
782-
// this child has no entries, but maybe in the middle of the parent node, we need to update the key
783-
// of parent as previous one and go on
784-
LOGTRACEMOD(wbcache,
785-
"Reach to an empty child node {}, and this child doesn't belong to this parent; Hence "
786-
"loop ends",
787-
child_node->to_string());
788-
// now update the next of parent node by skipping all deleted siblings of this parent node
789-
auto valid_sibling = cur_parent->next_bnode();
790-
while (valid_sibling != empty_bnodeid) {
791-
BtreeNodePtr sibling;
792-
if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) {
793-
if (sibling->is_node_deleted()) {
794-
valid_sibling = sibling->next_bnode();
795-
continue;
796-
}
797-
// cur_parent->set_next_bnode(sibling->node_id());
798-
break;
799-
}
800-
LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}",
801-
valid_sibling, cur_parent->to_string(), ret);
802-
}
803-
if (valid_sibling != empty_bnodeid) {
804-
cur_parent->set_next_bnode(valid_sibling);
805-
LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop",
806-
cur_parent->to_string(), child_node->to_string());
807-
808-
} else {
809-
cur_parent->set_next_bnode(empty_bnodeid);
810-
LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop",
811-
cur_parent->to_string(), child_node->to_string());
812-
}
813-
814-
break;
815-
}
816-
}
817-
818768
if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(),
819769
BtreeLinkInfo::get_fixed_size())) {
820770
// No room in the parent_node, let us split the parent_node and continue
@@ -836,22 +786,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
836786
cur_parent = std::move(new_parent);
837787
}
838788

789+
// If the child node is empty, we mark it as deleted and remove them after the repair loop
790+
if (child_node->total_entries() == 0) {
791+
LOGTRACEMOD(wbcache, "Found an empty child node {}, marking it deleted",
792+
child_node->to_string());
793+
child_node->set_node_deleted();
794+
child_nodes_to_free.push_back(child_node);
795+
// the links to the previous child node will be fixed in the next code block
796+
}
797+
798+
839799
// Insert the last key of the child node into parent node
840800
if (!child_node->is_node_deleted()) {
841801
if (child_node->total_entries() == 0) {
842-
if (orig_child_infos.contains(child_node->node_id())) {
843-
child_last_key = orig_child_infos[child_node->node_id()];
844-
LOGTRACEMOD(wbcache,
845-
"Reach to an empty child node [{}], but not the end of the parent node, so we need "
846-
"to update the key of parent node as original one {}",
847-
child_node->to_string(), child_last_key.to_string());
848-
} else {
849-
LOGTRACEMOD(wbcache,
850-
"Reach to an empty child node [{}] but not belonging to this parent (probably next "
851-
"parent sibling); Hence end loop",
852-
child_node->to_string());
853-
break;
854-
}
802+
BT_LOG_ASSERT(false,
803+
"Child node [{}] is empty but not deleted and not an edge, while it doesn't "
804+
"belong to this parent node {}",
805+
child_node->to_string(), parent_node->to_string());
806+
ret = btree_status_t::not_found;
807+
break;
855808
}
856809
cur_parent->insert(cur_parent->total_entries(), child_last_key,
857810
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
@@ -873,13 +826,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
873826
IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get());
874827
idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN);
875828
write_node_impl(pre_child_node, cp_ctx);
876-
// update the key of last entry of the parent with the last key of deleted child
877-
child_last_key = orig_child_infos[child_node->node_id()];
878-
LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(),
879-
child_last_key.to_string());
880-
// update it here to go to the next child node and unlock this node
881-
LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node");
882-
child_node->set_next_bnode(child_node->next_bnode());
883829
}
884830
}
885831

@@ -914,6 +860,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
914860
child_node = nullptr;
915861
break;
916862
}
863+
917864
ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
918865
if (ret != btree_status_t::success) {
919866
BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}",
@@ -925,6 +872,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
925872
} while (true);
926873

927874
if (child_node) { this->unlock_node(child_node, locktype_t::READ); }
875+
876+
// free the deleted child nodes
877+
for (const auto& cnode : child_nodes_to_free) { free_node_impl(cnode, cp_ctx); }
878+
928879
// if last parent has the key less than the last child key, then we need to update the parent node with
929880
// the last child key if it doesn't have edge.
930881
auto last_parent = parent_node;
@@ -996,7 +947,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
996947
}
997948

998949
if (sibling_node->is_node_deleted()) {
999-
LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}",
950+
LOGTRACEMOD(wbcache, "Sibling node [{}] is not the true sibling for parent_node {}",
1000951
sibling_node->to_string(), node->to_string());
1001952
return find_true_sibling(sibling_node);
1002953
} else {

src/tests/btree_helpers/btree_test_helper.hpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,8 @@ struct BtreeTestHelper {
329329
do_range_remove(start_k, end_k, false /* removing_all_existing */);
330330
}
331331

332-
void range_remove_all(uint32_t start_k, uint32_t end_k) {
333-
do_range_remove(start_k, end_k, true /* removing_all_existing */);
332+
void range_remove_all(uint32_t start_k, uint32_t end_k, bool expect_success = true) {
333+
do_range_remove(start_k, end_k, true /* removing_all_existing */, expect_success);
334334
}
335335

336336
////////////////////// All query operation variants ///////////////////////////////
@@ -532,7 +532,7 @@ struct BtreeTestHelper {
532532
if (expect_success) { m_shadow_map.put_and_check(key, value, *existing_v, done); }
533533
}
534534

535-
void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing) {
535+
void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing, bool expect_success = true) {
536536
K start_key = K{start_k};
537537
K end_key = K{end_k};
538538

@@ -542,8 +542,10 @@ struct BtreeTestHelper {
542542

543543
if (all_existing) {
544544
m_shadow_map.range_erase(start_key, end_key);
545-
ASSERT_EQ((ret == btree_status_t::success), true)
546-
<< "not a successful remove op for range " << start_k << "-" << end_k;
545+
if (expect_success) {
546+
ASSERT_EQ(ret, btree_status_t::success)
547+
<< "not a successful remove op for range " << start_k << "-" << end_k;
548+
}
547549
} else if (start_k < m_max_range_input) {
548550
K end_range{std::min(end_k, uint64_cast(m_max_range_input - 1))};
549551
m_shadow_map.range_erase(start_key, end_range);

src/tests/btree_helpers/btree_test_kvs.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ class TestFixedKey : public BtreeKey {
8181
public:
8282
TestFixedKey() = default;
8383
TestFixedKey(uint64_t k) : m_key{k} {}
84-
TestFixedKey(const TestFixedKey& other) : TestFixedKey(other.serialize(), true) {}
84+
TestFixedKey(const TestFixedKey& other) : m_key{other.m_key} {}
8585
TestFixedKey(const BtreeKey& other) : TestFixedKey(other.serialize(), true) {}
8686
TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.cbytes()))} {}
8787
TestFixedKey& operator=(const TestFixedKey& other) = default;

src/tests/btree_helpers/shadow_map.hpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,17 @@ class ShadowMap {
7070
return std::pair(start_it->first, it->first);
7171
}
7272

73-
std::pair< K, K > pick_existing_range(const K& start_key, uint32_t max_count, pick_existing_range_cb_t cb) const {
73+
std::optional<std::pair< K, K > > pick_existing_range(const K& start_key, uint32_t max_count, pick_existing_range_cb_t cb) const {
7474
std::lock_guard lock{m_mutex};
7575
auto const start_it = m_map.lower_bound(start_key);
76+
if (start_it == m_map.cend()) { return std::nullopt; }
77+
auto end_it = start_it;
7678
auto it = start_it;
77-
uint32_t count = 0;
78-
while ((it != m_map.cend()) && (++count < max_count)) {
79+
for(uint32_t count = 0; count < max_count && it != m_map.cend(); ++count, ++it) {
7980
cb(it->first);
80-
++it;
81+
end_it = it;
8182
}
82-
return std::pair(start_it->first, it->first);
83+
return std::pair(start_it->first, end_it->first);
8384
}
8485

8586
uint32_t max_keys() const { return m_max_keys; }

src/tests/test_index_crash_recovery.cpp

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
437437
if (range_remove_keys) {
438438
auto [s_key, e_key] = *range_remove_keys;
439439
LOGDEBUG("Reapply: Range removing keys [{}, {})", s_key, e_key);
440-
this->range_remove_all(s_key, e_key);
440+
this->range_remove_all(s_key, e_key, false /* expect_success*/);
441441
}
442442
trigger_cp(true);
443443
}
@@ -555,11 +555,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
555555

556556
uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); }
557557

558-
std::pair<uint32_t, uint32_t> range_remove_op(SequenceGenerator& generator, uint32_t range_remove_count) {
558+
std::optional< std::pair< uint32_t, uint32_t > > range_remove_op(SequenceGenerator& generator, uint32_t range_remove_count) {
559559
uint32_t key = generator.getKeyDistribution()(g_re);
560-
auto [start_key, end_key] = this->m_shadow_map.pick_existing_range(K{key}, range_remove_count,
560+
auto range_opt = this->m_shadow_map.pick_existing_range(K{key}, range_remove_count,
561561
[&generator](const K& key) { generator.getKeyStates()[key.key()] = false; generator.getInUseKeyCount().fetch_sub(1); });
562-
return {start_key.key(), end_key.key()};
562+
std::optional< std::pair< uint32_t, uint32_t > > ret = std::nullopt;
563+
if (range_opt) {
564+
ret = std::make_pair(range_opt->first.key(), range_opt->second.key());
565+
}
566+
return ret;
563567
}
564568

565569
void long_running_crash(long_running_crash_options const& crash_test_options) {
@@ -740,8 +744,10 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
740744
if (crash_test_options.range_remove_) {
741745
// add one range remove operation
742746
range_remove_keys = this->range_remove_op(generator, crash_test_options.num_entries_per_rounds);
743-
LOGDEBUG("Range removing keys [{}, {})", range_remove_keys->first, range_remove_keys->second);
744-
this->range_remove_all(range_remove_keys->first, range_remove_keys->second);
747+
if (range_remove_keys) {
748+
LOGDEBUG("Range removing keys [{}, {})", range_remove_keys->first, range_remove_keys->second);
749+
this->range_remove_all(range_remove_keys->first, range_remove_keys->second);
750+
}
745751
}
746752
if (normal_execution) {
747753
if (clean_shutdown) {

0 commit comments

Comments
 (0)