Skip to content

Commit adb670f

Browse files
Ravi Nagarjun AkellaRavi Nagarjun Akella
authored andcommitted
Fix the case where there are empty nodes during recovery
1 parent c385553 commit adb670f

File tree

6 files changed

+38
-90
lines changed

6 files changed

+38
-90
lines changed

src/include/homestore/btree/detail/btree_node.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
310310
auto prevKey = get_nth_key< K >(i - 1, false);
311311
auto curKey = get_nth_key< K >(i, false);
312312
if (prevKey.compare(curKey) >= 0) {
313-
DEBUG_ASSERT(false, "Order check failed at entry={}", i);
313+
DEBUG_ASSERT(false, "Order check failed at entry={}, btree id={}", i, node_id());
314314
return false;
315315
}
316316
}

src/include/homestore/index/index_table.hpp

Lines changed: 24 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -618,12 +618,12 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
618618
K last_child_last_key;
619619
K last_child_neighbor_key;
620620
BtreeNodePtr cur_child = child_node;
621+
auto sibling_first_child = true_sibling_first_child(parent_node);
622+
LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child);
621623

622624
// We find the last child node by starting from the leftmost child and traversing through the
623625
// next_bnode links until we reach the end or find a sibling first child.
624626
bool found_child = false;
625-
auto sibling_first_child = true_sibling_first_child(parent_node);
626-
LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child);
627627
while (cur_child != nullptr) {
628628
LOGTRACEMOD(wbcache, "Processing child node [{}]", cur_child->to_string());
629629
if (!cur_child->is_node_deleted() && cur_child->total_entries() > 0) {
@@ -685,7 +685,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
685685
LOGTRACEMOD(wbcache,
686686
"No undeleted child found for parent_node [{}], keep normal repair (regular recovery)",
687687
parent_node->to_string());
688-
next_cur_child = nullptr;
689688
}
690689
}
691690
}
@@ -700,6 +699,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
700699
// Walk across all child nodes until it gets the last_parent_key and keep fixing them.
701700
auto cur_parent = parent_node;
702701
BtreeNodeList new_parent_nodes;
702+
BtreeNodeList child_nodes_to_free;
703703
do {
704704
if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
705705
LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", child_node->to_string());
@@ -760,61 +760,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
760760
LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(),
761761
child_node->to_string(), child_last_key.to_string());
762762

763-
// Check if we are beyond the last child node.
764-
//
765-
// There can be cases where the child level merge is successfully persisted but the parent level is
766-
// not. In this case, you may have your rightmost child node with last key greater than the
767-
// last_parent_key. That's why here we have to check if the child node is one of the original child
768-
// nodes first.
769-
if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) {
770-
LOGTRACEMOD(
771-
wbcache,
772-
"Child node [{}] is not one of the original child nodes, so we need to check if it is beyond the "
773-
"last parent key {}",
774-
child_node->to_string(), last_parent_key.to_string());
775-
if (child_last_key.compare(last_parent_key) > 0) {
776-
// We have reached a child beyond this parent, we can stop now
777-
// TODO this case if child last key is less than last parent key to update the parent node.
778-
// this case can potentially break the btree for put and remove op.
779-
break;
780-
}
781-
if (child_node->total_entries() == 0) {
782-
// this child has no entries, but maybe in the middle of the parent node, we need to update the key
783-
// of parent as previous one and go on
784-
LOGTRACEMOD(wbcache,
785-
"Reach to an empty child node {}, and this child doesn't belong to this parent; Hence "
786-
"loop ends",
787-
child_node->to_string());
788-
// now update the next of parent node by skipping all deleted siblings of this parent node
789-
auto valid_sibling = cur_parent->next_bnode();
790-
while (valid_sibling != empty_bnodeid) {
791-
BtreeNodePtr sibling;
792-
if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) {
793-
if (sibling->is_node_deleted()) {
794-
valid_sibling = sibling->next_bnode();
795-
continue;
796-
}
797-
// cur_parent->set_next_bnode(sibling->node_id());
798-
break;
799-
}
800-
LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}",
801-
valid_sibling, cur_parent->to_string(), ret);
802-
}
803-
if (valid_sibling != empty_bnodeid) {
804-
cur_parent->set_next_bnode(valid_sibling);
805-
LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop",
806-
cur_parent->to_string(), child_node->to_string());
807-
808-
} else {
809-
cur_parent->set_next_bnode(empty_bnodeid);
810-
LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop",
811-
cur_parent->to_string(), child_node->to_string());
812-
}
813-
814-
break;
815-
}
816-
}
817-
818763
if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(),
819764
BtreeLinkInfo::get_fixed_size())) {
820765
// No room in the parent_node, let us split the parent_node and continue
@@ -836,22 +781,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
836781
cur_parent = std::move(new_parent);
837782
}
838783

784+
// If the child node is empty, we mark it as deleted and remove them after the repair loop
785+
if (child_node->total_entries() == 0) {
786+
LOGTRACEMOD(wbcache, "Found an empty child node {}, marking it deleted",
787+
child_node->to_string());
788+
child_node->set_node_deleted();
789+
child_nodes_to_free.push_back(child_node);
790+
// the links to the previous child node will be fixed in the next coe block
791+
}
792+
793+
839794
// Insert the last key of the child node into parent node
840795
if (!child_node->is_node_deleted()) {
841796
if (child_node->total_entries() == 0) {
842-
if (orig_child_infos.contains(child_node->node_id())) {
843-
child_last_key = orig_child_infos[child_node->node_id()];
844-
LOGTRACEMOD(wbcache,
845-
"Reach to an empty child node [{}], but not the end of the parent node, so we need "
846-
"to update the key of parent node as original one {}",
847-
child_node->to_string(), child_last_key.to_string());
848-
} else {
849-
LOGTRACEMOD(wbcache,
850-
"Reach to an empty child node [{}] but not belonging to this parent (probably next "
851-
"parent sibling); Hence end loop",
852-
child_node->to_string());
853-
break;
854-
}
797+
BT_LOG_ASSERT(child_node->total_entries() > 0,
798+
"Child node={} has 0 entries but is not marked deleted, parent_node={} repair is "
799+
"partial",
800+
child_node->node_id(), parent_node->node_id());
801+
ret = btree_status_t::not_found;
802+
break;
855803
}
856804
cur_parent->insert(cur_parent->total_entries(), child_last_key,
857805
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
@@ -873,13 +821,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
873821
IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get());
874822
idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN);
875823
write_node_impl(pre_child_node, cp_ctx);
876-
// update the key of last entry of the parent with the last key of deleted child
877-
child_last_key = orig_child_infos[child_node->node_id()];
878-
LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(),
879-
child_last_key.to_string());
880-
// update it here to go to the next child node and unlock this node
881-
LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node");
882-
child_node->set_next_bnode(child_node->next_bnode());
883824
}
884825
}
885826

@@ -914,6 +855,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
914855
child_node = nullptr;
915856
break;
916857
}
858+
917859
ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
918860
if (ret != btree_status_t::success) {
919861
BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}",
@@ -925,6 +867,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
925867
} while (true);
926868

927869
if (child_node) { this->unlock_node(child_node, locktype_t::READ); }
870+
871+
// free the deleted child nodes
872+
for (const auto& cnode : child_nodes_to_free) { free_node_impl(cnode, cp_ctx); }
873+
928874
// if last parent has the key less than the last child key, then we need to update the parent node with
929875
// the last child key if it doesn't have edge.
930876
auto last_parent = parent_node;

src/tests/btree_helpers/btree_test_helper.hpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,8 @@ struct BtreeTestHelper {
329329
do_range_remove(start_k, end_k, false /* removing_all_existing */);
330330
}
331331

332-
void range_remove_all(uint32_t start_k, uint32_t end_k) {
333-
do_range_remove(start_k, end_k, true /* removing_all_existing */);
332+
void range_remove_all(uint32_t start_k, uint32_t end_k, bool expect_success = true) {
333+
do_range_remove(start_k, end_k, true /* removing_all_existing */, expect_success);
334334
}
335335

336336
////////////////////// All query operation variants ///////////////////////////////
@@ -532,7 +532,7 @@ struct BtreeTestHelper {
532532
if (expect_success) { m_shadow_map.put_and_check(key, value, *existing_v, done); }
533533
}
534534

535-
void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing) {
535+
void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing, bool expect_success = true) {
536536
K start_key = K{start_k};
537537
K end_key = K{end_k};
538538

@@ -542,8 +542,10 @@ struct BtreeTestHelper {
542542

543543
if (all_existing) {
544544
m_shadow_map.range_erase(start_key, end_key);
545-
ASSERT_EQ((ret == btree_status_t::success), true)
546-
<< "not a successful remove op for range " << start_k << "-" << end_k;
545+
if (expect_success) {
546+
ASSERT_EQ(ret, btree_status_t::success)
547+
<< "not a successful remove op for range " << start_k << "-" << end_k;
548+
}
547549
} else if (start_k < m_max_range_input) {
548550
K end_range{std::min(end_k, uint64_cast(m_max_range_input - 1))};
549551
m_shadow_map.range_erase(start_key, end_range);

src/tests/btree_helpers/btree_test_kvs.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ class TestFixedKey : public BtreeKey {
8181
public:
8282
TestFixedKey() = default;
8383
TestFixedKey(uint64_t k) : m_key{k} {}
84-
TestFixedKey(const TestFixedKey& other) : TestFixedKey(other.serialize(), true) {}
84+
TestFixedKey(const TestFixedKey& other) : m_key{other.m_key} {}
8585
TestFixedKey(const BtreeKey& other) : TestFixedKey(other.serialize(), true) {}
8686
TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.cbytes()))} {}
8787
TestFixedKey& operator=(const TestFixedKey& other) = default;

src/tests/btree_helpers/shadow_map.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,13 @@ class ShadowMap {
7373
std::pair< K, K > pick_existing_range(const K& start_key, uint32_t max_count, pick_existing_range_cb_t cb) const {
7474
std::lock_guard lock{m_mutex};
7575
auto const start_it = m_map.lower_bound(start_key);
76+
auto end_it = start_it;
7677
auto it = start_it;
77-
uint32_t count = 0;
78-
while ((it != m_map.cend()) && (++count < max_count)) {
78+
for(uint32_t count = 0; count < max_count && it != m_map.cend(); ++count, ++it) {
7979
cb(it->first);
80-
++it;
80+
end_it = it;
8181
}
82-
return std::pair(start_it->first, it->first);
82+
return std::pair(start_it->first, end_it->first);
8383
}
8484

8585
uint32_t max_keys() const { return m_max_keys; }

src/tests/test_index_crash_recovery.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
437437
if (range_remove_keys) {
438438
auto [s_key, e_key] = *range_remove_keys;
439439
LOGDEBUG("Reapply: Range removing keys [{}, {})", s_key, e_key);
440-
this->range_remove_all(s_key, e_key);
440+
this->range_remove_all(s_key, e_key, false /* expect_success*/);
441441
}
442442
trigger_cp(true);
443443
}

0 commit comments

Comments
 (0)