Skip to content

Commit c90d7ae

Browse files
xxhdx1985126cyx1231st
authored andcommitted
crimson/os/seastore: adjust and fix rebalance logic in lba/backref/omap tree
* For fixed_kv_btree, drop `prefer_left`. * For omap btree, tolerate underflow nodes if rebalance has the same results. Fixes: https://tracker.ceph.com/issues/71307 Signed-off-by: Xuehan Xu <[email protected]> Signed-off-by: Yingxin Cheng <[email protected]>
1 parent 50e9ded commit c90d7ae

File tree

10 files changed

+68
-67
lines changed

10 files changed

+68
-67
lines changed

src/crimson/common/fixed_kv_node_layout.h

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -560,34 +560,33 @@ class FixedKVNodeLayout {
560560

561561
static uint32_t get_balance_pivot_idx(
562562
const FixedKVNodeLayout &left,
563-
const FixedKVNodeLayout &right,
564-
bool prefer_left)
563+
const FixedKVNodeLayout &right)
565564
{
566-
auto total = left.get_size() + right.get_size();
565+
auto l_size = left.get_size();
566+
auto r_size = right.get_size();
567+
auto total = l_size + r_size;
567568
auto pivot_idx = total / 2;
568-
if (total % 2 && prefer_left) {
569-
pivot_idx++;
570-
}
569+
assert(pivot_idx > std::min(l_size, r_size)
570+
&& pivot_idx < std::max(l_size, r_size));
571571
return pivot_idx;
572572
}
573573

574574
/**
575575
* balance_into_new_nodes
576576
*
577577
* Takes the contents of left and right and copies them into
578-
* replacement_left and replacement_right such that in the
579-
* event that the number of elements is odd the extra goes to
580-
* the left side iff prefer_left.
578+
* replacement_left and replacement_right such that the size
579+
* of both are balanced.
581580
*/
582581
static K balance_into_new_nodes(
583582
const FixedKVNodeLayout &left,
584583
const FixedKVNodeLayout &right,
585-
bool prefer_left,
584+
uint32_t pivot_idx,
586585
FixedKVNodeLayout &replacement_left,
587586
FixedKVNodeLayout &replacement_right)
588587
{
588+
assert(pivot_idx != left.get_size() && pivot_idx != right.get_size());
589589
auto total = left.get_size() + right.get_size();
590-
auto pivot_idx = get_balance_pivot_idx(left, right, prefer_left);
591590
auto replacement_pivot = pivot_idx >= left.get_size() ?
592591
right.iter_idx(pivot_idx - left.get_size())->get_key() :
593592
left.iter_idx(pivot_idx)->get_key();

src/crimson/os/seastore/backref/backref_tree_node.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ class BackrefLeafNode
173173
Transaction &t,
174174
BackrefLeafNode &left,
175175
BackrefLeafNode &right,
176-
bool prefer_left,
176+
uint32_t pivot_idx,
177177
BackrefLeafNode &replacement_left,
178178
BackrefLeafNode &replacement_right) final {}
179179

@@ -191,7 +191,7 @@ class BackrefLeafNode
191191
Transaction &t,
192192
BackrefLeafNode &left,
193193
BackrefLeafNode &right,
194-
bool prefer_left,
194+
uint32_t pivot_idx,
195195
BackrefLeafNode &replacement_left,
196196
BackrefLeafNode &replacement_right) final {}
197197
// backref leaf nodes don't have to resolve relative addresses

src/crimson/os/seastore/btree/fixed_kv_btree.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2064,12 +2064,13 @@ class FixedKVBtree {
20642064
c.cache.retire_extent(c.trans, r);
20652065
get_tree_stats<self_type>(c.trans).extents_num_delta--;
20662066
} else {
2067+
auto pivot_idx = l->get_balance_pivot_idx(*l, *r);
20672068
LOG_PREFIX(FixedKVBtree::merge_level);
20682069
auto [replacement_l, replacement_r, pivot] =
20692070
l->make_balanced(
20702071
c,
20712072
r,
2072-
!donor_is_left);
2073+
pivot_idx);
20732074

20742075
parent_pos.node->update(
20752076
liter,

src/crimson/os/seastore/btree/fixed_kv_node.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ struct FixedKVInternalNode
341341
make_balanced(
342342
op_context_t<NODE_KEY> c,
343343
Ref &_right,
344-
bool prefer_left) {
344+
uint32_t pivot_idx) {
345345
ceph_assert(_right->get_type() == this->get_type());
346346
auto &right = *_right->template cast<node_type_t>();
347347
auto replacement_left = c.cache.template alloc_new_non_data_extent<node_type_t>(
@@ -353,13 +353,13 @@ struct FixedKVInternalNode
353353
c.trans,
354354
static_cast<node_type_t&>(*this),
355355
right,
356-
prefer_left,
356+
pivot_idx,
357357
*replacement_left,
358358
*replacement_right);
359359
auto pivot = this->balance_into_new_nodes(
360360
*this,
361361
right,
362-
prefer_left,
362+
pivot_idx,
363363
*replacement_left,
364364
*replacement_right);
365365
replacement_left->range = replacement_left->get_meta();
@@ -368,7 +368,7 @@ struct FixedKVInternalNode
368368
c.trans,
369369
static_cast<node_type_t&>(*this),
370370
right,
371-
prefer_left,
371+
pivot_idx,
372372
*replacement_left,
373373
*replacement_right);
374374
return std::make_tuple(
@@ -688,22 +688,22 @@ struct FixedKVLeafNode
688688
Transaction &t,
689689
node_type_t &left,
690690
node_type_t &right,
691-
bool prefer_left,
691+
uint32_t pivot_idx,
692692
node_type_t &replacement_left,
693693
node_type_t &replacement_right) = 0;
694694
virtual void adjust_copy_src_dest_on_balance(
695695
Transaction &t,
696696
node_type_t &left,
697697
node_type_t &right,
698-
bool prefer_left,
698+
uint32_t pivot_idx,
699699
node_type_t &replacement_left,
700700
node_type_t &replacement_right) = 0;
701701

702702
std::tuple<Ref, Ref, NODE_KEY>
703703
make_balanced(
704704
op_context_t<NODE_KEY> c,
705705
Ref &_right,
706-
bool prefer_left) {
706+
uint32_t pivot_idx) {
707707
ceph_assert(_right->get_type() == this->get_type());
708708
auto &right = *_right->template cast<node_type_t>();
709709
auto replacement_left = c.cache.template alloc_new_non_data_extent<node_type_t>(
@@ -715,13 +715,13 @@ struct FixedKVLeafNode
715715
c.trans,
716716
static_cast<node_type_t&>(*this),
717717
right,
718-
prefer_left,
718+
pivot_idx,
719719
*replacement_left,
720720
*replacement_right);
721721
auto pivot = this->balance_into_new_nodes(
722722
*this,
723723
right,
724-
prefer_left,
724+
pivot_idx,
725725
*replacement_left,
726726
*replacement_right);
727727
replacement_left->range = replacement_left->get_meta();
@@ -730,7 +730,7 @@ struct FixedKVLeafNode
730730
c.trans,
731731
static_cast<node_type_t&>(*this),
732732
right,
733-
prefer_left,
733+
pivot_idx,
734734
*replacement_left,
735735
*replacement_right);
736736
return std::make_tuple(

src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -304,21 +304,21 @@ struct LBALeafNode
304304
Transaction &t,
305305
LBALeafNode &left,
306306
LBALeafNode &right,
307-
bool prefer_left,
307+
uint32_t pivot_idx,
308308
LBALeafNode &replacement_left,
309309
LBALeafNode &replacement_right) final {
310310
this->balance_child_ptrs(
311-
t, left, right, prefer_left, replacement_left, replacement_right);
311+
t, left, right, pivot_idx, replacement_left, replacement_right);
312312
}
313313
void adjust_copy_src_dest_on_balance(
314314
Transaction &t,
315315
LBALeafNode &left,
316316
LBALeafNode &right,
317-
bool prefer_left,
317+
uint32_t pivot_idx,
318318
LBALeafNode &replacement_left,
319319
LBALeafNode &replacement_right) final {
320320
this->parent_node_t::adjust_copy_src_dest_on_balance(
321-
t, left, right, prefer_left, replacement_left, replacement_right);
321+
t, left, right, pivot_idx, replacement_left, replacement_right);
322322
}
323323

324324
CachedExtentRef duplicate_for_write(Transaction&) final {

src/crimson/os/seastore/linked_tree_node.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -632,14 +632,14 @@ class ParentNode {
632632
Transaction &t,
633633
T &left,
634634
T &right,
635-
bool prefer_left,
635+
uint32_t pivot_idx,
636636
T &replacement_left,
637637
T &replacement_right)
638638
{
639639
size_t l_size = left.get_size();
640640
size_t r_size = right.get_size();
641-
size_t pivot_idx = T::get_balance_pivot_idx(left, right, prefer_left);
642641

642+
ceph_assert(pivot_idx != l_size && pivot_idx != r_size);
643643
replacement_left.maybe_expand_children(pivot_idx);
644644
replacement_right.maybe_expand_children(r_size + l_size - pivot_idx);
645645

@@ -678,17 +678,11 @@ class ParentNode {
678678
Transaction &t,
679679
T &left,
680680
T &right,
681-
bool prefer_left,
681+
uint32_t pivot_idx,
682682
T &replacement_left,
683683
T &replacement_right)
684684
{
685685
size_t l_size = left.get_size();
686-
size_t r_size = right.get_size();
687-
size_t total = l_size + r_size;
688-
size_t pivot_idx = (l_size + r_size) / 2;
689-
if (total % 2 && prefer_left) {
690-
pivot_idx++;
691-
}
692686

693687
if (left.is_initial_pending()) {
694688
for (auto &cs : left.copy_sources) {

src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ struct OMapNode : LogicalChildNode {
101101

102102
using make_balanced_iertr = base_iertr;
103103
using make_balanced_ret = make_balanced_iertr::future
104-
<std::tuple<OMapNodeRef, OMapNodeRef, std::string>>;
104+
<std::tuple<OMapNodeRef, OMapNodeRef, std::optional<std::string>>>;
105105
virtual make_balanced_ret make_balanced(
106106
omap_context_t oc,
107107
OMapNodeRef _right) = 0;

src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -394,18 +394,24 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
394394
LOG_PREFIX(OMapInnerNode::make_balanced);
395395
DEBUGT("l: {}, r: {}", oc.t, *this, *_right);
396396
ceph_assert(_right->get_type() == TYPE);
397+
auto &right = *_right->cast<OMapInnerNode>();
398+
auto pivot_idx = get_balance_pivot_idx(*this, right);
399+
if (!pivot_idx) {
400+
return make_balanced_ret(
401+
interruptible::ready_future_marker{},
402+
std::make_tuple(OMapNodeRef{}, OMapNodeRef{}, std::nullopt));
403+
}
397404
return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
398405
OMAP_INNER_BLOCK_SIZE, 2)
399-
.si_then([this, _right, oc] (auto &&replacement_pair){
406+
.si_then([this, &right, pivot_idx, oc] (auto &&replacement_pair){
400407
auto replacement_left = replacement_pair.front();
401408
auto replacement_right = replacement_pair.back();
402-
auto &right = *_right->cast<OMapInnerNode>();
403-
this->balance_child_ptrs(oc.t, *this, right, true,
409+
this->balance_child_ptrs(oc.t, *this, right, *pivot_idx,
404410
*replacement_left, *replacement_right);
405411
return make_balanced_ret(
406412
interruptible::ready_future_marker{},
407413
std::make_tuple(replacement_left, replacement_right,
408-
balance_into_new_nodes(*this, right,
414+
balance_into_new_nodes(*this, right, *pivot_idx,
409415
*replacement_left, *replacement_right)));
410416
}).handle_error_interruptible(
411417
crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
@@ -490,8 +496,14 @@ OMapInnerNode::merge_entry(
490496
).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) {
491497
LOG_PREFIX(OMapInnerNode::merge_entry);
492498
auto [replacement_l, replacement_r, replacement_pivot] = tuple;
493-
replacement_l->init_range(l->get_begin(), replacement_pivot);
494-
replacement_r->init_range(replacement_pivot, r->get_end());
499+
if (!replacement_pivot) {
500+
return merge_entry_ret(
501+
interruptible::ready_future_marker{},
502+
mutation_result_t(mutation_status_t::SUCCESS,
503+
std::nullopt, std::nullopt));
504+
}
505+
replacement_l->init_range(l->get_begin(), *replacement_pivot);
506+
replacement_r->init_range(*replacement_pivot, r->get_end());
495507
DEBUGT("to update parent: {} {} {}",
496508
oc.t, *this, *replacement_l, *replacement_r);
497509
if (get_meta().depth > 2) { // l and r are inner nodes
@@ -511,7 +523,7 @@ OMapInnerNode::merge_entry(
511523
liter,
512524
replacement_l->get_laddr(),
513525
maybe_get_delta_buffer());
514-
bool overflow = extent_will_overflow(replacement_pivot.size(),
526+
bool overflow = extent_will_overflow(replacement_pivot->size(),
515527
std::nullopt);
516528
if (!overflow) {
517529
this->update_child_ptr(
@@ -521,7 +533,7 @@ OMapInnerNode::merge_entry(
521533
journal_inner_insert(
522534
riter,
523535
replacement_r->get_laddr(),
524-
replacement_pivot,
536+
*replacement_pivot,
525537
maybe_get_delta_buffer());
526538
std::vector<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()};
527539
return dec_ref(oc, dec_laddrs
@@ -539,7 +551,7 @@ OMapInnerNode::merge_entry(
539551
this->remove_child_ptr(riter.get_offset());
540552
journal_inner_remove(riter, maybe_get_delta_buffer());
541553
return make_split_insert(
542-
oc, riter, replacement_pivot, replacement_r
554+
oc, riter, *replacement_pivot, replacement_r
543555
).si_then([this, oc, l = l, r = r](auto mresult) {
544556
std::vector<laddr_t> dec_laddrs{
545557
l->get_laddr(),

src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -738,11 +738,9 @@ class StringKVInnerNodeLayout {
738738
set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
739739
}
740740

741-
static uint32_t get_balance_pivot_idx(
741+
static std::optional<uint32_t> get_balance_pivot_idx(
742742
const StringKVInnerNodeLayout &left,
743-
const StringKVInnerNodeLayout &right,
744-
// this is just for the consistency with linked tree nodes
745-
bool prefer_left = false)
743+
const StringKVInnerNodeLayout &right)
746744
{
747745
uint32_t left_size = omap_inner_key_t(
748746
left.get_node_key_ptr()[left.get_size()-1]).key_off;
@@ -773,7 +771,9 @@ class StringKVInnerNodeLayout {
773771
}
774772
}
775773
}
776-
return pivot_idx;
774+
return pivot_idx == left.get_size()
775+
? std::nullopt
776+
: std::make_optional<uint32_t>(pivot_idx);
777777
}
778778

779779
/**
@@ -786,14 +786,15 @@ class StringKVInnerNodeLayout {
786786
static std::string balance_into_new_nodes(
787787
const StringKVInnerNodeLayout &left,
788788
const StringKVInnerNodeLayout &right,
789+
uint32_t pivot_idx,
789790
StringKVInnerNodeLayout &replacement_left,
790791
StringKVInnerNodeLayout &replacement_right)
791792
{
793+
ceph_assert(!(left.below_min() && right.below_min()));
792794
uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
793795
uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
794796
uint32_t total = left_size + right_size;
795797
uint32_t pivot_size = total / 2;
796-
uint32_t pivot_idx = get_balance_pivot_idx(left, right);
797798

798799
auto replacement_pivot = pivot_idx >= left.get_size() ?
799800
right.iter_idx(pivot_idx - left.get_size())->get_key() :

0 commit comments

Comments
 (0)