Skip to content

Commit 3b3242c

Browse files
authored
Merge pull request ceph#57709 from xxhdx1985126/wip-seastore-fix-parent_invalid-mappings
crimson/os/seastore: avoid getting wrong logical extents through "parent-invalid" lba mappings Reviewed-by: Yingxin Cheng <[email protected]>
2 parents 5ca380a + 4ad6f86 commit 3b3242c

File tree

11 files changed

+327
-81
lines changed

11 files changed

+327
-81
lines changed

src/crimson/os/seastore/btree/btree_range_pin.cc

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@ get_child_ret_t<LogicalCachedExtent>
1111
BtreeNodeMapping<key_t, val_t>::get_logical_extent(
1212
Transaction &t)
1313
{
14-
assert(parent);
15-
assert(parent->is_valid());
14+
ceph_assert(is_parent_valid());
1615
assert(pos != std::numeric_limits<uint16_t>::max());
1716
ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id());
1817
auto &p = (FixedKVNode<key_t>&)*parent;
@@ -29,21 +28,25 @@ BtreeNodeMapping<key_t, val_t>::get_logical_extent(
2928
template <typename key_t, typename val_t>
3029
bool BtreeNodeMapping<key_t, val_t>::is_stable() const
3130
{
32-
assert(parent);
33-
assert(parent->is_valid());
31+
assert(!this->parent_modified());
3432
assert(pos != std::numeric_limits<uint16_t>::max());
3533
auto &p = (FixedKVNode<key_t>&)*parent;
36-
return p.is_child_stable(ctx, pos);
34+
auto k = this->is_indirect()
35+
? this->get_intermediate_base()
36+
: get_key();
37+
return p.is_child_stable(ctx, pos, k);
3738
}
3839

3940
template <typename key_t, typename val_t>
4041
bool BtreeNodeMapping<key_t, val_t>::is_data_stable() const
4142
{
42-
assert(parent);
43-
assert(parent->is_valid());
43+
assert(!this->parent_modified());
4444
assert(pos != std::numeric_limits<uint16_t>::max());
4545
auto &p = (FixedKVNode<key_t>&)*parent;
46-
return p.is_child_data_stable(ctx, pos);
46+
auto k = this->is_indirect()
47+
? this->get_intermediate_base()
48+
: get_key();
49+
return p.is_child_data_stable(ctx, pos, k);
4750
}
4851

4952
template class BtreeNodeMapping<laddr_t, paddr_t>;

src/crimson/os/seastore/btree/btree_range_pin.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,37 @@ class BtreeNodeMapping : public PhysicalNodeMapping<key_t, val_t> {
194194
return parent->has_been_invalidated();
195195
}
196196

197+
bool is_unviewable_by_trans(CachedExtent& extent, Transaction &t) const {
198+
if (!extent.is_valid()) {
199+
return true;
200+
}
201+
if (extent.is_pending()) {
202+
assert(extent.is_pending_in_trans(t.get_trans_id()));
203+
return false;
204+
}
205+
auto &pendings = extent.mutation_pendings;
206+
auto trans_id = t.get_trans_id();
207+
bool unviewable = (pendings.find(trans_id, trans_spec_view_t::cmp_t()) !=
208+
pendings.end());
209+
if (!unviewable) {
210+
auto &trans = extent.retired_transactions;
211+
unviewable = (trans.find(trans_id, trans_spec_view_t::cmp_t()) !=
212+
trans.end());
213+
assert(unviewable == t.is_retired(extent.get_paddr(), extent.get_length()));
214+
}
215+
return unviewable;
216+
}
217+
197218
get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final;
198219
bool is_stable() const final;
199220
bool is_data_stable() const final;
221+
bool is_parent_valid() const final {
222+
ceph_assert(parent);
223+
if (!parent->is_valid()) {
224+
return false;
225+
}
226+
return !is_unviewable_by_trans(*parent, ctx.trans);
227+
}
200228
};
201229

202230
}

src/crimson/os/seastore/btree/fixed_kv_node.h

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,14 @@ struct FixedKVNode : ChildableCachedExtent {
265265
set_child_ptracker(child);
266266
}
267267

268-
virtual bool is_child_stable(op_context_t<node_key_t>, uint16_t pos) const = 0;
269-
virtual bool is_child_data_stable(op_context_t<node_key_t>, uint16_t pos) const = 0;
268+
virtual bool is_child_stable(
269+
op_context_t<node_key_t>,
270+
uint16_t pos,
271+
node_key_t key) const = 0;
272+
virtual bool is_child_data_stable(
273+
op_context_t<node_key_t>,
274+
uint16_t pos,
275+
node_key_t key) const = 0;
270276

271277
template <typename T>
272278
get_child_ret_t<T> get_child(
@@ -275,6 +281,7 @@ struct FixedKVNode : ChildableCachedExtent {
275281
node_key_t key)
276282
{
277283
assert(children.capacity());
284+
assert(key == get_key_from_idx(pos));
278285
auto child = children[pos];
279286
ceph_assert(!is_reserved_ptr(child));
280287
if (is_valid_child_ptr(child)) {
@@ -632,11 +639,17 @@ struct FixedKVInternalNode
632639
}
633640
}
634641

635-
bool is_child_stable(op_context_t<NODE_KEY>, uint16_t pos) const final {
642+
bool is_child_stable(
643+
op_context_t<NODE_KEY>,
644+
uint16_t pos,
645+
NODE_KEY key) const final {
636646
ceph_abort("impossible");
637647
return false;
638648
}
639-
bool is_child_data_stable(op_context_t<NODE_KEY>, uint16_t pos) const final {
649+
bool is_child_data_stable(
650+
op_context_t<NODE_KEY>,
651+
uint16_t pos,
652+
NODE_KEY key) const final {
640653
ceph_abort("impossible");
641654
return false;
642655
}
@@ -1004,14 +1017,29 @@ struct FixedKVLeafNode
10041017
node_layout_t(this->get_bptr().c_str()) {}
10051018
FixedKVLeafNode(const FixedKVLeafNode &rhs)
10061019
: FixedKVNode<NODE_KEY>(rhs),
1007-
node_layout_t(this->get_bptr().c_str()) {}
1020+
node_layout_t(this->get_bptr().c_str()),
1021+
modifications(rhs.modifications) {}
10081022

10091023
static constexpr bool do_has_children = has_children;
1024+
// for the stable extent, modifications is always 0;
1025+
// it will increase for each transaction-local change, so that
1026+
// modifications can be detected (see BtreeLBAMapping.parent_modifications)
1027+
uint64_t modifications = 0;
1028+
10101029

10111030
bool have_children() const final {
10121031
return do_has_children;
10131032
}
10141033

1034+
void on_modify() {
1035+
modifications++;
1036+
}
1037+
1038+
bool modified_since(uint64_t v) const {
1039+
ceph_assert(v <= modifications);
1040+
return v != modifications;
1041+
}
1042+
10151043
bool is_leaf_and_has_children() const final {
10161044
return has_children;
10171045
}
@@ -1025,14 +1053,25 @@ struct FixedKVLeafNode
10251053
// 2. The child extent is stable
10261054
//
10271055
// For reserved mappings, the return values are undefined.
1028-
bool is_child_stable(op_context_t<NODE_KEY> c, uint16_t pos) const final {
1029-
return _is_child_stable(c, pos);
1056+
bool is_child_stable(
1057+
op_context_t<NODE_KEY> c,
1058+
uint16_t pos,
1059+
NODE_KEY key) const final {
1060+
return _is_child_stable(c, pos, key);
10301061
}
1031-
bool is_child_data_stable(op_context_t<NODE_KEY> c, uint16_t pos) const final {
1032-
return _is_child_stable(c, pos, true);
1062+
bool is_child_data_stable(
1063+
op_context_t<NODE_KEY> c,
1064+
uint16_t pos,
1065+
NODE_KEY key) const final {
1066+
return _is_child_stable(c, pos, key, true);
10331067
}
10341068

1035-
bool _is_child_stable(op_context_t<NODE_KEY> c, uint16_t pos, bool data_only = false) const {
1069+
bool _is_child_stable(
1070+
op_context_t<NODE_KEY> c,
1071+
uint16_t pos,
1072+
NODE_KEY key,
1073+
bool data_only = false) const {
1074+
assert(key == get_key_from_idx(pos));
10361075
auto child = this->children[pos];
10371076
if (is_reserved_ptr(child)) {
10381077
return true;
@@ -1108,6 +1147,7 @@ struct FixedKVLeafNode
11081147
this->copy_sources.clear();
11091148
}
11101149
}
1150+
modifications = 0;
11111151
assert(this->is_initial_pending()
11121152
? this->copy_sources.empty():
11131153
true);
@@ -1129,6 +1169,7 @@ struct FixedKVLeafNode
11291169
} else {
11301170
this->set_parent_tracker_from_prior_instance();
11311171
}
1172+
modifications = 0;
11321173
}
11331174

11341175
uint16_t lower_bound_offset(NODE_KEY key) const final {

src/crimson/os/seastore/cache.cc

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,7 @@ void Cache::mark_transaction_conflicted(
884884
if (t.get_src() != Transaction::src_t::READ) {
885885
io_stat_t retire_stat;
886886
for (auto &i: t.retired_set) {
887-
retire_stat.increment(i->get_length());
887+
retire_stat.increment(i.extent->get_length());
888888
}
889889
efforts.retire.increment_stat(retire_stat);
890890

@@ -1249,18 +1249,19 @@ record_t Cache::prepare_record(
12491249
alloc_delta_t rel_delta;
12501250
rel_delta.op = alloc_delta_t::op_types_t::CLEAR;
12511251
for (auto &i: t.retired_set) {
1252+
auto &extent = i.extent;
12521253
get_by_ext(efforts.retire_by_ext,
1253-
i->get_type()).increment(i->get_length());
1254-
retire_stat.increment(i->get_length());
1255-
DEBUGT("retired and remove extent -- {}", t, *i);
1256-
commit_retire_extent(t, i);
1257-
if (is_backref_mapped_extent_node(i)
1258-
|| is_retired_placeholder(i->get_type())) {
1254+
extent->get_type()).increment(extent->get_length());
1255+
retire_stat.increment(extent->get_length());
1256+
DEBUGT("retired and remove extent -- {}", t, *extent);
1257+
commit_retire_extent(t, extent);
1258+
if (is_backref_mapped_extent_node(extent)
1259+
|| is_retired_placeholder(extent->get_type())) {
12591260
rel_delta.alloc_blk_ranges.emplace_back(
1260-
i->get_paddr(),
1261+
extent->get_paddr(),
12611262
L_ADDR_NULL,
1262-
i->get_length(),
1263-
i->get_type());
1263+
extent->get_length(),
1264+
extent->get_type());
12641265
}
12651266
}
12661267
alloc_deltas.emplace_back(std::move(rel_delta));
@@ -1621,7 +1622,8 @@ void Cache::complete_commit(
16211622
}
16221623

16231624
for (auto &i: t.retired_set) {
1624-
epm.mark_space_free(i->get_paddr(), i->get_length());
1625+
auto &extent = i.extent;
1626+
epm.mark_space_free(extent->get_paddr(), extent->get_length());
16251627
}
16261628
for (auto &i: t.existing_block_list) {
16271629
if (i->is_valid()) {
@@ -1638,24 +1640,25 @@ void Cache::complete_commit(
16381640

16391641
last_commit = start_seq;
16401642
for (auto &i: t.retired_set) {
1641-
i->dirty_from_or_retired_at = start_seq;
1642-
if (is_backref_mapped_extent_node(i)
1643-
|| is_retired_placeholder(i->get_type())) {
1643+
auto &extent = i.extent;
1644+
extent->dirty_from_or_retired_at = start_seq;
1645+
if (is_backref_mapped_extent_node(extent)
1646+
|| is_retired_placeholder(extent->get_type())) {
16441647
DEBUGT("backref_list free {} len {}",
16451648
t,
1646-
i->get_paddr(),
1647-
i->get_length());
1649+
extent->get_paddr(),
1650+
extent->get_length());
16481651
backref_list.emplace_back(
16491652
std::make_unique<backref_entry_t>(
1650-
i->get_paddr(),
1653+
extent->get_paddr(),
16511654
L_ADDR_NULL,
1652-
i->get_length(),
1653-
i->get_type(),
1655+
extent->get_length(),
1656+
extent->get_type(),
16541657
start_seq));
1655-
} else if (is_backref_node(i->get_type())) {
1656-
remove_backref_extent(i->get_paddr());
1658+
} else if (is_backref_node(extent->get_type())) {
1659+
remove_backref_extent(extent->get_paddr());
16571660
} else {
1658-
ERRORT("{}", t, *i);
1661+
ERRORT("{}", t, *extent);
16591662
ceph_abort("not possible");
16601663
}
16611664
}

src/crimson/os/seastore/cached_extent.h

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,7 @@ class CachedExtent
651651
friend struct paddr_cmp;
652652
friend struct ref_paddr_cmp;
653653
friend class ExtentIndex;
654+
friend struct trans_retired_extent_link_t;
654655

655656
/// Pointer to containing index (or null)
656657
ExtentIndex *parent_index = nullptr;
@@ -735,6 +736,7 @@ class CachedExtent
735736

736737
protected:
737738
trans_view_set_t mutation_pendings;
739+
trans_view_set_t retired_transactions;
738740

739741
CachedExtent(CachedExtent &&other) = delete;
740742
CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) {
@@ -884,17 +886,54 @@ struct paddr_cmp {
884886
}
885887
};
886888

889+
// trans_retired_extent_link_t is used to link stable extents with
890+
// the transactions that retired them. With this link, we can find
891+
// out whether an extent has been retired by a specific transaction
892+
// in a way that's more efficient than searching through the transaction's
893+
// retired_set (Transaction::is_retired())
894+
struct trans_retired_extent_link_t {
895+
CachedExtentRef extent;
896+
// We use trans_spec_view_t instead of transaction_id_t, so that,
897+
// when a transaction is deleted or reset, we can efficiently remove
898+
// that transaction from the extents' extent-transaction link set.
899+
// Otherwise, we have to search through each extent's "retired_transactions"
900+
// to remove the transaction
901+
trans_spec_view_t trans_view;
902+
trans_retired_extent_link_t(CachedExtentRef extent, transaction_id_t id)
903+
: extent(extent), trans_view{id}
904+
{
905+
assert(extent->is_stable());
906+
extent->retired_transactions.insert(trans_view);
907+
}
908+
};
909+
887910
/// Compare extent refs by paddr
888911
struct ref_paddr_cmp {
889912
using is_transparent = paddr_t;
890-
bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const {
891-
return lhs->poffset < rhs->poffset;
892-
}
893-
bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const {
894-
return lhs < rhs->poffset;
895-
}
896-
bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const {
897-
return lhs->poffset < rhs;
913+
bool operator()(
914+
const trans_retired_extent_link_t &lhs,
915+
const trans_retired_extent_link_t &rhs) const {
916+
return lhs.extent->poffset < rhs.extent->poffset;
917+
}
918+
bool operator()(
919+
const paddr_t &lhs,
920+
const trans_retired_extent_link_t &rhs) const {
921+
return lhs < rhs.extent->poffset;
922+
}
923+
bool operator()(
924+
const trans_retired_extent_link_t &lhs,
925+
const paddr_t &rhs) const {
926+
return lhs.extent->poffset < rhs;
927+
}
928+
bool operator()(
929+
const CachedExtentRef &lhs,
930+
const trans_retired_extent_link_t &rhs) const {
931+
return lhs->poffset < rhs.extent->poffset;
932+
}
933+
bool operator()(
934+
const trans_retired_extent_link_t &lhs,
935+
const CachedExtentRef &rhs) const {
936+
return lhs.extent->poffset < rhs->poffset;
898937
}
899938
};
900939

@@ -910,7 +949,7 @@ class addr_extent_set_base_t
910949

911950
using pextent_set_t = addr_extent_set_base_t<
912951
paddr_t,
913-
CachedExtentRef,
952+
trans_retired_extent_link_t,
914953
ref_paddr_cmp
915954
>;
916955

@@ -1112,6 +1151,15 @@ class PhysicalNodeMapping {
11121151
bool is_zero_reserved() const {
11131152
return !get_val().is_real();
11141153
}
1154+
virtual bool is_parent_valid() const = 0;
1155+
virtual bool parent_modified() const {
1156+
ceph_abort("impossible");
1157+
return false;
1158+
};
1159+
1160+
virtual void maybe_fix_pos() {
1161+
ceph_abort("impossible");
1162+
}
11151163

11161164
virtual ~PhysicalNodeMapping() {}
11171165
protected:

0 commit comments

Comments
 (0)