Skip to content

Commit 58ae168

Browse files
authored
Merge pull request ceph#62513 from xxhdx1985126/wip-seastore-transaction-manager-iterator
crimson/os/seastore: LBACursor based LBAManager/TM interfaces and the related ObjectDataHandler refactor

Reviewed-by: Samuel Just <[email protected]>
2 parents 470e980 + 30ffda1 commit 58ae168

25 files changed, with 3335 additions and 2202 deletions.

src/common/options/crimson.yaml.in

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -173,13 +173,6 @@ options:
173173
Note this size ratio does not reflect actual memory usage, as it represents the size of evicted
174174
pages from A1_in queue.
175175
default: 0.5
176-
- name: seastore_obj_data_write_amplification
177-
type: float
178-
level: advanced
179-
desc: split extent if ratio of total extent size to write size exceeds this value
180-
default: 1.25
181-
# TODO: seastore_obj_data_write_amplification is no longer correct if
182-
# seastore_data_delta_based_overwrite is enabled. So, this should be reconsidered.
183176
- name: seastore_max_concurrent_transactions
184177
type: uint
185178
level: advanced

src/crimson/os/seastore/btree/btree_types.cc

Lines changed: 76 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,85 @@
44
#include "crimson/os/seastore/btree/btree_types.h"
55
#include "crimson/os/seastore/lba/lba_btree_node.h"
66
#include "crimson/os/seastore/backref/backref_tree_node.h"
7+
#include "crimson/os/seastore/lba/btree_lba_manager.h"
78

89
namespace crimson::os::seastore {
910

11+
LBACursor::base_iertr::future<> LBACursor::refresh()
12+
{
13+
LOG_PREFIX(LBACursor::refresh);
14+
return with_btree<lba::LBABtree>(
15+
ctx.cache,
16+
ctx,
17+
[this, FNAME, c=ctx](auto &btree) {
18+
c.trans.cursor_stats.num_refresh_parent_total++;
19+
20+
if (!parent->is_valid()) {
21+
c.trans.cursor_stats.num_refresh_invalid_parent++;
22+
SUBTRACET(
23+
seastore_lba,
24+
"cursor {} parent is invalid, re-search from scratch",
25+
c.trans, *this);
26+
return btree.lower_bound(c, this->get_laddr()
27+
).si_then([this](lba::LBABtree::iterator iter) {
28+
auto leaf = iter.get_leaf_node();
29+
parent = leaf;
30+
modifications = leaf->modifications;
31+
pos = iter.get_leaf_pos();
32+
if (!is_end()) {
33+
ceph_assert(!iter.is_end());
34+
ceph_assert(iter.get_key() == get_laddr());
35+
val = iter.get_val();
36+
assert(is_viewable());
37+
}
38+
});
39+
}
40+
assert(parent->is_stable() ||
41+
parent->is_pending_in_trans(c.trans.get_trans_id()));
42+
auto leaf = parent->cast<lba::LBALeafNode>();
43+
if (leaf->is_pending_in_trans(c.trans.get_trans_id())) {
44+
if (leaf->modified_since(modifications)) {
45+
c.trans.cursor_stats.num_refresh_modified_viewable_parent++;
46+
} else {
47+
// no need to refresh
48+
return base_iertr::now();
49+
}
50+
} else {
51+
auto [viewable, l] = leaf->resolve_transaction(c.trans, key);
52+
SUBTRACET(
53+
seastore_lba,
54+
"cursor: {} viewable: {}",
55+
c.trans, *this, viewable);
56+
if (!viewable) {
57+
leaf = l;
58+
c.trans.cursor_stats.num_refresh_unviewable_parent++;
59+
parent = leaf;
60+
} else {
61+
assert(leaf.get() == l.get());
62+
assert(leaf->is_stable());
63+
return base_iertr::now();
64+
}
65+
}
66+
67+
modifications = leaf->modifications;
68+
if (is_end()) {
69+
pos = leaf->get_size();
70+
assert(!val);
71+
} else {
72+
auto i = leaf->lower_bound(get_laddr());
73+
pos = i.get_offset();
74+
val = i.get_val();
75+
76+
auto iter = lba::LBALeafNode::iterator(leaf.get(), pos);
77+
ceph_assert(iter.get_key() == key);
78+
ceph_assert(iter.get_val() == val);
79+
assert(is_viewable());
80+
}
81+
82+
return base_iertr::now();
83+
});
84+
}
85+
1086
namespace lba {
1187

1288
std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v)
@@ -58,7 +134,6 @@ bool BtreeCursor<key_t, val_t>::is_viewable() const {
58134
}
59135

60136
auto [viewable, state] = parent->is_viewable_by_trans(ctx.trans);
61-
assert(state != CachedExtent::viewable_state_t::invalid);
62137
SUBTRACET(seastore_cache, "{} with viewable state {}",
63138
ctx.trans, *parent, state);
64139
return viewable;

src/crimson/os/seastore/btree/btree_types.h

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -198,7 +198,9 @@ struct __attribute__((packed)) backref_map_val_le_t {
198198
* time.
199199
*/
200200
template <typename key_t, typename val_t>
201-
struct BtreeCursor {
201+
struct BtreeCursor
202+
: public boost::intrusive_ref_counter<
203+
BtreeCursor<key_t, val_t>, boost::thread_unsafe_counter> {
202204
BtreeCursor(
203205
op_context_t &ctx,
204206
CachedExtentRef parent,
@@ -253,8 +255,7 @@ struct LBACursor : BtreeCursor<laddr_t, lba::lba_map_val_t> {
253255
using Base = BtreeCursor<laddr_t, lba::lba_map_val_t>;
254256
using Base::BtreeCursor;
255257
bool is_indirect() const {
256-
assert(!is_end());
257-
return val->pladdr.is_laddr();
258+
return !is_end() && val->pladdr.is_laddr();
258259
}
259260
laddr_t get_laddr() const {
260261
return key;
@@ -274,21 +275,27 @@ struct LBACursor : BtreeCursor<laddr_t, lba::lba_map_val_t> {
274275
assert(!is_indirect());
275276
return val->checksum;
276277
}
278+
bool contains(laddr_t laddr) const {
279+
return get_laddr() <= laddr && get_laddr() + get_length() > laddr;
280+
}
277281
extent_ref_count_t get_refcount() const {
278282
assert(!is_end());
279283
assert(!is_indirect());
280284
return val->refcount;
281285
}
282-
std::unique_ptr<LBACursor> duplicate() const {
283-
return std::make_unique<LBACursor>(*this);
284-
}
286+
287+
using base_ertr = crimson::errorator<
288+
crimson::ct_error::input_output_error>;
289+
using base_iertr = trans_iertr<base_ertr>;
290+
base_iertr::future<> refresh();
285291
};
286-
using LBACursorRef = std::unique_ptr<LBACursor>;
292+
using LBACursorRef = boost::intrusive_ptr<LBACursor>;
287293

288294
struct BackrefCursor : BtreeCursor<paddr_t, backref::backref_map_val_t> {
289295
using Base = BtreeCursor<paddr_t, backref::backref_map_val_t>;
290296
using Base::BtreeCursor;
291297
paddr_t get_paddr() const {
298+
assert(key.is_absolute());
292299
return key;
293300
}
294301
laddr_t get_laddr() const {
@@ -300,7 +307,7 @@ struct BackrefCursor : BtreeCursor<paddr_t, backref::backref_map_val_t> {
300307
return val->type;
301308
}
302309
};
303-
using BackrefCursorRef = std::unique_ptr<BackrefCursor>;
310+
using BackrefCursorRef = boost::intrusive_ptr<BackrefCursor>;
304311

305312
template <typename key_t, typename val_t>
306313
std::ostream &operator<<(

src/crimson/os/seastore/btree/fixed_kv_btree.h

Lines changed: 60 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -286,14 +286,13 @@ class FixedKVBtree {
286286
}
287287

288288
// handle_boundary() must be called before get_cursor
289-
std::unique_ptr<cursor_t> get_cursor(op_context_t ctx) const {
290-
assert(!is_end());
291-
return std::make_unique<cursor_t>(
289+
boost::intrusive_ptr<cursor_t> get_cursor(op_context_t ctx) const {
290+
return new cursor_t(
292291
ctx,
293292
leaf.node,
294293
leaf.node->modifications,
295-
get_key(),
296-
std::make_optional(get_val()),
294+
is_end() ? min_max_t<node_key_t>::max : get_key(),
295+
is_end() ? std::nullopt : std::make_optional(get_val()),
297296
leaf.pos);
298297
}
299298

@@ -490,32 +489,38 @@ class FixedKVBtree {
490489
}
491490

492491
iterator make_partial_iter(
492+
op_context_t c,
493+
cursor_t &cursor)
494+
{
495+
return make_partial_iter(
496+
c,
497+
cursor.parent->template cast<leaf_node_t>(),
498+
cursor.key,
499+
cursor.pos);
500+
}
501+
502+
boost::intrusive_ptr<cursor_t> get_cursor(
503+
op_context_t c,
504+
TCachedExtentRef<leaf_node_t> leaf,
505+
node_key_t key)
506+
{
507+
auto it = leaf->lower_bound(key);
508+
assert(it != leaf->end());
509+
return new cursor_t(
510+
c, leaf, leaf->modifications,
511+
key, it.get_val(), it.get_offset());
512+
}
513+
514+
boost::intrusive_ptr<cursor_t> get_cursor(
493515
op_context_t c,
494516
TCachedExtentRef<leaf_node_t> leaf,
495517
node_key_t key,
496518
uint16_t pos)
497519
{
498-
assert(leaf->is_valid());
499-
assert(leaf->is_viewable_by_trans(c.trans).first);
500-
501-
auto depth = get_root().get_depth();
502-
#ifndef NDEBUG
503-
auto ret = iterator(
504-
depth,
505-
depth == 1
506-
? iterator::state_t::FULL
507-
: iterator::state_t::PARTIAL);
508-
#else
509-
auto ret = iterator(depth);
510-
#endif
511-
ret.leaf.node = leaf;
512-
ret.leaf.pos = pos;
513-
if (ret.is_end()) {
514-
ceph_assert(key == min_max_t<node_key_t>::max);
515-
} else {
516-
ceph_assert(key == ret.get_key());
517-
}
518-
return ret;
520+
assert(leaf->get_size() != pos);
521+
auto it = leaf->iter_idx(pos);
522+
assert(it.get_key() == key);
523+
return new cursor_t(c, leaf, leaf->modifications, key, it.get_val(), pos);
519524
}
520525

521526
/**
@@ -1358,6 +1363,35 @@ class FixedKVBtree {
13581363
private:
13591364
RootBlockRef root_block;
13601365

1366+
iterator make_partial_iter(
1367+
op_context_t c,
1368+
TCachedExtentRef<leaf_node_t> leaf,
1369+
node_key_t key,
1370+
uint16_t pos)
1371+
{
1372+
assert(leaf->is_valid());
1373+
assert(leaf->is_viewable_by_trans(c.trans).first);
1374+
1375+
auto depth = get_root().get_depth();
1376+
#ifndef NDEBUG
1377+
auto ret = iterator(
1378+
depth,
1379+
depth == 1
1380+
? iterator::state_t::FULL
1381+
: iterator::state_t::PARTIAL);
1382+
#else
1383+
auto ret = iterator(depth);
1384+
#endif
1385+
ret.leaf.node = leaf;
1386+
ret.leaf.pos = pos;
1387+
if (ret.is_end()) {
1388+
ceph_assert(key == min_max_t<node_key_t>::max);
1389+
} else {
1390+
ceph_assert(key == ret.get_key());
1391+
}
1392+
return ret;
1393+
}
1394+
13611395
template <typename T>
13621396
using node_position_t = typename iterator::template node_position_t<T>;
13631397

src/crimson/os/seastore/cache.cc

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -213,6 +213,26 @@ void Cache::register_metrics()
213213
},
214214
sm::description("total number of cache hits")
215215
),
216+
sm::make_counter(
217+
"refresh_parent_total",
218+
cursor_stats.num_refresh_parent_total,
219+
sm::description("total number of refreshed cursors")
220+
),
221+
sm::make_counter(
222+
"refresh_invalid_parent",
223+
cursor_stats.num_refresh_invalid_parent,
224+
sm::description("total number of refreshed cursors with invalid parents")
225+
),
226+
sm::make_counter(
227+
"refresh_unviewable_parent",
228+
cursor_stats.num_refresh_unviewable_parent,
229+
sm::description("total number of refreshed cursors with unviewable parents")
230+
),
231+
sm::make_counter(
232+
"refresh_modified_viewable_parent",
233+
cursor_stats.num_refresh_modified_viewable_parent,
234+
sm::description("total number of refreshed cursors with viewable but modified parents")
235+
),
216236
}
217237
);
218238

@@ -1743,6 +1763,7 @@ record_t Cache::prepare_record(
17431763
assert(rewrite_stats.is_clear());
17441764
}
17451765

1766+
cursor_stats.apply(t.cursor_stats);
17461767
return record;
17471768
}
17481769

src/crimson/os/seastore/cache.h

Lines changed: 3 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -449,22 +449,12 @@ class Cache : public ExtentTransViewRetriever {
449449
std::forward<Func>(extent_init_func));
450450
}
451451

452-
bool is_viewable_extent_stable(
452+
CachedExtentRef peek_extent_viewable_by_trans(
453453
Transaction &t,
454454
CachedExtentRef extent) final
455455
{
456456
assert(extent);
457-
auto view = extent->get_transactional_view(t);
458-
return view->is_stable();
459-
}
460-
461-
bool is_viewable_extent_data_stable(
462-
Transaction &t,
463-
CachedExtentRef extent) final
464-
{
465-
assert(extent);
466-
auto view = extent->get_transactional_view(t);
467-
return view->is_data_stable();
457+
return extent->get_transactional_view(t);
468458
}
469459

470460
get_extent_iertr::future<> maybe_wait_accessible(
@@ -1670,6 +1660,7 @@ class Cache : public ExtentTransViewRetriever {
16701660
uint64_t hit = 0;
16711661
};
16721662

1663+
btree_cursor_stats_t cursor_stats;
16731664
struct invalid_trans_efforts_t {
16741665
io_stat_t read;
16751666
io_stat_t mutate;

src/crimson/os/seastore/cached_extent.cc

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -104,9 +104,7 @@ void CachedExtent::set_invalid(Transaction &t) {
104104

105105
std::pair<bool, CachedExtent::viewable_state_t>
106106
CachedExtent::is_viewable_by_trans(Transaction &t) {
107-
if (!is_valid()) {
108-
return std::make_pair(false, viewable_state_t::invalid);
109-
}
107+
ceph_assert(is_valid());
110108

111109
auto trans_id = t.get_trans_id();
112110
if (is_pending()) {
@@ -142,8 +140,6 @@ std::ostream &operator<<(
142140
return out << "stable";
143141
case CachedExtent::viewable_state_t::pending:
144142
return out << "pending";
145-
case CachedExtent::viewable_state_t::invalid:
146-
return out << "invalid";
147143
case CachedExtent::viewable_state_t::stable_become_retired:
148144
return out << "stable_become_retired";
149145
case CachedExtent::viewable_state_t::stable_become_pending:

src/crimson/os/seastore/cached_extent.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -806,7 +806,6 @@ class CachedExtent
806806
enum class viewable_state_t {
807807
stable, // viewable
808808
pending, // viewable
809-
invalid, // unviewable
810809
stable_become_retired, // unviewable
811810
stable_become_pending, // unviewable
812811
};

0 commit comments

Comments
 (0)