Skip to content

Commit e299273

Browse files
committed
crimson/os/seastore/object_data_handler: we don't have to maintain the
symmetric indirect LBA relationship at the time of clone. OP_CLONE is now done in the following way: 1. First, swap the layouts of the head onode and the clone onode, so that the clone onode's object_data, omap_root, xattr_root and log_root all point to the head onode's corresponding fields; 2. Then do SeaStore::_clone() from the clone onode to the head onode, which is exactly how a rollback is done. This makes the code of ObjectDataHandler::clone() and ObjectDataHandler::copy_on_write() even simpler, and facilitates the clone/rollback scenarios when the "128-bit" LBA key layout is involved. Signed-off-by: Xuehan Xu <[email protected]>
1 parent b8a9b77 commit e299273

File tree

7 files changed

+117
-64
lines changed

7 files changed

+117
-64
lines changed

src/crimson/os/seastore/lba_mapping.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ class LBAMapping {
111111
bool is_zero_reserved() const {
112112
return !is_indirect() && get_val().is_zero();
113113
}
114+
// true if the mapping corresponds to real data
114115
bool is_real() const {
115116
return !is_indirect() && !get_val().is_zero();
116117
}

src/crimson/os/seastore/object_data_handler.cc

Lines changed: 36 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,17 +1383,43 @@ ObjectDataHandler::clear_ret ObjectDataHandler::clear(
13831383
});
13841384
}
13851385

1386+
ObjectDataHandler::clone_ret
1387+
ObjectDataHandler::do_clone(
1388+
context_t ctx,
1389+
object_data_t &object_data,
1390+
object_data_t &d_object_data,
1391+
LBAMapping first_mapping,
1392+
bool updateref)
1393+
{
1394+
LOG_PREFIX("ObjectDataHandler::do_clone");
1395+
assert(d_object_data.is_null());
1396+
auto old_base = object_data.get_reserved_data_base();
1397+
auto old_len = object_data.get_reserved_data_len();
1398+
auto mapping = co_await prepare_data_reservation(
1399+
ctx, d_object_data, old_len);
1400+
ceph_assert(mapping.has_value());
1401+
DEBUGT("new obj reserve_data_base: {}, len 0x{:x}",
1402+
ctx.t,
1403+
d_object_data.get_reserved_data_base(),
1404+
d_object_data.get_reserved_data_len());
1405+
auto pos = co_await ctx.tm.remove(ctx.t, std::move(*mapping)
1406+
).handle_error_interruptible(
1407+
clone_iertr::pass_further{},
1408+
crimson::ct_error::assert_all{"unexpected enoent"}
1409+
);
1410+
auto base = d_object_data.get_reserved_data_base();
1411+
auto len = d_object_data.get_reserved_data_len();
1412+
auto cr_ret = co_await ctx.tm.clone_range(
1413+
ctx.t, old_base, base, 0, len, std::move(pos),
1414+
std::move(first_mapping), updateref);
1415+
if (cr_ret.shared_direct_mapping) {
1416+
ctx.onode.set_need_cow(ctx.t);
1417+
}
1418+
}
1419+
13861420
ObjectDataHandler::clone_ret ObjectDataHandler::clone(
13871421
context_t ctx)
13881422
{
1389-
// the whole clone procedure can be separated into the following steps:
1390-
// 1. let clone onode(d_object_data) take the head onode's
1391-
// object data base;
1392-
// 2. reserve a new region in lba tree for the head onode;
1393-
// 3. clone all extents of the clone onode, see transaction_manager.h
1394-
// for the details of clone_pin;
1395-
// 4. reserve the space between the head onode's size and its reservation
1396-
// length.
13971423
return with_objects_data(
13981424
ctx,
13991425
[ctx, this](auto &object_data, auto &d_object_data) {
@@ -1403,50 +1429,8 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone(
14031429
}
14041430
return ctx.tm.get_pin(ctx.t, object_data.get_reserved_data_base()
14051431
).si_then([this, &object_data, &d_object_data, ctx](auto mapping) {
1406-
auto old_base = object_data.get_reserved_data_base();
1407-
auto old_len = object_data.get_reserved_data_len();
1408-
return prepare_data_reservation(
1409-
ctx,
1410-
d_object_data,
1411-
object_data.get_reserved_data_len()
1412-
).si_then([&object_data, &d_object_data, ctx](auto mapping) {
1413-
assert(!object_data.is_null());
1414-
assert(mapping);
1415-
LOG_PREFIX(ObjectDataHandler::clone);
1416-
DEBUGT("cloned obj reserve_data_base: {}, len 0x{:x}",
1417-
ctx.t,
1418-
d_object_data.get_reserved_data_base(),
1419-
d_object_data.get_reserved_data_len());
1420-
return ctx.tm.remove(ctx.t, std::move(*mapping));
1421-
}).si_then([mapping, &d_object_data, ctx](auto pos) mutable {
1422-
auto base = d_object_data.get_reserved_data_base();
1423-
auto len = d_object_data.get_reserved_data_len();
1424-
return ctx.tm.clone_range(
1425-
ctx.t, base, len, std::move(pos), std::move(mapping), true);
1426-
}).si_then([ctx, &object_data, &d_object_data, this] {
1427-
object_data.clear();
1428-
return prepare_data_reservation(
1429-
ctx,
1430-
object_data,
1431-
d_object_data.get_reserved_data_len()
1432-
).si_then([ctx, &object_data](auto mapping) {
1433-
LOG_PREFIX("ObjectDataHandler::clone");
1434-
DEBUGT("head obj reserve_data_base: {}, len 0x{:x}",
1435-
ctx.t,
1436-
object_data.get_reserved_data_base(),
1437-
object_data.get_reserved_data_len());
1438-
return ctx.tm.remove(ctx.t, std::move(*mapping));
1439-
});
1440-
}).si_then([ctx, &object_data, mapping](auto pos) mutable {
1441-
auto base = object_data.get_reserved_data_base();
1442-
auto len = object_data.get_reserved_data_len();
1443-
return ctx.tm.clone_range(
1444-
ctx.t, base, len, std::move(pos), std::move(mapping), false);
1445-
}).si_then([ctx, mapping, old_base, old_len] {
1446-
return ctx.tm.remove_mappings_in_range(
1447-
ctx.t, old_base, old_len, std::move(mapping), {false, true}
1448-
).discard_result();
1449-
});
1432+
ceph_assert(ctx.d_onode);
1433+
return do_clone(ctx, object_data, d_object_data, std::move(mapping), true);
14501434
}).handle_error_interruptible(
14511435
clone_iertr::pass_further{},
14521436
crimson::ct_error::assert_all{"unexpected enoent"}

src/crimson/os/seastore/object_data_handler.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,10 @@ class ObjectDataHandler {
323323
clear_ret clear(context_t ctx);
324324

325325
/// Clone data of an Onode
326+
/// Note that clone always assumes that ctx.onode
327+
/// is a snap onode, so, for OP_CLONE, the caller of
328+
/// this method should swap the layout of the onode
329+
/// and the dest_onode first.
326330
using clone_iertr = base_iertr;
327331
using clone_ret = clone_iertr::future<>;
328332
clone_ret clone(context_t ctx);
@@ -337,6 +341,21 @@ class ObjectDataHandler {
337341
std::optional<bufferlist> &&bl,
338342
LBAMapping first_mapping);
339343

344+
/**
345+
* do_clone
346+
*
347+
* Clone lba mappings from object_data to d_object_data.
348+
* object_data must belong to ctx.onode, and d_object_data must belong to ctx.d_onode.
349+
* This implementation is asymmetric and optimizes for (but does not require) the case
350+
* that source is not further mutated.
351+
*/
352+
clone_ret do_clone(
353+
context_t ctx,
354+
object_data_t &object_data,
355+
object_data_t &d_object_data,
356+
LBAMapping first_mapping,
357+
bool updateref);
358+
340359
/// Ensures object_data reserved region is prepared
341360
write_iertr::future<std::optional<LBAMapping>>
342361
prepare_data_reservation(

src/crimson/os/seastore/onode.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,15 @@ class Onode : public boost::intrusive_ref_counter<
9393
virtual const onode_layout_t &get_layout() const = 0;
9494
virtual ~Onode() = default;
9595

96+
bool is_head() const {
97+
return hobj.is_head();
98+
}
99+
bool is_snap() const {
100+
return hobj.is_snap();
101+
}
102+
bool need_cow() const {
103+
return get_layout().need_cow;
104+
}
96105
virtual void update_onode_size(Transaction&, uint32_t) = 0;
97106
virtual void update_omap_root(Transaction&, omap_root_t&) = 0;
98107
virtual void update_log_root(Transaction&, omap_root_t&) = 0;

src/crimson/os/seastore/seastore.cc

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1957,21 +1957,49 @@ SeaStore::Shard::_clone(
19571957
{
19581958
auto &object_size = onode.get_layout().size;
19591959
d_onode.update_onode_size(*ctx.transaction, object_size);
1960-
return objHandler.clone(
1961-
ObjectDataHandler::context_t{
1962-
*transaction_manager,
1963-
*ctx.transaction,
1964-
onode,
1965-
&d_onode});
1960+
if (onode.is_head()) { // OP_CLONE
1961+
assert(onode.is_head());
1962+
assert(d_onode.is_snap());
1963+
/* The most common usage of OP_CLONE is during a write operation.
1964+
* The osd will submit a transaction cloning HEAD to clone and
1965+
* then mutating HEAD. ObjectDataHandler::do_clone optimizes for
1966+
* the case where the *source* is not further mutated, so here we
1967+
* reverse the two onodes so that HEAD will be the target.
1968+
*/
1969+
onode.swap_layout(*ctx.transaction, d_onode);
1970+
return objHandler.clone(
1971+
ObjectDataHandler::context_t{
1972+
*transaction_manager,
1973+
*ctx.transaction,
1974+
d_onode,
1975+
&onode});
1976+
} else { // OP_ROLLBACK
1977+
assert(d_onode.is_head());
1978+
return objHandler.clone(
1979+
ObjectDataHandler::context_t{
1980+
*transaction_manager,
1981+
*ctx.transaction,
1982+
onode,
1983+
&d_onode});
1984+
}
19661985
}).si_then([&ctx, &onode, &d_onode, this] {
19671986
return omaptree_clone(
1968-
*ctx.transaction, omap_type_t::XATTR, onode, d_onode);
1987+
*ctx.transaction,
1988+
omap_type_t::XATTR,
1989+
onode.is_head() ? d_onode : onode,
1990+
onode.is_head() ? onode : d_onode);
19691991
}).si_then([&ctx, &onode, &d_onode, this] {
19701992
return omaptree_clone(
1971-
*ctx.transaction, omap_type_t::OMAP, onode, d_onode);
1993+
*ctx.transaction,
1994+
omap_type_t::OMAP,
1995+
onode.is_head() ? d_onode : onode,
1996+
onode.is_head() ? onode : d_onode);
19721997
}).si_then([&ctx, &onode, &d_onode, this] {
19731998
return omaptree_clone(
1974-
*ctx.transaction, omap_type_t::LOG, onode, d_onode);
1999+
*ctx.transaction,
2000+
omap_type_t::LOG,
2001+
onode.is_head() ? d_onode : onode,
2002+
onode.is_head() ? onode : d_onode);
19752003
});
19762004
}
19772005

src/crimson/os/seastore/seastore_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,6 +1824,9 @@ class object_data_t {
18241824
reserved_data_len = 0;
18251825
}
18261826
};
1827+
constexpr object_data_t get_null_object_data() {
1828+
return object_data_t{L_ADDR_NULL, 0};
1829+
}
18271830

18281831
struct __attribute__((packed)) object_data_le_t {
18291832
laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL);

src/crimson/os/seastore/transaction_manager.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -611,8 +611,10 @@ class TransactionManager : public ExtentCallbackInterface {
611611
});
612612
}
613613

614+
// clone the mappings in range base~len; returns true if any of the
615+
// cloned mappings is real (i.e. neither indirect nor zero-reserved).
614616
using clone_iertr = base_iertr;
615-
using clone_ret = clone_iertr::future<>;
617+
using clone_ret = clone_iertr::future<bool>;
616618
clone_ret clone_range(
617619
Transaction &t,
618620
laddr_t base,
@@ -628,9 +630,11 @@ class TransactionManager : public ExtentCallbackInterface {
628630
std::move(pos),
629631
std::move(mapping),
630632
(extent_len_t)0,
631-
[&t, this, updateref, base, len](auto &pos, auto &mapping, auto &offset) {
633+
false,
634+
[&t, this, updateref, base, len]
635+
(auto &pos, auto &mapping, auto &offset, auto &ret) {
632636
return trans_intr::repeat(
633-
[&t, this, &pos, &mapping, &offset, updateref, base, len]()
637+
[&t, this, &pos, &mapping, &offset, updateref, base, len, &ret]()
634638
-> clone_iertr::future<seastar::stop_iteration> {
635639
if (offset >= len) {
636640
return clone_iertr::make_ready_future<
@@ -657,6 +661,9 @@ class TransactionManager : public ExtentCallbackInterface {
657661
crimson::ct_error::assert_all{"unexpected error"}
658662
);
659663
}
664+
if (mapping.is_real()) {
665+
ret = true;
666+
}
660667
return clone_pin(
661668
t, std::move(pos), std::move(mapping),
662669
(base + offset).checked_to_laddr(), updateref
@@ -671,6 +678,8 @@ class TransactionManager : public ExtentCallbackInterface {
671678
return seastar::stop_iteration::no;
672679
});
673680
});
681+
}).si_then([&ret] {
682+
return ret;
674683
});
675684
});
676685
}

0 commit comments

Comments
 (0)