Skip to content

Commit 39df30d

Browse files
authored
Merge pull request ceph#50419 from myoungwon/wip-rbm-partial-overwrite
crimson/os/seastore: introduce delta-based overwrite Reviewed-by: Yingxin Cheng <[email protected]>
2 parents c7493fe + 99a6a32 commit 39df30d

File tree

8 files changed

+482
-131
lines changed

8 files changed

+482
-131
lines changed

src/common/options/crimson.yaml.in

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,3 +117,8 @@ options:
117117
level: advanced
118118
desc: Begin fast eviction when the used ratio of the main tier reaches this value.
119119
default: 0.7
120+
- name: seastore_data_delta_based_overwrite
121+
type: size
122+
level: dev
123+
desc: Overwrite the existing data block based on delta if the original size is smaller than the value; otherwise, overwrite based on remapping. Set to 0 to enforce the remap-based overwrite.
124+
default: 0

src/crimson/os/seastore/backref/btree_backref_manager.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> {
3535
return type;
3636
}
3737

38+
bool is_clone() const final {
39+
return false;
40+
}
41+
3842
protected:
3943
std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate(
4044
op_context_t<paddr_t> ctx) const final {

src/crimson/os/seastore/cached_extent.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,10 @@ class PhysicalNodeMapping {
10541054
}
10551055

10561056
virtual bool is_stable() const = 0;
1057+
virtual bool is_clone() const = 0;
1058+
bool is_zero_reserved() const {
1059+
return !get_val().is_real();
1060+
}
10571061

10581062
virtual ~PhysicalNodeMapping() {}
10591063
protected:

src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,10 @@ class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
137137
return intermediate_length;
138138
}
139139

140+
bool is_clone() const final {
141+
return get_map_val().refcount > 1;
142+
}
143+
140144
protected:
141145
std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate(
142146
op_context_t<laddr_t> ctx) const final {

src/crimson/os/seastore/object_data_handler.cc

Lines changed: 136 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ using extent_to_write_list_t = std::list<extent_to_write_t>;
9898
// Encapsulates extents to be written out using do_remappings.
9999
struct extent_to_remap_t {
100100
enum class type_t {
101-
REMAP,
101+
REMAP1,
102+
REMAP2,
102103
OVERWRITE
103104
};
104105
type_t type;
@@ -114,54 +115,75 @@ struct extent_to_remap_t {
114115
extent_to_remap_t(const extent_to_remap_t &) = delete;
115116
extent_to_remap_t(extent_to_remap_t &&) = default;
116117

117-
bool is_remap() const {
118-
return type == type_t::REMAP;
118+
bool is_remap1() const {
119+
return type == type_t::REMAP1;
119120
}
120121

121-
bool is_overwrite() const {
122+
bool is_remap2() const {
122123
assert((new_offset != 0) && (pin->get_length() != new_offset + new_len));
124+
return type == type_t::REMAP2;
125+
}
126+
127+
bool is_overwrite() const {
123128
return type == type_t::OVERWRITE;
124129
}
125130

126131
using remap_entry = TransactionManager::remap_entry;
127132
remap_entry create_remap_entry() {
128-
assert(is_remap());
133+
assert(is_remap1());
129134
return remap_entry(
130135
new_offset,
131136
new_len);
132137
}
133138

134139
remap_entry create_left_remap_entry() {
135-
assert(is_overwrite());
140+
assert(is_remap2());
136141
return remap_entry(
137142
0,
138143
new_offset);
139144
}
140145

141146
remap_entry create_right_remap_entry() {
142-
assert(is_overwrite());
147+
assert(is_remap2());
143148
return remap_entry(
144149
new_offset + new_len,
145150
pin->get_length() - new_offset - new_len);
146151
}
147152

148-
static extent_to_remap_t create_remap(
153+
static extent_to_remap_t create_remap1(
149154
LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) {
150-
return extent_to_remap_t(type_t::REMAP,
155+
return extent_to_remap_t(type_t::REMAP1,
151156
std::move(pin), new_offset, new_len);
152157
}
153158

154-
static extent_to_remap_t create_overwrite(
159+
static extent_to_remap_t create_remap2(
155160
LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) {
156-
return extent_to_remap_t(type_t::OVERWRITE,
161+
return extent_to_remap_t(type_t::REMAP2,
157162
std::move(pin), new_offset, new_len);
158163
}
159164

165+
static extent_to_remap_t create_overwrite(
166+
extent_len_t new_offset, extent_len_t new_len, LBAMappingRef p,
167+
bufferlist b) {
168+
return extent_to_remap_t(type_t::OVERWRITE,
169+
nullptr, new_offset, new_len, p->get_key(), p->get_length(), b);
170+
}
171+
172+
uint64_t laddr_start;
173+
extent_len_t length;
174+
std::optional<bufferlist> bl;
175+
160176
private:
161177
extent_to_remap_t(type_t type,
162178
LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len)
163179
: type(type),
164180
pin(std::move(pin)), new_offset(new_offset), new_len(new_len) {}
181+
extent_to_remap_t(type_t type,
182+
LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len,
183+
uint64_t ori_laddr, extent_len_t ori_len, std::optional<bufferlist> b)
184+
: type(type),
185+
pin(std::move(pin)), new_offset(new_offset), new_len(new_len),
186+
laddr_start(ori_laddr), length(ori_len), bl(b) {}
165187
};
166188
using extent_to_remap_list_t = std::list<extent_to_remap_t>;
167189

@@ -222,7 +244,8 @@ struct overwrite_ops_t {
222244
// prepare to_remap, to_retire, to_insert list
223245
overwrite_ops_t prepare_ops_list(
224246
lba_pin_list_t &pins_to_remove,
225-
extent_to_write_list_t &to_write) {
247+
extent_to_write_list_t &to_write,
248+
size_t delta_based_overwrite_max_extent_size) {
226249
assert(pins_to_remove.size() != 0);
227250
overwrite_ops_t ops;
228251
ops.to_remove.swap(pins_to_remove);
@@ -241,7 +264,7 @@ overwrite_ops_t prepare_ops_list(
241264
assert(to_write.size() > 2);
242265
assert(front.addr == front.pin->get_key());
243266
assert(back.addr > back.pin->get_key());
244-
ops.to_remap.push_back(extent_to_remap_t::create_overwrite(
267+
ops.to_remap.push_back(extent_to_remap_t::create_remap2(
245268
std::move(front.pin),
246269
front.len,
247270
back.addr - front.addr - front.len));
@@ -252,7 +275,7 @@ overwrite_ops_t prepare_ops_list(
252275
visitted++;
253276
assert(to_write.size() > 1);
254277
assert(front.addr == front.pin->get_key());
255-
ops.to_remap.push_back(extent_to_remap_t::create_remap(
278+
ops.to_remap.push_back(extent_to_remap_t::create_remap1(
256279
std::move(front.pin),
257280
0,
258281
front.len));
@@ -263,28 +286,81 @@ overwrite_ops_t prepare_ops_list(
263286
assert(to_write.size() > 1);
264287
assert(back.addr + back.len ==
265288
back.pin->get_key() + back.pin->get_length());
266-
ops.to_remap.push_back(extent_to_remap_t::create_remap(
289+
ops.to_remap.push_back(extent_to_remap_t::create_remap1(
267290
std::move(back.pin),
268291
back.addr - back.pin->get_key(),
269292
back.len));
270293
ops.to_remove.pop_back();
271294
}
272295
}
273296

274-
// prepare to_insert
297+
interval_set<uint64_t> pre_alloc_addr_removed, pre_alloc_addr_remapped;
298+
if (delta_based_overwrite_max_extent_size) {
299+
for (auto &r : ops.to_remove) {
300+
if (r->is_stable() && !r->is_zero_reserved()) {
301+
pre_alloc_addr_removed.insert(r->get_key(), r->get_length());
302+
303+
}
304+
}
305+
for (auto &r : ops.to_remap) {
306+
if (r.pin && r.pin->is_stable() && !r.pin->is_zero_reserved()) {
307+
pre_alloc_addr_remapped.insert(r.pin->get_key(), r.pin->get_length());
308+
}
309+
}
310+
}
311+
312+
// prepare to insert
313+
extent_to_remap_list_t to_remap;
275314
for (auto &region : to_write) {
276315
if (region.is_data()) {
277316
visitted++;
278317
assert(region.to_write.has_value());
279-
ops.to_insert.push_back(extent_to_insert_t::create_data(
280-
region.addr, region.len, region.to_write));
318+
int erased_num = 0;
319+
if (pre_alloc_addr_removed.contains(region.addr, region.len) &&
320+
region.len <= delta_based_overwrite_max_extent_size) {
321+
erased_num = std::erase_if(
322+
ops.to_remove,
323+
[&region, &to_remap](auto &r) {
324+
interval_set<uint64_t> range;
325+
range.insert(r->get_key(), r->get_length());
326+
if (range.contains(region.addr, region.len) && !r->is_clone()) {
327+
to_remap.push_back(extent_to_remap_t::create_overwrite(
328+
0, region.len, std::move(r), *region.to_write));
329+
return true;
330+
}
331+
return false;
332+
});
333+
// if the size of the region is wider than the range from the entry in to_remove,
334+
// we create a separate extent in the original way.
335+
} else if (pre_alloc_addr_remapped.contains(region.addr, region.len) &&
336+
region.len <= delta_based_overwrite_max_extent_size) {
337+
erased_num = std::erase_if(
338+
ops.to_remap,
339+
[&region, &to_remap](auto &r) {
340+
interval_set<uint64_t> range;
341+
range.insert(r.pin->get_key(), r.pin->get_length());
342+
if (range.contains(region.addr, region.len) && !r.pin->is_clone()) {
343+
to_remap.push_back(extent_to_remap_t::create_overwrite(
344+
region.addr - range.begin().get_start(), region.len,
345+
std::move(r.pin), *region.to_write));
346+
return true;
347+
}
348+
return false;
349+
});
350+
assert(erased_num > 0);
351+
}
352+
if (erased_num == 0) {
353+
ops.to_insert.push_back(extent_to_insert_t::create_data(
354+
region.addr, region.len, region.to_write));
355+
}
281356
} else if (region.is_zero()) {
282357
visitted++;
283358
assert(!(region.to_write.has_value()));
284359
ops.to_insert.push_back(extent_to_insert_t::create_zero(
285360
region.addr, region.len));
286361
}
287362
}
363+
ops.to_remap.splice(ops.to_remap.end(), to_remap);
288364

289365
logger().debug(
290366
"to_remap list size: {}"
@@ -334,6 +410,22 @@ void splice_extent_to_write(
334410
}
335411
}
336412

413+
ceph::bufferlist ObjectDataBlock::get_delta() {
414+
ceph::bufferlist bl;
415+
encode(delta, bl);
416+
return bl;
417+
}
418+
419+
void ObjectDataBlock::apply_delta(const ceph::bufferlist &bl) {
420+
auto biter = bl.begin();
421+
decltype(delta) deltas;
422+
decode(deltas, biter);
423+
for (auto &&d : deltas) {
424+
auto iter = d.bl.cbegin();
425+
iter.copy(d.len, get_bptr().c_str() + d.offset);
426+
}
427+
}
428+
337429
/// Creates remap extents in to_remap
338430
ObjectDataHandler::write_ret do_remappings(
339431
context_t ctx,
@@ -342,7 +434,7 @@ ObjectDataHandler::write_ret do_remappings(
342434
return trans_intr::do_for_each(
343435
to_remap,
344436
[ctx](auto &region) {
345-
if (region.is_remap()) {
437+
if (region.is_remap1()) {
346438
return ctx.tm.remap_pin<ObjectDataBlock, 1>(
347439
ctx.t,
348440
std::move(region.pin),
@@ -355,6 +447,22 @@ ObjectDataHandler::write_ret do_remappings(
355447
return ObjectDataHandler::write_iertr::now();
356448
});
357449
} else if (region.is_overwrite()) {
450+
return ctx.tm.get_mutable_extent_by_laddr<ObjectDataBlock>(
451+
ctx.t,
452+
region.laddr_start,
453+
region.length
454+
).handle_error_interruptible(
455+
TransactionManager::base_iertr::pass_further{},
456+
crimson::ct_error::assert_all{
457+
"ObjectDataHandler::do_remapping hit invalid error"
458+
}
459+
).si_then([&region](auto extent) {
460+
extent_len_t off = region.new_offset;
461+
assert(region.bl->length() == region.new_len);
462+
extent->overwrite(off, *region.bl);
463+
return ObjectDataHandler::write_iertr::now();
464+
});
465+
} else if (region.is_remap2()) {
358466
return ctx.tm.remap_pin<ObjectDataBlock, 2>(
359467
ctx.t,
360468
std::move(region.pin),
@@ -960,7 +1068,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
9601068
return seastar::do_with(
9611069
lba_pin_list_t(),
9621070
extent_to_write_list_t(),
963-
[ctx, size, &object_data](auto &pins, auto &to_write) {
1071+
[ctx, size, &object_data, this](auto &pins, auto &to_write) {
9641072
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
9651073
DEBUGT("object_data: {}~{}",
9661074
ctx.t,
@@ -1038,9 +1146,10 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
10381146
});
10391147
}
10401148
}
1041-
}).si_then([ctx, size, &to_write, &object_data, &pins] {
1149+
}).si_then([ctx, size, &to_write, &object_data, &pins, this] {
10421150
return seastar::do_with(
1043-
prepare_ops_list(pins, to_write),
1151+
prepare_ops_list(pins, to_write,
1152+
delta_based_overwrite_max_extent_size),
10441153
[ctx, size, &object_data](auto &ops) {
10451154
return do_remappings(ctx, ops.to_remap
10461155
).si_then([ctx, &ops] {
@@ -1162,7 +1271,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
11621271
return seastar::do_with(
11631272
std::move(_pins),
11641273
extent_to_write_list_t(),
1165-
[ctx, len, offset, overwrite_plan, bl=std::move(bl)]
1274+
[ctx, len, offset, overwrite_plan, bl=std::move(bl), this]
11661275
(auto &pins, auto &to_write) mutable
11671276
{
11681277
LOG_PREFIX(ObjectDataHandler::overwrite);
@@ -1178,7 +1287,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
11781287
pins.front(),
11791288
overwrite_plan
11801289
).si_then([ctx, len, offset, overwrite_plan, bl=std::move(bl),
1181-
&to_write, &pins](auto p) mutable {
1290+
&to_write, &pins, this](auto p) mutable {
11821291
auto &[left_extent, headptr] = p;
11831292
if (left_extent) {
11841293
ceph_assert(left_extent->addr == overwrite_plan.pin_begin);
@@ -1195,7 +1304,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
11951304
pin_begin=overwrite_plan.pin_begin,
11961305
pin_end=overwrite_plan.pin_end,
11971306
bl=std::move(bl), headptr=std::move(headptr),
1198-
&to_write, &pins](auto p) mutable {
1307+
&to_write, &pins, this](auto p) mutable {
11991308
auto &[right_extent, tailptr] = p;
12001309
if (bl.has_value()) {
12011310
auto write_offset = offset;
@@ -1232,7 +1341,8 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
12321341
assert(pin_end == to_write.back().get_end_addr());
12331342

12341343
return seastar::do_with(
1235-
prepare_ops_list(pins, to_write),
1344+
prepare_ops_list(pins, to_write,
1345+
delta_based_overwrite_max_extent_size),
12361346
[ctx](auto &ops) {
12371347
return do_remappings(ctx, ops.to_remap
12381348
).si_then([ctx, &ops] {

0 commit comments

Comments
 (0)