Skip to content

Commit 76b8785

Browse files
committed
crimson/os/seastore: avoid new allocation when overwriting data in RBM for performance
In 4K random write test, after seastore is filled up by 4MB extents, current implementation performs deep copy in duplicate_for_write(), resulting in significant performance degradation by 80%. Therefore, this commit changes the deep copy behavior for bufferptr during the overwrite situation to shallow copy, leaving the original data untouched. Signed-off-by: Myoungwon Oh <[email protected]> Signed-off-by: Yingxin Cheng <[email protected]>
1 parent ac7e052 commit 76b8785

File tree

3 files changed

+211
-13
lines changed

3 files changed

+211
-13
lines changed

src/crimson/os/seastore/cached_extent.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -547,11 +547,11 @@ class CachedExtent
547547
}
548548

549549
/// Get ref to raw buffer
550-
bufferptr &get_bptr() {
550+
virtual bufferptr &get_bptr() {
551551
assert(ptr.has_value());
552552
return *ptr;
553553
}
554-
const bufferptr &get_bptr() const {
554+
virtual const bufferptr &get_bptr() const {
555555
assert(ptr.has_value());
556556
return *ptr;
557557
}
@@ -648,11 +648,6 @@ class CachedExtent
648648
return extent_index_hook.is_linked();
649649
}
650650

651-
/// set bufferptr
652-
void set_bptr(ceph::bufferptr &&nptr) {
653-
ptr = nptr;
654-
}
655-
656651
/// hook for intrusive ref list (mainly dirty or lru list)
657652
boost::intrusive::list_member_hook<> primary_ref_list_hook;
658653
using primary_ref_list_member_options = boost::intrusive::member_hook<
@@ -799,6 +794,11 @@ class CachedExtent
799794
poffset = offset;
800795
}
801796

797+
/// set bufferptr
798+
void set_bptr(ceph::bufferptr &&nptr) {
799+
ptr = nptr;
800+
}
801+
802802
/**
803803
* maybe_generate_relative
804804
*

src/crimson/os/seastore/object_data_handler.h

Lines changed: 86 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,22 +30,72 @@ struct block_delta_t {
3030
}
3131
};
3232

33+
class overwrite_buf_t {
34+
public:
35+
overwrite_buf_t() = default;
36+
bool is_empty() const {
37+
return changes.empty() && !has_cached_bptr();
38+
}
39+
bool has_cached_bptr() const {
40+
return ptr.has_value();
41+
}
42+
void add(const block_delta_t &b) {
43+
changes.push_back(b);
44+
}
45+
void apply_changes_to(bufferptr &b) const {
46+
assert(!changes.empty());
47+
for (auto p : changes) {
48+
auto iter = p.bl.cbegin();
49+
iter.copy(p.bl.length(), b.c_str() + p.offset);
50+
}
51+
changes.clear();
52+
}
53+
const bufferptr &get_cached_bptr(const bufferptr &_ptr) const {
54+
apply_changes_to_cache(_ptr);
55+
return *ptr;
56+
}
57+
bufferptr &get_cached_bptr(const bufferptr &_ptr) {
58+
apply_changes_to_cache(_ptr);
59+
return *ptr;
60+
}
61+
bufferptr &&move_cached_bptr() {
62+
assert(has_cached_bptr());
63+
apply_changes_to(*ptr);
64+
return std::move(*ptr);
65+
}
66+
private:
67+
void apply_changes_to_cache(const bufferptr &_ptr) const {
68+
assert(!is_empty());
69+
if (!has_cached_bptr()) {
70+
ptr = ceph::buffer::copy(_ptr.c_str(), _ptr.length());
71+
}
72+
if (!changes.empty()) {
73+
apply_changes_to(*ptr);
74+
}
75+
}
76+
mutable std::vector<block_delta_t> changes = {};
77+
mutable std::optional<ceph::bufferptr> ptr = std::nullopt;
78+
};
79+
3380
struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
3481
using Ref = TCachedExtentRef<ObjectDataBlock>;
3582

3683
std::vector<block_delta_t> delta = {};
3784

3885
interval_set<extent_len_t> modified_region;
3986

87+
// to provide the local modified view during transaction
88+
overwrite_buf_t cached_overwrites;
89+
4090
explicit ObjectDataBlock(ceph::bufferptr &&ptr)
4191
: LogicalCachedExtent(std::move(ptr)) {}
42-
explicit ObjectDataBlock(const ObjectDataBlock &other)
43-
: LogicalCachedExtent(other), modified_region(other.modified_region) {}
92+
explicit ObjectDataBlock(const ObjectDataBlock &other, share_buffer_t s)
93+
: LogicalCachedExtent(other, s), modified_region(other.modified_region) {}
4494
explicit ObjectDataBlock(extent_len_t length)
4595
: LogicalCachedExtent(length) {}
4696

4797
CachedExtentRef duplicate_for_write(Transaction&) final {
48-
return CachedExtentRef(new ObjectDataBlock(*this));
98+
return CachedExtentRef(new ObjectDataBlock(*this, share_buffer_t{}));
4999
};
50100

51101
static constexpr extent_types_t TYPE = extent_types_t::OBJECT_DATA_BLOCK;
@@ -54,9 +104,9 @@ struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
54104
}
55105

56106
void overwrite(extent_len_t offset, bufferlist bl) {
57-
auto iter = bl.cbegin();
58-
iter.copy(bl.length(), get_bptr().c_str() + offset);
59-
delta.push_back({offset, bl.length(), bl});
107+
block_delta_t b {offset, bl.length(), bl};
108+
cached_overwrites.add(b);
109+
delta.push_back(b);
60110
modified_region.union_insert(offset, bl.length());
61111
}
62112

@@ -76,9 +126,39 @@ struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
76126
modified_region.clear();
77127
}
78128

129+
void prepare_commit() final {
130+
if (is_mutation_pending() || is_exist_mutation_pending()) {
131+
ceph_assert(!cached_overwrites.is_empty());
132+
if (cached_overwrites.has_cached_bptr()) {
133+
set_bptr(cached_overwrites.move_cached_bptr());
134+
} else {
135+
// The optimized path to minimize data copy
136+
cached_overwrites.apply_changes_to(CachedExtent::get_bptr());
137+
}
138+
} else {
139+
assert(cached_overwrites.is_empty());
140+
}
141+
}
142+
79143
void logical_on_delta_write() final {
80144
delta.clear();
81145
}
146+
147+
bufferptr &get_bptr() override {
148+
if (cached_overwrites.is_empty()) {
149+
return CachedExtent::get_bptr();
150+
} else {
151+
return cached_overwrites.get_cached_bptr(CachedExtent::get_bptr());
152+
}
153+
}
154+
155+
const bufferptr &get_bptr() const override {
156+
if (cached_overwrites.is_empty()) {
157+
return CachedExtent::get_bptr();
158+
} else {
159+
return cached_overwrites.get_cached_bptr(CachedExtent::get_bptr());
160+
}
161+
}
82162
};
83163
using ObjectDataBlockRef = TCachedExtentRef<ObjectDataBlock>;
84164

src/test/crimson/seastore/test_object_data_handler.cc

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,38 @@ struct object_data_handler_test_t:
221221
return ret;
222222
}
223223

224+
using remap_entry = TransactionManager::remap_entry;
225+
LBAMappingRef remap_pin(
226+
Transaction &t,
227+
LBAMappingRef &&opin,
228+
extent_len_t new_offset,
229+
extent_len_t new_len) {
230+
auto pin = with_trans_intr(t, [&](auto& trans) {
231+
return tm->remap_pin<ObjectDataBlock>(
232+
trans, std::move(opin), std::array{
233+
remap_entry(new_offset, new_len)}
234+
).si_then([](auto ret) {
235+
return std::move(ret[0]);
236+
});
237+
}).handle_error(crimson::ct_error::eagain::handle([] {
238+
LBAMappingRef t = nullptr;
239+
return t;
240+
}), crimson::ct_error::pass_further_all{}).unsafe_get0();
241+
EXPECT_TRUE(pin);
242+
return pin;
243+
}
244+
245+
ObjectDataBlockRef get_extent(
246+
Transaction &t,
247+
laddr_t addr,
248+
extent_len_t len) {
249+
auto ext = with_trans_intr(t, [&](auto& trans) {
250+
return tm->read_extent<ObjectDataBlock>(trans, addr, len);
251+
}).unsafe_get0();
252+
EXPECT_EQ(addr, ext->get_laddr());
253+
return ext;
254+
}
255+
224256
seastar::future<> set_up_fut() final {
225257
onode = new TestOnode(
226258
DEFAULT_OBJECT_DATA_RESERVATION,
@@ -713,6 +745,92 @@ TEST_P(object_data_handler_test_t, random_overwrite) {
713745
});
714746
}
715747

748+
TEST_P(object_data_handler_test_t, overwrite_then_read_within_transaction) {
749+
run_async([this] {
750+
set_overwrite_threshold();
751+
auto t = create_mutate_transaction();
752+
auto base = 4096 * 4;
753+
auto len = 4096 * 6;
754+
write(*t, base, len, 'a');
755+
submit_transaction(std::move(t));
756+
757+
t = create_mutate_transaction();
758+
{
759+
auto pins = get_mappings(base, len);
760+
assert(pins.size() == 1);
761+
auto pin1 = remap_pin(*t, std::move(pins.front()), 4096, 8192);
762+
auto ext = get_extent(*t, base + 4096, 4096 * 2);
763+
ASSERT_TRUE(ext->is_exist_clean());
764+
ext = tm->get_mutable_extent(*t, ext)->cast<ObjectDataBlock>();
765+
766+
auto l = 4096;
767+
memset(
768+
known_contents.c_str() + base + 4096,
769+
'z',
770+
l);
771+
bufferlist bl;
772+
bl.append(
773+
bufferptr(
774+
known_contents,
775+
base + 4096,
776+
l));
777+
778+
ext->overwrite(0, bl);
779+
ASSERT_TRUE(ext->is_exist_mutation_pending());
780+
}
781+
submit_transaction(std::move(t));
782+
read(base + 4096, 4096);
783+
read(base + 4096, 8192);
784+
restart();
785+
epm->check_usage();
786+
read(base + 4096, 8192);
787+
788+
t = create_mutate_transaction();
789+
base = 0;
790+
len = 4096 * 3;
791+
write(*t, base, len, 'a');
792+
submit_transaction(std::move(t));
793+
794+
t = create_mutate_transaction();
795+
write(*t, base + 4096, 4096, 'b');
796+
read(*t, base + 1024, 4096 + 1024);
797+
write(*t, base + 8192, 4096, 'c');
798+
read(*t, base + 2048, 8192);
799+
write(*t, base, 4096, 'd');
800+
write(*t, base + 4096, 4096, 'x');
801+
submit_transaction(std::move(t));
802+
read(base + 1024, 8192 - 1024);
803+
read(base, 4096 * 3);
804+
restart();
805+
epm->check_usage();
806+
read(base, 4096 * 3);
807+
808+
auto t1 = create_mutate_transaction();
809+
write(*t1, base + 4096, 4096, 'e');
810+
read(*t1, base + 4096, 4096);
811+
auto t2 = create_read_transaction();
812+
bufferlist committed = with_trans_intr(*t2, [&](auto &t) {
813+
return ObjectDataHandler(MAX_OBJECT_SIZE).read(
814+
ObjectDataHandler::context_t{
815+
*tm,
816+
t,
817+
*onode
818+
},
819+
base + 4096,
820+
4096);
821+
}).unsafe_get0();
822+
bufferlist pending;
823+
pending.append(
824+
bufferptr(
825+
known_contents,
826+
base + 4096,
827+
4096));
828+
EXPECT_EQ(committed.length(), pending.length());
829+
EXPECT_NE(committed, pending);
830+
unset_overwrite_threshold();
831+
});
832+
}
833+
716834
INSTANTIATE_TEST_SUITE_P(
717835
object_data_handler_test,
718836
object_data_handler_test_t,

0 commit comments

Comments
 (0)