Skip to content

Commit a0c9fec

Browse files
committed
os/Transaction: page align write data buffers to improve performance
Align write data in ObjectStore::Transaction encoded buffers so that RepOp and ECSubOpWrite messages align data on the receiving OSD to avoid a later memmove. Also fix ECSubOpReadReply messages in a similar way so that read data is aligned on the receiving OSD. Signed-off-by: Radoslaw Zarzynski <[email protected]> Signed-off-by: Bill Scales <[email protected]>
1 parent f3cb73c commit a0c9fec

17 files changed

+1537
-201
lines changed

src/crimson/osd/ops_executer.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,17 +1102,18 @@ void OpsExecuter::apply_stats()
11021102
pg->apply_stats(get_target(), delta_stats);
11031103
}
11041104

1105-
OpsExecuter::OpsExecuter(Ref<PG> pg,
1105+
OpsExecuter::OpsExecuter(Ref<PG> _pg,
11061106
ObjectContextRef _obc,
11071107
const OpInfo& op_info,
11081108
abstracted_msg_t&& msg,
11091109
crimson::net::ConnectionXcoreRef conn,
11101110
const SnapContext& _snapc)
1111-
: pg(std::move(pg)),
1111+
: pg(std::move(_pg)),
11121112
obc(std::move(_obc)),
11131113
op_info(op_info),
11141114
msg(std::move(msg)),
11151115
conn(conn),
1116+
txn(pg->min_peer_features()),
11161117
snapc(_snapc)
11171118
{
11181119
if (op_info.may_write() && should_clone(*obc, snapc)) {

src/crimson/osd/pg.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,8 +1221,10 @@ PG::handle_rep_op_fut PG::handle_rep_op(Ref<MOSDRepOp> req)
12211221
DEBUGDPP("{}", *this, *req);
12221222

12231223
ceph::os::Transaction txn;
1224-
auto encoded_txn = req->get_data().cbegin();
1225-
decode(txn, encoded_txn);
1224+
auto encoded_txn_p = req->get_middle().cbegin();
1225+
auto encoded_txn_d = req->get_data().cbegin();
1226+
txn.decode(req->get_middle().length() != 0 ? encoded_txn_p : encoded_txn_d,
1227+
encoded_txn_d);
12261228
auto p = req->logbl.cbegin();
12271229
std::vector<pg_log_entry_t> log_entries;
12281230
decode(log_entries, p);

src/crimson/osd/replicated_backend.cc

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ ReplicatedBackend::_read(const hobject_t& hoid,
3939
MURef<MOSDRepOp> ReplicatedBackend::new_repop_msg(
4040
const pg_shard_t &pg_shard,
4141
const hobject_t &hoid,
42-
const bufferlist &encoded_txn,
42+
bufferlist &encoded_txn_p_bl,
43+
bufferlist &encoded_txn_d_bl,
4344
const osd_op_params_t &osd_op_p,
4445
epoch_t min_epoch,
4546
epoch_t map_epoch,
@@ -59,7 +60,13 @@ MURef<MOSDRepOp> ReplicatedBackend::new_repop_msg(
5960
tid,
6061
osd_op_p.at_version);
6162
if (send_op) {
62-
m->set_data(encoded_txn);
63+
if (encoded_txn_d_bl.length() != 0) {
64+
m->set_txn_payload(encoded_txn_p_bl);
65+
m->set_data(encoded_txn_d_bl);
66+
} else {
67+
// Pre-tentacle format - everything in data
68+
m->set_data(encoded_txn_p_bl);
69+
}
6370
} else {
6471
ceph::os::Transaction t;
6572
bufferlist bl;
@@ -97,8 +104,8 @@ ReplicatedBackend::submit_transaction(
97104
pg_shards.size(),
98105
osd_op_p.at_version,
99106
pg.get_last_complete()).first;
100-
bufferlist encoded_txn;
101-
encode(txn, encoded_txn);
107+
bufferlist encoded_txn_p_bl, encoded_txn_d_bl;
108+
txn.encode(encoded_txn_p_bl, encoded_txn_d_bl, pg.min_peer_features());
102109

103110
bool is_delete = false;
104111
for (auto &le : log_entries) {
@@ -120,11 +127,11 @@ ReplicatedBackend::submit_transaction(
120127
MURef<MOSDRepOp> m;
121128
if (pg.should_send_op(pg_shard, hoid)) {
122129
m = new_repop_msg(
123-
pg_shard, hoid, encoded_txn, osd_op_p,
130+
pg_shard, hoid, encoded_txn_p_bl, encoded_txn_d_bl, osd_op_p,
124131
min_epoch, map_epoch, log_entries, true, tid);
125132
} else {
126133
m = new_repop_msg(
127-
pg_shard, hoid, encoded_txn, osd_op_p,
134+
pg_shard, hoid, encoded_txn_p_bl, encoded_txn_d_bl, osd_op_p,
128135
min_epoch, map_epoch, log_entries, false, tid);
129136
if (pg.is_missing_on_peer(pg_shard, hoid)) {
130137
if (_new_clone) {

src/crimson/osd/replicated_backend.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ class ReplicatedBackend : public PGBackend
7070
MURef<MOSDRepOp> new_repop_msg(
7171
const pg_shard_t &pg_shard,
7272
const hobject_t &hoid,
73-
const bufferlist &encoded_txn,
73+
bufferlist &encoded_txn_p_bl,
74+
bufferlist &encoded_txn_d_bl,
7475
const osd_op_params_t &osd_op_p,
7576
epoch_t min_epoch,
7677
epoch_t map_epoch,

src/messages/MOSDECSubOpReadReply.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,10 @@ class MOSDECSubOpReadReply : public MOSDFastDispatchOp {
4848
void decode_payload() override {
4949
using ceph::decode;
5050
auto p = payload.cbegin();
51+
auto d = data.cbegin();
5152
decode(pgid, p);
5253
decode(map_epoch, p);
53-
decode(op, p);
54+
op.decode(p, d);
5455
if (header.version >= 2) {
5556
decode(min_epoch, p);
5657
decode_trace(p);
@@ -63,7 +64,7 @@ class MOSDECSubOpReadReply : public MOSDFastDispatchOp {
6364
using ceph::encode;
6465
encode(pgid, payload);
6566
encode(map_epoch, payload);
66-
encode(op, payload);
67+
op.encode(payload, data, features);
6768
encode(min_epoch, payload);
6869
encode_trace(payload, features);
6970
}

src/messages/MOSDECSubOpWrite.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,10 @@ class MOSDECSubOpWrite : public MOSDFastDispatchOp {
5252
void decode_payload() override {
5353
using ceph::decode;
5454
auto p = payload.cbegin();
55+
auto d = data.cbegin();
5556
decode(pgid, p);
5657
decode(map_epoch, p);
57-
decode(op, p);
58+
op.decode(p, d);
5859
if (header.version >= 2) {
5960
decode(min_epoch, p);
6061
decode_trace(p);
@@ -67,7 +68,7 @@ class MOSDECSubOpWrite : public MOSDFastDispatchOp {
6768
using ceph::encode;
6869
encode(pgid, payload);
6970
encode(map_epoch, payload);
70-
encode(op, payload);
71+
op.encode(payload, data, features);
7172
encode(min_epoch, payload);
7273
encode_trace(payload, features);
7374
}

src/messages/MOSDRepOp.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class MOSDRepOp final : public MOSDFastDispatchOp {
8585
/// non-empty if this transaction involves a hit_set history update
8686
std::optional<pg_hit_set_history_t> updated_hit_set_history;
8787

88+
bufferlist txn_payload;
89+
8890
epoch_t get_map_epoch() const override {
8991
return map_epoch;
9092
}
@@ -99,6 +101,11 @@ class MOSDRepOp final : public MOSDFastDispatchOp {
99101
return data.length();
100102
}
101103

104+
void set_txn_payload(bufferlist bl)
105+
{
106+
txn_payload = bl;
107+
}
108+
102109
void decode_payload() override {
103110
using ceph::decode;
104111
p = payload.cbegin();
@@ -159,6 +166,8 @@ class MOSDRepOp final : public MOSDFastDispatchOp {
159166
encode(from, payload);
160167
encode(updated_hit_set_history, payload);
161168
encode(pg_committed_to, payload);
169+
bufferlist middle(txn_payload);
170+
set_middle(middle);
162171
}
163172

164173
MOSDRepOp()

src/msg/Message.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ class Message : public RefCountedObject {
441441
byte_throttler->take(middle.length());
442442
}
443443
ceph::buffer::list& get_middle() { return middle; }
444+
const ceph::buffer::list& get_middle() const { return middle; }
444445

445446
void set_data(const ceph::buffer::list &bl) {
446447
if (byte_throttler)

0 commit comments

Comments
 (0)