Skip to content

Commit 127457a

Browse files
committed
osd: Implement sync reads and sparse reads for EC for direct reads
Sparse reads for EC are simple to implement, as the code is essentially identical to that of the replica backend, with some address translation. When doing a direct read in EC, only a single OSD (the one servicing the read) is involved. As such, we can do the more performant sync read rather than an async read. Signed-off-by: Alex Ainscow <[email protected]>
1 parent c437014 commit 127457a

File tree

4 files changed

+96
-3
lines changed

4 files changed

+96
-3
lines changed

src/osd/ECBackend.cc

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1007,7 +1007,31 @@ int ECBackend::objects_read_sync(
10071007
uint64_t len,
10081008
uint32_t op_flags,
10091009
bufferlist *bl) {
1010-
return -EOPNOTSUPP;
1010+
1011+
if (!sinfo.supports_direct_reads()) {
1012+
return -EOPNOTSUPP;
1013+
}
1014+
1015+
if (get_parent()->get_local_missing().is_missing(hoid)) {
1016+
return -EIO; // Permission denied (cos its missing)
1017+
}
1018+
1019+
auto [shard_offset, shard_len] = extent_to_shard_extent(off, len);
1020+
1021+
1022+
dout(20) << __func__ << " Submitting sync read: "
1023+
<< " hoid=" << hoid
1024+
<< " shard_offset=" << shard_offset
1025+
<< " shard_len=" << shard_len
1026+
<< " op_flags=" << op_flags
1027+
<< " primary=" << switcher->is_primary()
1028+
<< dendl;
1029+
1030+
1031+
return switcher->store->read(switcher->ch,
1032+
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
1033+
shard_offset,
1034+
shard_len, *bl, op_flags);
10111035
}
10121036

10131037
std::pair<uint64_t, uint64_t> ECBackend::extent_to_shard_extent(uint64_t off, uint64_t len) {
@@ -1034,6 +1058,41 @@ std::pair<uint64_t, uint64_t> ECBackend::extent_to_shard_extent(uint64_t off, ui
10341058
return std::pair(shard_offset, shard_len);
10351059
}
10361060

1061+
// Synchronous vectored (sparse) read of this OSD's local shard of `hoid`.
//
// hoid:     object to read.
// m:        in/out. On entry its extents are handed to the store's readv()
//           against the local shard object. On success it is rewritten with
//           the corresponding extents expressed as "ro" offsets, as produced
//           by sinfo.shard_offset_to_ro_offset(). On a negative return it has
//           already been consumed and cleared.
// op_flags: passed through unchanged to the store.
// bl:       receives the data read by the store.
//
// Returns -EACCES when the object is in the local missing set, the negative
// result of readv() on failure, and 0 on success.
//
// NOTE(review): objects_read_sync() returns -EIO for the same "locally
// missing" condition; confirm which errno is intended and align the two.
// NOTE(review): unlike objects_read_sync(), there is no
// sinfo.supports_direct_reads() guard here; confirm callers guarantee it.
int ECBackend::objects_readv_sync(const hobject_t &hoid,
1062+
std::map<uint64_t, uint64_t>& m,
1063+
uint32_t op_flags,
1064+
ceph::buffer::list *bl) {
1065+
if (get_parent()->get_local_missing().is_missing(hoid)) {
1066+
return -EACCES; // Object is in the local missing set; reported as EACCES.
1067+
}
1068+
1069+
// Not using extent set, since we need the one used by readv.
1070+
1071+
auto shard = get_parent()->whoami_shard().shard;
1072+
interval_set im(std::move(m));
1073+
m.clear(); // Make m safe to write to again.
1074+
auto r = switcher->store->readv(switcher->ch, ghobject_t(hoid, ghobject_t::NO_GEN, shard), im, *bl, op_flags);
1075+
if (r >= 0) {
1076+
uint64_t chunk_size = sinfo.get_chunk_size();
1077+
// Translate each shard extent that was read into one or more ro extents.
// `off` and `len` are copies, so consuming `len` below does not modify im.
for (auto [off, len] : im) {
1078+
uint64_t ro_offset = sinfo.shard_offset_to_ro_offset(shard, off);
1079+
// Bytes from `off` up to the next chunk_size boundary; this is a full
// chunk_size when `off` is already aligned.
uint64_t to_next_chunk = ((off / chunk_size) + 1) * chunk_size - off;
1080+
// The first piece stops at that boundary; later pieces are at most one
// whole chunk each.
uint64_t ro_len = std::min(to_next_chunk, len);
1081+
while (len > 0 ) {
1082+
// NOTE(review): `off` is never advanced, so after the first iteration
// the logged "off~len" pairs the original offset with the remaining
// length.
dout(20) << __func__ << " shard=" << shard << " extent=" << off << "~" << len << ">" << ro_offset << "~" << ro_len << dendl;
1083+
m.emplace(ro_offset, ro_len);
1084+
len -= ro_len;
1085+
// Step past the piece just emitted, then skip the remaining
// (stripe_width - chunk_size) bytes of the stripe.
ro_offset += ro_len + sinfo.get_stripe_width() - chunk_size;
1086+
ro_len = std::min(len, chunk_size);
1087+
}
1088+
}
1089+
} else {
1090+
return r;
1091+
}
1092+
1093+
// Success is reported as 0, not as the non-negative value readv() returned.
return 0;
1094+
}
1095+
10371096
void ECBackend::objects_read_async(
10381097
const hobject_t &hoid,
10391098
uint64_t object_size,

src/osd/ECBackend.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ class ECBackend : public ECCommon {
140140

141141
std::pair<uint64_t, uint64_t> extent_to_shard_extent(uint64_t off, uint64_t len);
142142

143+
// Synchronous vectored read of the local shard (defined in ECBackend.cc).
// `m` is taken by non-const reference because the definition consumes the
// requested extents and then refills `m` with the translated extents that
// were read; `bl` receives the data.
int objects_readv_sync(const hobject_t &hoid,
144+
std::map<uint64_t, uint64_t>& m,
145+
uint32_t op_flags,
146+
ceph::buffer::list *bl);
147+
143148
/**
144149
* Async read mechanism
145150
*
@@ -198,6 +203,14 @@ class ECBackend : public ECCommon {
198203

199204
void kick_reads();
200205

206+
// NOTE(review): this declaration is added by the commit, but no definition
// or call site for _objects_read_sync is visible in the changed hunks;
// confirm it is defined and used elsewhere, otherwise drop it.
int _objects_read_sync(
207+
const hobject_t &hoid,
208+
uint64_t off,
209+
uint64_t len,
210+
uint32_t op_flags,
211+
ceph::buffer::list *bl
212+
);
213+
201214
public:
202215
struct ECRecoveryBackend : RecoveryBackend {
203216
ECRecoveryBackend(CephContext *cct,

src/osd/ECSwitch.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,17 @@ class ECSwitch : public PGBackend
267267
return legacy.objects_read_sync(hoid, off, len, op_flags, bl);
268268
}
269269

270+
// Synchronous vectored read. Forwards to the optimized backend when
// is_optimized() is true. Otherwise it aborts, with no forwarding to
// `legacy` (objects_read_sync above does forward to it).
int objects_readv_sync(const hobject_t &hoid,
271+
std::map<uint64_t, uint64_t>& m,
272+
uint32_t op_flags,
273+
ceph::buffer::list *bl) override
274+
{
275+
if (is_optimized()) {
276+
return optimized.objects_readv_sync(hoid, m, op_flags, bl);
277+
}
278+
// Reached only when is_optimized() is false; there is no return after this
// call.
ceph_abort_msg("Sync reads legacy EC");
279+
}
280+
270281
std::pair<uint64_t, uint64_t> extent_to_shard_extent(
271282
uint64_t off, uint64_t len) override {
272283
if (is_optimized()) {

src/osd/PrimaryLogPG.cc

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5894,6 +5894,13 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
58945894
if (oi.is_data_digest() && op.extent.offset == 0 &&
58955895
op.extent.length >= oi.size)
58965896
maybe_crc = oi.data_digest;
5897+
5898+
if (ctx->op->ec_direct_read()) {
5899+
result = pgbackend->objects_read_sync(
5900+
soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5901+
5902+
dout(20) << " EC sync read for " << soid << " result=" << result << dendl;
5903+
} else {
58975904
ctx->pending_async_reads.push_back(
58985905
make_pair(
58995906
boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
@@ -5905,6 +5912,7 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
59055912

59065913
ctx->op_finishers[ctx->current_osd_subop_num].reset(
59075914
new ReadFinisher(osd_op));
5915+
}
59085916
} else {
59095917
int r = pgbackend->objects_read_sync(
59105918
soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
@@ -5964,7 +5972,7 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
59645972
}
59655973

59665974
++ctx->num_read;
5967-
if (pool.info.is_erasure()) {
5975+
if (pool.info.is_erasure() && !ctx->op->ec_direct_read()) {
59685976
// translate sparse read to a normal one if not supported
59695977

59705978
if (length > 0) {
@@ -5987,9 +5995,10 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
59875995
} else {
59885996
// read into a buffer
59895997
map<uint64_t, uint64_t> m;
5998+
auto [shard_offset, shard_length] = pgbackend->extent_to_shard_extent(offset, length);
59905999
int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
59916000
info.pgid.shard),
5992-
offset, length, m);
6001+
shard_offset, shard_length, m);
59936002
if (r < 0) {
59946003
return r;
59956004
}
@@ -6000,6 +6009,7 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
60006009
r = rep_repair_primary_object(soid, ctx);
60016010
}
60026011
if (r < 0) {
6012+
dout(10) << " sparse_read failed r=" << r << " from object " << soid << dendl;
60036013
return r;
60046014
}
60056015

0 commit comments

Comments
 (0)