Skip to content

Commit f25386f

Browse files
authored
Merge pull request ceph#55196 from rzarzynski/wip-osd-ec-partial-reads
osd: EC Partial Stripe Reads (Retry of ceph#23138 and ceph#52746) Reviewed-by: Samuel Just <[email protected]>
2 parents 82c91f6 + a1084f5 commit f25386f

26 files changed

+667
-154
lines changed

qa/standalone/erasure-code/test-erasure-eio.sh

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,19 @@ function rados_put_get_data() {
178178
wait_for_clean || return 1
179179
# Won't check for eio on get here -- recovery above might have fixed it
180180
else
181-
shard_id=$(expr $shard_id + 1)
182-
inject_$inject ec data $poolname $objname $dir $shard_id || return 1
183-
rados_get $dir $poolname $objname fail || return 1
181+
local another_shard_id=$(expr $shard_id + 1)
182+
inject_$inject ec data $poolname $objname $dir $another_shard_id || return 1
183+
if [ $shard_id -eq 1 -a $another_shard_id -eq 2 ];
184+
then
185+
# we're reading 4 kb long object while the stripe size is 8 kb.
186+
# as we do partial reads and this request can be satisfied
187+
# from the undamaged shard 0, we expect a success.
188+
rados_get $dir $poolname $objname || return 1
189+
else
190+
# both shards 0 and 1 are damaged. there is no way to serve
191+
# the requests, regardless of partial reads
192+
rados_get $dir $poolname $objname fail || return 1
193+
fi
184194
rm $dir/ORIGINAL
185195
fi
186196

@@ -238,9 +248,19 @@ function rados_get_data_bad_size() {
238248
rados_get $dir $poolname $objname || return 1
239249

240250
# Leave objname and modify another shard
241-
shard_id=$(expr $shard_id + 1)
242-
set_size $objname $dir $shard_id $bytes $mode || return 1
243-
rados_get $dir $poolname $objname fail || return 1
251+
local another_shard_id=$(expr $shard_id + 1)
252+
set_size $objname $dir $another_shard_id $bytes $mode || return 1
253+
if [ $shard_id -eq 1 -a $another_shard_id -eq 2 ];
254+
then
255+
# we're reading 4 kb long object while the stripe size is 8 kb.
256+
# as we do partial reads and this request can be satisfied
257+
# from the undamaged shard 0, we expect a success.
258+
rados_get $dir $poolname $objname || return 1
259+
else
260+
# both shards 0 and 1 are damaged. there is no way to serve
261+
# the requests, regardless of partial reads
262+
rados_get $dir $poolname $objname fail || return 1
263+
fi
244264
rm $dir/ORIGINAL
245265
}
246266

src/common/options/osd.yaml.in

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,6 +1234,11 @@ options:
12341234
level: advanced
12351235
default: false
12361236
with_legacy: true
1237+
- name: osd_ec_partial_reads
1238+
type: bool
1239+
level: advanced
1240+
default: true
1241+
with_legacy: true
12371242
- name: osd_recovery_delay_start
12381243
type: float
12391244
level: advanced

src/erasure-code/ErasureCode.cc

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -348,21 +348,35 @@ int ErasureCode::to_string(const std::string &name,
348348
return 0;
349349
}
350350

351-
int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
351+
int ErasureCode::decode_concat(const set<int>& want_to_read,
352+
const map<int, bufferlist> &chunks,
352353
bufferlist *decoded)
353354
{
354-
set<int> want_to_read;
355-
356-
for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
357-
want_to_read.insert(chunk_index(i));
358-
}
359355
map<int, bufferlist> decoded_map;
360356
int r = _decode(want_to_read, chunks, &decoded_map);
361357
if (r == 0) {
362358
for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
363-
decoded->claim_append(decoded_map[chunk_index(i)]);
359+
// XXX: the ErasureCodeInterface allows `decode()` to return
360+
// *at least* `want_to_read` chunks; that is, there may be more.
361+
// Some implementations are consistently exact but jerasure
362+
// is quirky: it outputs more only when dealing with degraded.
363+
// The check below unifies the behavior.
364+
if (want_to_read.contains(chunk_index(i)) &&
365+
decoded_map.contains(chunk_index(i))) {
366+
decoded->claim_append(decoded_map[chunk_index(i)]);
367+
}
364368
}
365369
}
366370
return r;
367371
}
372+
373+
int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
374+
bufferlist *decoded)
375+
{
376+
set<int> want_to_read;
377+
for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
378+
want_to_read.insert(chunk_index(i));
379+
}
380+
return decode_concat(want_to_read, chunks, decoded);
381+
}
368382
}

src/erasure-code/ErasureCode.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,11 @@ namespace ceph {
112112
const std::string &default_value,
113113
std::ostream *ss);
114114

115+
int decode_concat(const std::set<int>& want_to_read,
116+
const std::map<int, bufferlist> &chunks,
117+
bufferlist *decoded) override;
115118
int decode_concat(const std::map<int, bufferlist> &chunks,
116-
bufferlist *decoded) override;
119+
bufferlist *decoded) override;
117120

118121
protected:
119122
int parse(const ErasureCodeProfile &profile,

src/erasure-code/ErasureCodeInterface.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -453,12 +453,20 @@ namespace ceph {
453453
*
454454
* Returns 0 on success.
455455
*
456-
* @param [in] chunks map chunk indexes to chunk data
457-
* @param [out] decoded concatenante of the data chunks
456+
* @param [in] want_to_read mapped std::set of chunks caller wants
457+
* concatenated to `decoded`. This works as
458+
* selectors for `chunks`
459+
* @param [in] chunks set of chunks with data available for decoding
460+
* @param [out] decoded must be non-null, chunks specified in `want_to_read`
461+
* will be concatenated into `decoded` in index order
458462
* @return **0** on success or a negative errno on error.
459463
*/
464+
virtual int decode_concat(const std::set<int>& want_to_read,
465+
const std::map<int, bufferlist> &chunks,
466+
bufferlist *decoded) = 0;
460467
virtual int decode_concat(const std::map<int, bufferlist> &chunks,
461468
bufferlist *decoded) = 0;
469+
462470
};
463471

464472
typedef std::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;

src/erasure-code/clay/ErasureCodeClay.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,14 @@ int ErasureCodeClay::is_repair(const set<int> &want_to_read,
306306

307307
if (includes(available_chunks.begin(), available_chunks.end(),
308308
want_to_read.begin(), want_to_read.end())) return 0;
309+
// Oops, before the attempt at EC partial reads the following
310+
// condition was always true as `get_want_to_read_shards()` yields
311+
// entire stripe. Unfortunately, we built upon this assumption and
312+
// even `ECUtil::decode()` asserts on chunks being a multiple of
313+
// `chunk_size`.
314+
// XXX: for now returning 0 and knocking the optimization out.
309315
if (want_to_read.size() > 1) return 0;
316+
else return 0;
310317

311318
int i = *want_to_read.begin();
312319
int lost_node_id = (i < k) ? i: i+nu;

src/osd/ECBackend.cc

Lines changed: 56 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,8 @@ struct RecoveryMessages {
195195
const map<pg_shard_t, vector<pair<int, int>>> &need,
196196
bool attrs)
197197
{
198-
list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
199-
to_read.push_back(boost::make_tuple(off, len, 0));
198+
list<ECCommon::ec_align_t> to_read;
199+
to_read.emplace_back(ECCommon::ec_align_t{off, len, 0});
200200
ceph_assert(!recovery_reads.count(hoid));
201201
want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
202202
recovery_reads.insert(
@@ -463,7 +463,8 @@ struct RecoveryReadCompleter : ECCommon::ReadCompleter {
463463
void finish_single_request(
464464
const hobject_t &hoid,
465465
ECCommon::read_result_t &res,
466-
list<boost::tuple<uint64_t, uint64_t, uint32_t> >) override
466+
list<ECCommon::ec_align_t>,
467+
set<int> wanted_to_read) override
467468
{
468469
if (!(res.r == 0 && res.errors.empty())) {
469470
backend._failed_push(hoid, res);
@@ -1028,17 +1029,19 @@ void ECBackend::handle_sub_read(
10281029
if ((op.subchunks.find(i->first)->second.size() == 1) &&
10291030
(op.subchunks.find(i->first)->second.front().second ==
10301031
ec_impl->get_sub_chunk_count())) {
1031-
dout(25) << __func__ << " case1: reading the complete chunk/shard." << dendl;
1032+
dout(20) << __func__ << " case1: reading the complete chunk/shard." << dendl;
10321033
r = store->read(
10331034
ch,
10341035
ghobject_t(i->first, ghobject_t::NO_GEN, shard),
10351036
j->get<0>(),
10361037
j->get<1>(),
10371038
bl, j->get<2>()); // Allow EIO return
10381039
} else {
1039-
dout(25) << __func__ << " case2: going to do fragmented read." << dendl;
10401040
int subchunk_size =
10411041
sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
1042+
dout(20) << __func__ << " case2: going to do fragmented read;"
1043+
<< " subchunk_size=" << subchunk_size
1044+
<< " chunk_size=" << sinfo.get_chunk_size() << dendl;
10421045
bool error = false;
10431046
for (int m = 0; m < (int)j->get<1>() && !error;
10441047
m += sinfo.get_chunk_size()) {
@@ -1214,7 +1217,7 @@ void ECBackend::handle_sub_read_reply(
12141217
dout(20) << __func__ << " to_read skipping" << dendl;
12151218
continue;
12161219
}
1217-
list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator req_iter =
1220+
list<ec_align_t>::const_iterator req_iter =
12181221
rop.to_read.find(i->first)->second.to_read.begin();
12191222
list<
12201223
boost::tuple<
@@ -1225,10 +1228,10 @@ void ECBackend::handle_sub_read_reply(
12251228
++j, ++req_iter, ++riter) {
12261229
ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
12271230
ceph_assert(riter != rop.complete[i->first].returned.end());
1228-
pair<uint64_t, uint64_t> adjusted =
1229-
sinfo.aligned_offset_len_to_chunk(
1230-
make_pair(req_iter->get<0>(), req_iter->get<1>()));
1231-
ceph_assert(adjusted.first == j->first);
1231+
pair<uint64_t, uint64_t> aligned =
1232+
sinfo.chunk_aligned_offset_len_to_chunk(
1233+
make_pair(req_iter->offset, req_iter->size));
1234+
ceph_assert(aligned.first == j->first);
12321235
riter->get<2>()[from] = std::move(j->second);
12331236
}
12341237
}
@@ -1529,62 +1532,70 @@ int ECBackend::objects_read_sync(
15291532
return -EOPNOTSUPP;
15301533
}
15311534

1535+
static bool should_partial_read(
1536+
const ECUtil::stripe_info_t& sinfo,
1537+
uint64_t off,
1538+
uint32_t len,
1539+
bool fast_read)
1540+
{
1541+
// Don't partial read if we are doing a fast_read
1542+
if (fast_read) {
1543+
return false;
1544+
}
1545+
// Same stripe only
1546+
return sinfo.offset_length_is_same_stripe(off, len);
1547+
}
1548+
15321549
void ECBackend::objects_read_async(
15331550
const hobject_t &hoid,
1534-
const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
1535-
pair<bufferlist*, Context*> > > &to_read,
1551+
const list<pair<ECCommon::ec_align_t,
1552+
pair<bufferlist*, Context*>>> &to_read,
15361553
Context *on_complete,
15371554
bool fast_read)
15381555
{
1539-
map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > >
1540-
reads;
1556+
map<hobject_t,std::list<ec_align_t>> reads;
15411557

15421558
uint32_t flags = 0;
15431559
extent_set es;
1544-
for (list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
1545-
pair<bufferlist*, Context*> > >::const_iterator i =
1546-
to_read.begin();
1547-
i != to_read.end();
1548-
++i) {
1549-
pair<uint64_t, uint64_t> tmp =
1550-
sinfo.offset_len_to_stripe_bounds(
1551-
make_pair(i->first.get<0>(), i->first.get<1>()));
1552-
1560+
for (const auto& [read, ctx] : to_read) {
1561+
pair<uint64_t, uint64_t> tmp;
1562+
if (!cct->_conf->osd_ec_partial_reads ||
1563+
!should_partial_read(sinfo, read.offset, read.size, fast_read)) {
1564+
tmp = sinfo.offset_len_to_stripe_bounds(make_pair(read.offset, read.size));
1565+
} else {
1566+
tmp = sinfo.offset_len_to_chunk_bounds(make_pair(read.offset, read.size));
1567+
}
15531568
es.union_insert(tmp.first, tmp.second);
1554-
flags |= i->first.get<2>();
1569+
flags |= read.flags;
15551570
}
15561571

15571572
if (!es.empty()) {
15581573
auto &offsets = reads[hoid];
15591574
for (auto j = es.begin();
15601575
j != es.end();
15611576
++j) {
1562-
offsets.push_back(
1563-
boost::make_tuple(
1564-
j.get_start(),
1565-
j.get_len(),
1566-
flags));
1577+
offsets.emplace_back(ec_align_t{j.get_start(), j.get_len(), flags});
15671578
}
15681579
}
15691580

15701581
struct cb {
15711582
ECBackend *ec;
15721583
hobject_t hoid;
1573-
list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
1584+
list<pair<ECCommon::ec_align_t,
15741585
pair<bufferlist*, Context*> > > to_read;
15751586
unique_ptr<Context> on_complete;
15761587
cb(const cb&) = delete;
15771588
cb(cb &&) = default;
15781589
cb(ECBackend *ec,
15791590
const hobject_t &hoid,
1580-
const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
1591+
const list<pair<ECCommon::ec_align_t,
15811592
pair<bufferlist*, Context*> > > &to_read,
15821593
Context *on_complete)
15831594
: ec(ec),
15841595
hoid(hoid),
15851596
to_read(to_read),
15861597
on_complete(on_complete) {}
1587-
void operator()(map<hobject_t,pair<int, extent_map> > &&results) {
1598+
void operator()(ECCommon::ec_extents_t &&results) {
15881599
auto dpp = ec->get_parent()->get_dpp();
15891600
ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results
15901601
<< dendl;
@@ -1595,24 +1606,24 @@ void ECBackend::objects_read_async(
15951606

15961607
int r = 0;
15971608
for (auto &&read: to_read) {
1598-
if (got.first < 0) {
1609+
if (got.err < 0) {
15991610
// error handling
16001611
if (read.second.second) {
1601-
read.second.second->complete(got.first);
1612+
read.second.second->complete(got.err);
16021613
}
16031614
if (r == 0)
1604-
r = got.first;
1615+
r = got.err;
16051616
} else {
16061617
ceph_assert(read.second.first);
1607-
uint64_t offset = read.first.get<0>();
1608-
uint64_t length = read.first.get<1>();
1609-
auto range = got.second.get_containing_range(offset, length);
1618+
uint64_t offset = read.first.offset;
1619+
uint64_t length = read.first.size;
1620+
auto range = got.emap.get_containing_range(offset, length);
16101621
ceph_assert(range.first != range.second);
16111622
ceph_assert(range.first.get_off() <= offset);
1612-
ldpp_dout(dpp, 30) << "offset: " << offset << dendl;
1613-
ldpp_dout(dpp, 30) << "range offset: " << range.first.get_off() << dendl;
1614-
ldpp_dout(dpp, 30) << "length: " << length << dendl;
1615-
ldpp_dout(dpp, 30) << "range length: " << range.first.get_len() << dendl;
1623+
ldpp_dout(dpp, 20) << "offset: " << offset << dendl;
1624+
ldpp_dout(dpp, 20) << "range offset: " << range.first.get_off() << dendl;
1625+
ldpp_dout(dpp, 20) << "length: " << length << dendl;
1626+
ldpp_dout(dpp, 20) << "range length: " << range.first.get_len() << dendl;
16161627
ceph_assert(
16171628
(offset + length) <=
16181629
(range.first.get_off() + range.first.get_len()));
@@ -1642,7 +1653,7 @@ void ECBackend::objects_read_async(
16421653
reads,
16431654
fast_read,
16441655
make_gen_lambda_context<
1645-
map<hobject_t,pair<int, extent_map> > &&, cb>(
1656+
ECCommon::ec_extents_t &&, cb>(
16461657
cb(this,
16471658
hoid,
16481659
to_read,
@@ -1651,10 +1662,10 @@ void ECBackend::objects_read_async(
16511662

16521663
void ECBackend::objects_read_and_reconstruct(
16531664
const map<hobject_t,
1654-
std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
1665+
std::list<ECBackend::ec_align_t>
16551666
> &reads,
16561667
bool fast_read,
1657-
GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func)
1668+
GenContextURef<ECCommon::ec_extents_t &&> &&func)
16581669
{
16591670
return read_pipeline.objects_read_and_reconstruct(
16601671
reads, fast_read, std::move(func));

src/osd/ECBackend.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,14 @@ class ECBackend : public PGBackend, public ECCommon {
141141
* check_recovery_sources.
142142
*/
143143
void objects_read_and_reconstruct(
144-
const std::map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
145-
> &reads,
144+
const std::map<hobject_t, std::list<ECCommon::ec_align_t>> &reads,
146145
bool fast_read,
147-
GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> &&func) override;
146+
GenContextURef<ECCommon::ec_extents_t &&> &&func) override;
148147

149148
void objects_read_async(
150149
const hobject_t &hoid,
151-
const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
152-
std::pair<ceph::buffer::list*, Context*> > > &to_read,
150+
const std::list<std::pair<ECCommon::ec_align_t,
151+
std::pair<ceph::buffer::list*, Context*>>> &to_read,
153152
Context *on_complete,
154153
bool fast_read = false) override;
155154

0 commit comments

Comments
 (0)