Skip to content

Commit 1becd2c

Browse files
committed
osd: Improve backfill in new EC.
In old EC, the full stripe was always read and written. In new EC, we only attempt to recover the shards that were missing. If an old OSD is available, the read can be directed there. Signed-off-by: Alex Ainscow <[email protected]>
1 parent 8c92dcf commit 1becd2c

File tree

3 files changed

+49
-16
lines changed

3 files changed

+49
-16
lines changed

src/osd/ECBackend.cc

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push_reply(
296296
if (!recovery_ops.count(op.soid))
297297
return;
298298
RecoveryOp &rop = recovery_ops[op.soid];
299-
ceph_assert(rop.waiting_on_pushes.count(from));
299+
ceph_assert(rop.waiting_on_pushes.contains(from));
300300
rop.waiting_on_pushes.erase(from);
301301
continue_recovery_op(rop, m);
302302
}
@@ -377,10 +377,6 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
377377

378378
int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
379379
ceph_assert(r == 0);
380-
// We are never appending here, so we never need hinfo.
381-
op.returned_data->insert_parity_buffers();
382-
r = op.returned_data->encode(ec_impl, NULL, 0);
383-
ceph_assert(r==0);
384380

385381
// Finally, we don't want to write any padding, so truncate the buffer
386382
// to remove it.
@@ -538,12 +534,30 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
538534

539535
op.state = RecoveryOp::READING;
540536

541-
// We always read the recovery chunk size (default 8MiB + parity). If that
542-
// amount of data is not available, then the backend will truncate the
543-
// response.
537+
/* When beginning recovery, the OI may not be known. As such the object
538+
* size is not known. For the first read, attempt to read the default
539+
* size. If this is larger than the object size, then the OSD will
540+
* return truncated reads. If the object size is known, then attempt
541+
* correctly sized reads.
542+
*/
543+
uint64_t read_size = get_recovery_chunk_size();
544+
if (op.obc) {
545+
uint64_t read_to_end = ECUtil::align_next(op.obc->obs.oi.size) -
546+
op.recovery_progress.data_recovered_to;
547+
548+
if (read_to_end < read_size) {
549+
read_size = read_to_end;
550+
}
551+
}
544552
sinfo.ro_range_to_shard_extent_set_with_parity(
545-
op.recovery_progress.data_recovered_to,
546-
get_recovery_chunk_size(), want);
553+
op.recovery_progress.data_recovered_to, read_size, want);
554+
555+
op.recovery_progress.data_recovered_to += read_size;
556+
557+
// We only need to recover shards that are missing.
558+
for (auto shard : shard_id_set::difference(sinfo.get_all_shards(), op.missing_on_shards)) {
559+
want.erase(shard);
560+
}
547561

548562
if (op.recovery_progress.first && op.obc) {
549563
op.xattrs = op.obc->attr_cache;
@@ -593,9 +607,15 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
593607
}
594608
if (read_request.shard_reads.empty()) {
595609
ceph_assert(op.obc);
596-
ceph_assert(0 == op.obc->obs.oi.size);
597-
dout(10) << __func__ << "Zero size object recovery, skipping reads."
598-
<< op << dendl;
610+
/* This can happen for several reasons:
611+
* - A zero-sized object.
612+
* - The missing shards have no data.
613+
* - The previous recovery did not need the last data shard. In this
614+
* case, data_recovered_to may indicate that the last shard still
615+
* needs recovery, when it does not.
616+
* We can just skip the read and fall through below.
617+
*/
618+
dout(10) << __func__ << " No reads required " << op << dendl;
599619
// Create an empty read result and fall through.
600620
op.returned_data.emplace(&sinfo);
601621
} else {
@@ -614,7 +634,6 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
614634
dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
615635
op.state = RecoveryOp::WRITING;
616636
ObjectRecoveryProgress after_progress = op.recovery_progress;
617-
after_progress.data_recovered_to = op.returned_data->get_ro_end();
618637
after_progress.first = false;
619638
if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
620639
after_progress.data_complete = true;

src/osd/ECBackend.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ class ECBackend : public ECCommon {
252252
hobject_t hoid;
253253
eversion_t v;
254254
std::set<pg_shard_t> missing_on;
255-
std::set<shard_id_t> missing_on_shards;
255+
shard_id_set missing_on_shards;
256256

257257
ObjectRecoveryInfo recovery_info;
258258
ObjectRecoveryProgress recovery_progress;

src/osd/ECUtil.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ class stripe_info_t {
358358
const std::vector<raw_shard_id_t> chunk_mapping_reverse;
359359
const shard_id_set data_shards;
360360
const shard_id_set parity_shards;
361+
const shard_id_set all_shards;
361362

362363
private:
363364
void ro_range_to_shards(
@@ -410,6 +411,13 @@ class stripe_info_t {
410411
return data_shards;
411412
}
412413

414+
static shard_id_set calc_all_shards(int k_plus_m) {
415+
shard_id_set all_shards;
416+
all_shards.insert_range(shard_id_t(), k_plus_m);
417+
return all_shards;
418+
}
419+
420+
413421
public:
414422
stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool,
415423
uint64_t stripe_width
@@ -424,7 +432,8 @@ class stripe_info_t {
424432
complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)),
425433
chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
426434
data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
427-
parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
435+
parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)),
436+
all_shards(calc_all_shards(k + m)) {
428437
ceph_assert(stripe_width != 0);
429438
ceph_assert(stripe_width % k == 0);
430439
}
@@ -601,6 +610,11 @@ class stripe_info_t {
601610
return parity_shards;
602611
}
603612

613+
auto get_all_shards() const {
614+
return all_shards;
615+
}
616+
617+
604618
uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const {
605619
return (offset / stripe_width) * chunk_size;
606620
}

0 commit comments

Comments
 (0)