@@ -296,7 +296,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push_reply(
   if (!recovery_ops.count(op.soid))
     return;
   RecoveryOp &rop = recovery_ops[op.soid];
-  ceph_assert(rop.waiting_on_pushes.count(from));
+  ceph_assert(rop.waiting_on_pushes.contains(from));
   rop.waiting_on_pushes.erase(from);
   continue_recovery_op(rop, m);
 }
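
This hunk swaps a membership test from count() to contains(). A minimal sketch
of the distinction, assuming C++20 and using std::set<int> as a stand-in for
the actual type of waiting_on_pushes:

    #include <cassert>
    #include <set>

    int main() {
      std::set<int> waiting_on_pushes{1, 3};
      // count() returns a size_type; contains() (C++20) returns bool and
      // states the membership intent directly. The cost is the same here.
      assert(waiting_on_pushes.count(3) == 1);
      assert(waiting_on_pushes.contains(3));
      waiting_on_pushes.erase(3);
      assert(!waiting_on_pushes.contains(3));
    }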
@@ -377,10 +377,6 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
 
   int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
   ceph_assert(r == 0);
-  // We are never appending here, so we never need hinfo.
-  op.returned_data->insert_parity_buffers();
-  r = op.returned_data->encode(ec_impl, NULL, 0);
-  ceph_assert(r==0);
 
   // Finally, we don't want to write any padding, so truncate the buffer
   // to remove it.
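
The removed pass re-encoded parity after the decode. If decode() is asked for
the parity shards directly (as shard_want_to_read suggests), a separate
insert_parity_buffers()/encode() pass is redundant. A toy XOR parity example,
purely illustrative and not Ceph's actual erasure code, of why reconstructed
and re-encoded parity coincide:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t d0 = 0xA5, d1 = 0x3C;
      uint8_t p = d0 ^ d1;       // encode: one parity chunk
      uint8_t rec = p ^ d1;      // decode the "lost" d0 from the survivors
      assert(rec == d0);
      assert((rec ^ d1) == p);   // re-encoding reproduces identical parity
    }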
@@ -538,12 +534,30 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
 
   op.state = RecoveryOp::READING;
 
-  // We always read the recovery chunk size (default 8MiB + parity). If that
-  // amount of data is not available, then the backend will truncate the
-  // response.
+  /* When beginning recovery, the OI may not be known. As such, the object
+   * size is not known. For the first read, attempt to read the default
+   * size. If this is larger than the object size, then the OSD will
+   * return truncated reads. If the object size is known, then attempt
+   * correctly sized reads.
+   */
+  uint64_t read_size = get_recovery_chunk_size();
+  if (op.obc) {
+    uint64_t read_to_end = ECUtil::align_next(op.obc->obs.oi.size) -
+      op.recovery_progress.data_recovered_to;
+
+    if (read_to_end < read_size) {
+      read_size = read_to_end;
+    }
+  }
   sinfo.ro_range_to_shard_extent_set_with_parity(
-    op.recovery_progress.data_recovered_to,
-    get_recovery_chunk_size(), want);
+    op.recovery_progress.data_recovered_to, read_size, want);
+
+  op.recovery_progress.data_recovered_to += read_size;
+
+  // We only need to recover shards that are missing.
+  for (auto shard : shard_id_set::difference(sinfo.get_all_shards(), op.missing_on_shards)) {
+    want.erase(shard);
+  }
 
   if (op.recovery_progress.first && op.obc) {
     op.xattrs = op.obc->attr_cache;
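
The clamp above in concrete numbers: a second recovery pass over a 10 MiB
object reads only the 2 MiB remainder. A self-contained sketch; align_next
here is a stand-in for ECUtil::align_next with an assumed 4 KiB alignment,
not Ceph's actual stripe layout:

    #include <cassert>
    #include <cstdint>

    // Stand-in for ECUtil::align_next: round up to the next aligned offset.
    static uint64_t align_next(uint64_t x, uint64_t align = 4096) {
      return (x + align - 1) / align * align;
    }

    int main() {
      const uint64_t chunk = 8ull << 20;   // default recovery chunk (8 MiB)
      uint64_t object_size = 10ull << 20;  // known from the OI
      uint64_t recovered_to = 8ull << 20;  // first pass already read 8 MiB

      // Same clamp as the diff: never request past the aligned object end.
      uint64_t read_size = chunk;
      uint64_t read_to_end = align_next(object_size) - recovered_to;
      if (read_to_end < read_size)
        read_size = read_to_end;

      assert(read_size == (2ull << 20));   // second pass reads only 2 MiB
      recovered_to += read_size;           // progress advances at read time
      assert(recovered_to == align_next(object_size));
    }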
@@ -593,9 +607,15 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
 }
 if (read_request.shard_reads.empty()) {
   ceph_assert(op.obc);
-  ceph_assert(0 == op.obc->obs.oi.size);
-  dout(10) << __func__ << " Zero size object recovery, skipping reads."
-           << op << dendl;
+  /* This can happen for several reasons:
+   * - A zero-sized object.
+   * - The missing shards have no data.
+   * - The previous recovery did not need the last data shard. In this
+   *   case, data_recovered_to may indicate that the last shard still
+   *   needs recovery, when it does not.
+   * We can just skip the read and fall through below.
+   */
+  dout(10) << __func__ << " No reads required " << op << dendl;
   // Create an empty read result and fall through.
   op.returned_data.emplace(&sinfo);
 } else {
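
The want filter added in the previous hunk is one way shard_reads can end up
empty: if none of the shards this pass would read is actually missing,
everything is erased from want. A sketch of that filter, with std::set<int>
standing in for shard_id_set and plain ints for shard ids:

    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <set>

    int main() {
      std::set<int> all_shards{0, 1, 2, 3};
      std::set<int> missing_on{2};
      std::set<int> want = all_shards;   // shards the read would fetch

      // Equivalent of shard_id_set::difference(all, missing): healthy
      // shards need no recovery, so drop them from the read set.
      std::set<int> healthy;
      std::set_difference(all_shards.begin(), all_shards.end(),
                          missing_on.begin(), missing_on.end(),
                          std::inserter(healthy, healthy.end()));
      for (int shard : healthy)
        want.erase(shard);

      assert(want == std::set<int>({2}));  // only the missing shard remains
    }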
@@ -614,7 +634,6 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
   dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
   op.state = RecoveryOp::WRITING;
   ObjectRecoveryProgress after_progress = op.recovery_progress;
-  after_progress.data_recovered_to = op.returned_data->get_ro_end();
   after_progress.first = false;
   if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
     after_progress.data_complete = true;
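
The removed overwrite of data_recovered_to is presumably redundant now that
progress is advanced when the read is issued and the request is clamped to
the object end. A toy check of that equivalence; the sizes are illustrative
assumptions:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t chunk = 8ull << 20;
      const uint64_t object_size = 5ull << 20;  // smaller than one chunk

      // With the clamp from the earlier hunk, the requested read already
      // stops at the object end, so advancing progress at read time...
      uint64_t read_size = std::min(chunk, object_size);
      uint64_t data_recovered_to = read_size;

      // ...matches what get_ro_end() on the truncated returned data would
      // have reported, making the removed overwrite redundant.
      uint64_t returned_ro_end = object_size;
      assert(data_recovered_to == returned_ro_end);
      assert(data_recovered_to >= object_size);  // -> data_complete = true
    }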