@@ -467,6 +467,7 @@ void PrimaryLogPG::on_local_recover(
         dout(20) << " kicking unreadable waiters on " << hoid << dendl;
         requeue_ops(unreadable_object_entry->second);
         waiting_for_unreadable_object.erase(unreadable_object_entry);
+        finish_unreadable_object(unreadable_object_entry->first);
       }
     }
   } else {
@@ -520,6 +521,7 @@ void PrimaryLogPG::on_global_recover(
     waiting_for_unreadable_object.erase(unreadable_object_entry);
   }
   finish_degraded_object(soid);
+  finish_unreadable_object(soid);
 }
 
 void PrimaryLogPG::schedule_recovery_work(
@@ -747,6 +749,18 @@ void PrimaryLogPG::block_write_on_degraded_snap(
   wait_for_degraded_object(snap, op);
 }
 
+void PrimaryLogPG::block_write_on_unreadable_snap(
+  const hobject_t& snap, OpRequestRef op)
+{
+  dout(20) << __func__ << ": blocking object " << snap.get_head()
+           << " on unreadable snap " << snap << dendl;
+  // otherwise, we'd have blocked in do_op
+  ceph_assert(objects_blocked_on_unreadable_snap.count(snap.get_head()) == 0);
+  objects_blocked_on_unreadable_snap[snap.get_head()] = snap.snap;
+  // the op must be queued before calling block_write_on_unreadable_snap
+  ceph_assert(waiting_for_unreadable_object.count(snap) == 1);
+}
+
 bool PrimaryLogPG::maybe_await_blocked_head(
   const hobject_t &hoid,
   OpRequestRef op)
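For orientation, here is a minimal, self-contained sketch (not the commit's code) of the bookkeeping this hunk introduces. hobject_t and snapid_t are simplified stand-ins for the real Ceph types, and the op/recovery plumbing is omitted:

// Sketch of the objects_blocked_on_unreadable_snap bookkeeping: a head
// object is blocked on the snap of an unreadable clone, and unblocked
// once that exact snap recovers. Compile with -std=c++17.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <tuple>

using snapid_t = uint64_t;
constexpr snapid_t NOSNAP = ~0ull;  // stand-in for CEPH_NOSNAP

struct hobject_t {
  std::string name;
  snapid_t snap = NOSNAP;
  hobject_t get_head() const { return {name, NOSNAP}; }
  bool operator<(const hobject_t& o) const {
    return std::tie(name, snap) < std::tie(o.name, o.snap);
  }
};

// head object -> snap of the unreadable clone it is blocked on
std::map<hobject_t, snapid_t> objects_blocked_on_unreadable_snap;

void block_write_on_unreadable_snap(const hobject_t& snap_obj) {
  // a head may be blocked on at most one unreadable snap at a time
  assert(objects_blocked_on_unreadable_snap.count(snap_obj.get_head()) == 0);
  objects_blocked_on_unreadable_snap[snap_obj.get_head()] = snap_obj.snap;
}

void finish_unreadable_object(const hobject_t& oid) {
  auto i = objects_blocked_on_unreadable_snap.find(oid.get_head());
  // unblock only if the recovered snap is the one the head is blocked on
  if (i != objects_blocked_on_unreadable_snap.end() && i->second == oid.snap)
    objects_blocked_on_unreadable_snap.erase(i);
}

int main() {
  hobject_t clone{"obj1", 4};            // clone at snap 4 is unreadable
  block_write_on_unreadable_snap(clone);
  finish_unreadable_object(clone);       // recovery of snap 4 completed
  assert(objects_blocked_on_unreadable_snap.empty());
  std::cout << "head unblocked\n";
}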
@@ -2196,6 +2210,14 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
     return;
   }
 
+  if (auto blocked_iter = objects_blocked_on_unreadable_snap.find(head);
+      blocked_iter != std::end(objects_blocked_on_unreadable_snap)) {
+    hobject_t to_wait_on(head);
+    to_wait_on.snap = blocked_iter->second;
+    wait_for_unreadable_object(to_wait_on, op);
+    return;
+  }
+
   // blocked on snap?
   if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
       blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
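The do_op check above parks the op on the unreadable clone's wait list; the on_local_recover/on_global_recover hunks drain that list. A minimal sketch of this wait-and-requeue flow, with std::string standing in for the real OpRequestRef and hobject_t types:

// Sketch of the waiting_for_unreadable_object plumbing: ops blocked on an
// unreadable object are queued per object and requeued once it recovers.
#include <iostream>
#include <list>
#include <map>
#include <string>

using OpRequestRef = std::string;  // stand-in: an op is just a label here
using hobject_t = std::string;     // stand-in: an object is just a name here

// per-object queues of ops blocked on unreadable data
std::map<hobject_t, std::list<OpRequestRef>> waiting_for_unreadable_object;

void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op) {
  waiting_for_unreadable_object[oid].push_back(std::move(op));
}

// analogue of the on_local_recover/on_global_recover paths above:
// once oid is readable again, requeue everything parked on it
void on_recovered(const hobject_t& oid) {
  auto it = waiting_for_unreadable_object.find(oid);
  if (it == waiting_for_unreadable_object.end())
    return;
  for (const auto& op : it->second)
    std::cout << "requeue " << op << " for " << oid << "\n";
  waiting_for_unreadable_object.erase(it);
}

int main() {
  wait_for_unreadable_object("obj1/snap4", "write-op-1");
  wait_for_unreadable_object("obj1/snap4", "write-op-2");
  on_recovered("obj1/snap4");  // both ops requeued in arrival order
}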
@@ -3468,47 +3490,56 @@ int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_o
   return cnt;
 }
 
-bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
+snapid_t PrimaryLogPG::do_recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
 {
-  if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
-    return false;
-  }
-  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
-  bool has_manifest_op = std::any_of(
-    begin(m->ops),
-    end(m->ops),
-    [](const auto& osd_op) {
-      return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
-    });
-  if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
-    return false;
-  }
   ceph_assert(op);
-
   const SnapSet& snapset = obc->ssc->snapset;
   auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
-  auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
+  auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> snapid_t {
     hobject_t cid = obc->obs.oi.soid;
     cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
     if (is_unreadable_object(cid)) {
       dout(10) << __func__ << ": clone " << cid
                << " is unreadable, waiting" << dendl;
       wait_for_unreadable_object(cid, op);
-      return true;
+      return cid.snap;
     }
-    return false;
+    return snapid_t();
   };
   if (s != snapset.clones.begin()) {
-    if (is_unreadable_snap(s - 1)) {
-      return true;
+    snapid_t snap = is_unreadable_snap(s - 1);
+    if (snap != snapid_t()) {
+      return snap;
     }
   }
   if (s != snapset.clones.end()) {
-    if (is_unreadable_snap(s + 1)) {
-      return true;
+    snapid_t snap = is_unreadable_snap(s + 1);
+    if (snap != snapid_t()) {
+      return snap;
     }
   }
-  return false;
+  return snapid_t();
+}
+
+bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
+{
+  if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
+    return false;
+  }
+  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+  bool has_manifest_op = false;
+  for (auto& osd_op : m->ops) {
+    if (osd_op.op.op == CEPH_OSD_OP_ROLLBACK) {
+      return false;
+    } else if (osd_op.op.op == CEPH_OSD_OP_SET_CHUNK) {
+      has_manifest_op = true;
+      break;
+    }
+  }
+  if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
+    return false;
+  }
+  return do_recover_adjacent_clones(obc, op) != snapid_t();
 }
 
 ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
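The refactor above splits the old boolean helper so callers can learn which snap is unreadable; a default-constructed snapid_t serves as the "nothing to wait for" sentinel. A simplified sketch of that convention (unlike the real code, this skips the CEPH_NOSNAP/head case that the lambda handles when iter == clones.end()):

// Sketch of the sentinel convention: snapid_t() means "both adjacent
// clones are readable"; any other value names the snap to wait on.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using snapid_t = uint64_t;  // simplified stand-in; snapid_t() == 0

// hypothetical readability oracle, standing in for is_unreadable_object()
bool is_unreadable(snapid_t s) { return s == 4; }

// returns the unreadable neighbour of `snap` in the sorted clone list,
// or snapid_t() if both neighbours are readable
snapid_t find_unreadable_neighbour(const std::vector<snapid_t>& clones,
                                   snapid_t snap) {
  auto s = std::find(clones.begin(), clones.end(), snap);
  if (s != clones.begin() && is_unreadable(*(s - 1)))
    return *(s - 1);
  if (s != clones.end() && s + 1 != clones.end() && is_unreadable(*(s + 1)))
    return *(s + 1);
  return snapid_t();  // sentinel: nothing to wait for
}

int main() {
  std::vector<snapid_t> clones{2, 4, 6};
  std::cout << find_unreadable_neighbour(clones, 6) << "\n";  // 4 (older neighbour)
  std::cout << find_unreadable_neighbour(clones, 2) << "\n";  // 4 (newer neighbour)
}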
@@ -8291,6 +8322,40 @@ int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op)
     block_write_on_degraded_snap(missing_oid, ctx->op);
     return ret;
   }
+  /*
+   * In rollback, if the head object is not a manifest and rollback_to is a
+   * manifest, the head object will become a manifest object. At that point
+   * we need to check the clones adjacent to the head object in order to
+   * calculate correct reference counts for deduped chunks, because the head
+   * object is now a manifest. The reverse also holds: the head object is a
+   * manifest, but rollback_to is not.
+   * Therefore, the following lines insert the op into the waiting queue,
+   * to wait until the unreadable object is recovered, if either adjacent
+   * clone is unreadable, so that chunk references can be calculated.
+   */
+  auto block_write_if_unreadable = [this](ObjectContextRef obc, OpRequestRef op) {
+    snapid_t sid = do_recover_adjacent_clones(obc, op);
+    if (sid != snapid_t()) {
+      hobject_t oid = obc->obs.oi.soid;
+      oid.snap = sid;
+      block_write_on_unreadable_snap(oid, op);
+      return -EAGAIN;
+    }
+    return 0;
+  };
+  if (oi.has_manifest() && oi.manifest.is_chunked()) {
+    int r = block_write_if_unreadable(ctx->obc, ctx->op);
+    if (r < 0) {
+      return r;
+    }
+  }
+  if (rollback_to && rollback_to->obs.oi.has_manifest() &&
+      rollback_to->obs.oi.manifest.is_chunked()) {
+    int r = block_write_if_unreadable(rollback_to, ctx->op);
+    if (r < 0) {
+      return r;
+    }
+  }
   {
     ObjectContextRef promote_obc;
     cache_result_t tier_mode_result;
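The lambda follows a check-block-retry shape: if an adjacent clone is unreadable, record the block and bounce the caller with -EAGAIN; the op was already queued by wait_for_unreadable_object and is retried after recovery. A minimal sketch of that shape, with a hypothetical probe_adjacent_clones() standing in for do_recover_adjacent_clones():

// Sketch of the check-block-retry pattern used in _rollback_to.
#include <cerrno>
#include <cstdint>
#include <iostream>

using snapid_t = uint64_t;  // simplified stand-in

// hypothetical stand-in: returns the snap of an unreadable adjacent
// clone, or snapid_t() when both neighbours are readable
snapid_t probe_adjacent_clones(bool neighbour_unreadable) {
  return neighbour_unreadable ? snapid_t{4} : snapid_t{};
}

// if a neighbour is unreadable, record the block and abort with -EAGAIN;
// the blocked op is requeued and retried once recovery finishes
int block_write_if_unreadable(bool neighbour_unreadable) {
  snapid_t sid = probe_adjacent_clones(neighbour_unreadable);
  if (sid != snapid_t()) {
    // real code: block_write_on_unreadable_snap(oid-with-snap-sid, op)
    return -EAGAIN;
  }
  return 0;
}

int main() {
  std::cout << block_write_if_unreadable(true) << "\n";   // -11 on Linux (-EAGAIN)
  std::cout << block_write_if_unreadable(false) << "\n";  // 0, rollback proceeds
}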
@@ -12432,6 +12497,16 @@ void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
     objects_blocked_on_degraded_snap.erase(i);
 }
 
+void PrimaryLogPG::finish_unreadable_object(const hobject_t oid)
+{
+  dout(10) << __func__ << " " << oid << dendl;
+  map<hobject_t, snapid_t>::iterator i = objects_blocked_on_unreadable_snap.find(
+    oid.get_head());
+  if (i != objects_blocked_on_unreadable_snap.end() &&
+      i->second == oid.snap)
+    objects_blocked_on_unreadable_snap.erase(i);
+}
+
 void PrimaryLogPG::_committed_pushed_object(
   epoch_t epoch, eversion_t last_complete)
 {
@@ -13174,6 +13249,7 @@ void PrimaryLogPG::cancel_pull(const hobject_t &soid)
   if (is_missing_object(soid))
     recovery_state.set_last_requested(0);
   finish_degraded_object(soid);
+  finish_unreadable_object(soid);
 }
 
 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)