Skip to content

Commit 8b6a9ed

Browse files
authored
Merge pull request ceph#53222 from myoungwon/wip-62167
osd: check if adjacent clones are unreadable when rollback is called. Reviewed-by: athanatos
2 parents 313b886 + aeeee3f commit 8b6a9ed

File tree

3 files changed

+103
-23
lines changed

3 files changed

+103
-23
lines changed

src/osd/PG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,7 @@ class PG : public DoutPrefixProvider,
11181118

11191119
std::set<hobject_t> objects_blocked_on_cache_full;
11201120
std::map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
1121+
std::map<hobject_t,snapid_t> objects_blocked_on_unreadable_snap;
11211122
std::map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
11221123

11231124
// Callbacks should assume pg (and nothing else) is locked

src/osd/PrimaryLogPG.cc

Lines changed: 99 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,7 @@ void PrimaryLogPG::on_local_recover(
467467
dout(20) << " kicking unreadable waiters on " << hoid << dendl;
468468
requeue_ops(unreadable_object_entry->second);
469469
waiting_for_unreadable_object.erase(unreadable_object_entry);
470+
finish_unreadable_object(unreadable_object_entry->first);
470471
}
471472
}
472473
} else {
@@ -520,6 +521,7 @@ void PrimaryLogPG::on_global_recover(
520521
waiting_for_unreadable_object.erase(unreadable_object_entry);
521522
}
522523
finish_degraded_object(soid);
524+
finish_unreadable_object(soid);
523525
}
524526

525527
void PrimaryLogPG::schedule_recovery_work(
@@ -747,6 +749,18 @@ void PrimaryLogPG::block_write_on_degraded_snap(
747749
wait_for_degraded_object(snap, op);
748750
}
749751

752+
void PrimaryLogPG::block_write_on_unreadable_snap(
753+
const hobject_t& snap, OpRequestRef op)
754+
{
755+
dout(20) << __func__ << ": blocking object " << snap.get_head()
756+
<< " on unreadable snap " << snap << dendl;
757+
// otherwise, we'd have blocked in do_op
758+
ceph_assert(objects_blocked_on_unreadable_snap.count(snap.get_head()) == 0);
759+
objects_blocked_on_unreadable_snap[snap.get_head()] = snap.snap;
760+
// the op must be queued before calling block_write_on_unreadable_snap
761+
ceph_assert(waiting_for_unreadable_object.count(snap) == 1);
762+
}
763+
750764
bool PrimaryLogPG::maybe_await_blocked_head(
751765
const hobject_t &hoid,
752766
OpRequestRef op)
@@ -2196,6 +2210,14 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
21962210
return;
21972211
}
21982212

2213+
if (auto blocked_iter = objects_blocked_on_unreadable_snap.find(head);
2214+
blocked_iter != std::end(objects_blocked_on_unreadable_snap)) {
2215+
hobject_t to_wait_on(head);
2216+
to_wait_on.snap = blocked_iter->second;
2217+
wait_for_unreadable_object(to_wait_on, op);
2218+
return;
2219+
}
2220+
21992221
// blocked on snap?
22002222
if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
22012223
blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
@@ -3468,47 +3490,56 @@ int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_o
34683490
return cnt;
34693491
}
34703492

3471-
bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
3493+
snapid_t PrimaryLogPG::do_recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
34723494
{
3473-
if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
3474-
return false;
3475-
}
3476-
MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3477-
bool has_manifest_op = std::any_of(
3478-
begin(m->ops),
3479-
end(m->ops),
3480-
[](const auto& osd_op) {
3481-
return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
3482-
});
3483-
if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
3484-
return false;
3485-
}
34863495
ceph_assert(op);
3487-
34883496
const SnapSet& snapset = obc->ssc->snapset;
34893497
auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
3490-
auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
3498+
auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> snapid_t {
34913499
hobject_t cid = obc->obs.oi.soid;
34923500
cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
34933501
if (is_unreadable_object(cid)) {
34943502
dout(10) << __func__ << ": clone " << cid
34953503
<< " is unreadable, waiting" << dendl;
34963504
wait_for_unreadable_object(cid, op);
3497-
return true;
3505+
return cid.snap;
34983506
}
3499-
return false;
3507+
return snapid_t();
35003508
};
35013509
if (s != snapset.clones.begin()) {
3502-
if (is_unreadable_snap(s - 1)) {
3503-
return true;
3510+
snapid_t snap = is_unreadable_snap(s - 1);
3511+
if (snap != snapid_t()) {
3512+
return snap;
35043513
}
35053514
}
35063515
if (s != snapset.clones.end()) {
3507-
if (is_unreadable_snap(s + 1)) {
3508-
return true;
3516+
snapid_t snap = is_unreadable_snap(s + 1);
3517+
if (snap != snapid_t()) {
3518+
return snap;
35093519
}
35103520
}
3511-
return false;
3521+
return snapid_t();
3522+
}
3523+
3524+
bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
3525+
{
3526+
if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
3527+
return false;
3528+
}
3529+
MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3530+
bool has_manifest_op = false;
3531+
for (auto& osd_op : m->ops) {
3532+
if (osd_op.op.op == CEPH_OSD_OP_ROLLBACK) {
3533+
return false;
3534+
} else if (osd_op.op.op == CEPH_OSD_OP_SET_CHUNK) {
3535+
has_manifest_op = true;
3536+
break;
3537+
}
3538+
}
3539+
if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
3540+
return false;
3541+
}
3542+
return do_recover_adjacent_clones(obc, op) != snapid_t();
35123543
}
35133544

35143545
ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
@@ -8291,6 +8322,40 @@ int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op)
82918322
block_write_on_degraded_snap(missing_oid, ctx->op);
82928323
return ret;
82938324
}
8325+
/*
8326+
* In rollback, if the head object is not manifest and the rollback_to is manifest,
8327+
* the head object will become the manifest object. At this point,
8328+
* we need to check adjacent clones beside the head object to calculate
8329+
* correct reference count for deduped chunks because the head object is now
8330+
* manifest. The reverse is also true---the head object is manifest, but the rollback_to
8331+
* is not manifest.
8332+
* Therefore, the following lines insert the op into the waiting queue to wait until
8333+
* unreadable object is recovered if either adjacent clone is
8334+
* unreadable to calculate chunk references.
8335+
*/
8336+
auto block_write_if_unreadable = [this](ObjectContextRef obc, OpRequestRef op) {
8337+
snapid_t sid = do_recover_adjacent_clones(obc, op);
8338+
if (sid != snapid_t()) {
8339+
hobject_t oid = obc->obs.oi.soid;
8340+
oid.snap = sid;
8341+
block_write_on_unreadable_snap(oid, op);
8342+
return -EAGAIN;
8343+
}
8344+
return 0;
8345+
};
8346+
if (oi.has_manifest() && oi.manifest.is_chunked()) {
8347+
int r = block_write_if_unreadable(ctx->obc, ctx->op);
8348+
if (r < 0) {
8349+
return r;
8350+
}
8351+
}
8352+
if (rollback_to && rollback_to->obs.oi.has_manifest() &&
8353+
rollback_to->obs.oi.manifest.is_chunked()) {
8354+
int r = block_write_if_unreadable(rollback_to, ctx->op);
8355+
if (r < 0) {
8356+
return r;
8357+
}
8358+
}
82948359
{
82958360
ObjectContextRef promote_obc;
82968361
cache_result_t tier_mode_result;
@@ -12432,6 +12497,16 @@ void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
1243212497
objects_blocked_on_degraded_snap.erase(i);
1243312498
}
1243412499

12500+
void PrimaryLogPG::finish_unreadable_object(const hobject_t oid)
12501+
{
12502+
dout(10) << __func__ << " " << oid << dendl;
12503+
map<hobject_t, snapid_t>::iterator i = objects_blocked_on_unreadable_snap.find(
12504+
oid.get_head());
12505+
if (i != objects_blocked_on_unreadable_snap.end() &&
12506+
i->second == oid.snap)
12507+
objects_blocked_on_unreadable_snap.erase(i);
12508+
}
12509+
1243512510
void PrimaryLogPG::_committed_pushed_object(
1243612511
epoch_t epoch, eversion_t last_complete)
1243712512
{
@@ -13174,6 +13249,7 @@ void PrimaryLogPG::cancel_pull(const hobject_t &soid)
1317413249
if (is_missing_object(soid))
1317513250
recovery_state.set_last_requested(0);
1317613251
finish_degraded_object(soid);
13252+
finish_unreadable_object(soid);
1317713253
}
1317813254

1317913255
void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)

src/osd/PrimaryLogPG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1469,6 +1469,7 @@ class PrimaryLogPG : public PG,
14691469
void dec_refcount_by_dirty(OpContext* ctx);
14701470
ObjectContextRef get_prev_clone_obc(ObjectContextRef obc);
14711471
bool recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op);
1472+
snapid_t do_recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op);
14721473
void get_adjacent_clones(ObjectContextRef src_obc,
14731474
ObjectContextRef& _l, ObjectContextRef& _g);
14741475
bool inc_refcount_by_set(OpContext* ctx, object_manifest_t& tgt,
@@ -1851,6 +1852,7 @@ class PrimaryLogPG : public PG,
18511852
}
18521853
void maybe_kick_recovery(const hobject_t &soid);
18531854
void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
1855+
void finish_unreadable_object(const hobject_t oid);
18541856

18551857
int get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op);
18561858

@@ -1881,6 +1883,7 @@ class PrimaryLogPG : public PG,
18811883
void block_write_on_snap_rollback(
18821884
const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
18831885
void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
1886+
void block_write_on_unreadable_snap(const hobject_t& snap, OpRequestRef op);
18841887

18851888
bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op);
18861889
void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);

0 commit comments

Comments
 (0)