Commit aeeee3f
osd: check if adjacent clones are unreadable when rollback is called
In rollback, if the head object is not a manifest object but the rollback_to is, the head object becomes the manifest object. At this point, we need to check the clones adjacent to the head object in order to calculate the correct reference count for deduped chunks, because the head object is now a manifest object. In addition, while we wait for recovery, subsequent requests must be blocked to preserve transaction order. To fix the issue, this commit waits until the adjacent clones are readable and delays future incoming ops.

fixes: https://tracker.ceph.com/issues/62167

Signed-off-by: Myoungwon Oh <[email protected]>
1 parent fb6cce2 commit aeeee3f
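The core of the change, condensed from the _rollback_to() hunk below (a sketch for orientation only; it relies on the surrounding OpContext and PrimaryLogPG members, so it is not a standalone excerpt):

    // If an adjacent clone of a chunked manifest object is still unreadable,
    // queue the op behind recovery and block later writes on the same head.
    auto block_write_if_unreadable = [this](ObjectContextRef obc, OpRequestRef op) {
      snapid_t sid = do_recover_adjacent_clones(obc, op); // queues op if a clone is unreadable
      if (sid != snapid_t()) {
        hobject_t oid = obc->obs.oi.soid;
        oid.snap = sid;
        block_write_on_unreadable_snap(oid, op); // record head -> snap for do_op()
        return -EAGAIN;
      }
      return 0;
    };
    // ...invoked for both the head (ctx->obc) and the rollback target (rollback_to).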

File tree

3 files changed: +103 −23 lines

src/osd/PG.h

Lines changed: 1 addition & 0 deletions

@@ -1115,6 +1115,7 @@ class PG : public DoutPrefixProvider,
 
   std::set<hobject_t> objects_blocked_on_cache_full;
   std::map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
+  std::map<hobject_t,snapid_t> objects_blocked_on_unreadable_snap;
   std::map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
 
   // Callbacks should assume pg (and nothing else) is locked
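The new map records, per head object, the snap id of the unreadable clone its writes are waiting on. A standalone toy model of that contract (the types are hypothetical stand-ins; the real keys are hobject_t and snapid_t, and the logic mirrors block_write_on_unreadable_snap(), do_op(), and finish_unreadable_object() in the PrimaryLogPG.cc diff below):

    #include <cassert>
    #include <map>
    #include <string>

    using head_t = std::string;  // stand-in for an hobject_t head
    using snap_t = unsigned;     // stand-in for snapid_t

    std::map<head_t, snap_t> blocked;  // head -> snap of the unreadable clone

    void block_write(const head_t& head, snap_t snap) {
      assert(blocked.count(head) == 0);  // otherwise do_op would have blocked
      blocked[head] = snap;
    }

    void finish_unreadable(const head_t& head, snap_t recovered) {
      auto i = blocked.find(head);
      if (i != blocked.end() && i->second == recovered)
        blocked.erase(i);  // only recovery of the matching snap unblocks the head
    }

    int main() {
      block_write("obj1", 5);
      finish_unreadable("obj1", 4);  // wrong snap: head stays blocked
      assert(blocked.count("obj1") == 1);
      finish_unreadable("obj1", 5);  // matching snap: head unblocked
      assert(blocked.count("obj1") == 0);
    }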

src/osd/PrimaryLogPG.cc

Lines changed: 99 additions & 23 deletions
@@ -467,6 +467,7 @@ void PrimaryLogPG::on_local_recover(
         dout(20) << " kicking unreadable waiters on " << hoid << dendl;
         requeue_ops(unreadable_object_entry->second);
         waiting_for_unreadable_object.erase(unreadable_object_entry);
+        finish_unreadable_object(hoid);
       }
     }
   } else {
@@ -520,6 +521,7 @@ void PrimaryLogPG::on_global_recover(
     waiting_for_unreadable_object.erase(unreadable_object_entry);
   }
   finish_degraded_object(soid);
+  finish_unreadable_object(soid);
 }
 
 void PrimaryLogPG::schedule_recovery_work(
@@ -747,6 +749,18 @@ void PrimaryLogPG::block_write_on_degraded_snap(
   wait_for_degraded_object(snap, op);
 }
 
+void PrimaryLogPG::block_write_on_unreadable_snap(
+  const hobject_t& snap, OpRequestRef op)
+{
+  dout(20) << __func__ << ": blocking object " << snap.get_head()
+           << " on unreadable snap " << snap << dendl;
+  // otherwise, we'd have blocked in do_op
+  ceph_assert(objects_blocked_on_unreadable_snap.count(snap.get_head()) == 0);
+  objects_blocked_on_unreadable_snap[snap.get_head()] = snap.snap;
+  // the op must be queued before calling block_write_on_unreadable_snap
+  ceph_assert(waiting_for_unreadable_object.count(snap) == 1);
+}
+
 bool PrimaryLogPG::maybe_await_blocked_head(
   const hobject_t &hoid,
   OpRequestRef op)
@@ -2196,6 +2210,14 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
     return;
   }
 
+  if (auto blocked_iter = objects_blocked_on_unreadable_snap.find(head);
+      blocked_iter != std::end(objects_blocked_on_unreadable_snap)) {
+    hobject_t to_wait_on(head);
+    to_wait_on.snap = blocked_iter->second;
+    wait_for_unreadable_object(to_wait_on, op);
+    return;
+  }
+
   // blocked on snap?
   if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
       blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
@@ -3468,47 +3490,56 @@ int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op)
   return cnt;
 }
 
-bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
+snapid_t PrimaryLogPG::do_recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
 {
-  if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
-    return false;
-  }
-  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
-  bool has_manifest_op = std::any_of(
-    begin(m->ops),
-    end(m->ops),
-    [](const auto& osd_op) {
-      return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
-    });
-  if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
-    return false;
-  }
   ceph_assert(op);
-
   const SnapSet& snapset = obc->ssc->snapset;
   auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
-  auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
+  auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> snapid_t {
     hobject_t cid = obc->obs.oi.soid;
    cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
     if (is_unreadable_object(cid)) {
       dout(10) << __func__ << ": clone " << cid
                << " is unreadable, waiting" << dendl;
       wait_for_unreadable_object(cid, op);
-      return true;
+      return cid.snap;
     }
-    return false;
+    return snapid_t();
   };
   if (s != snapset.clones.begin()) {
-    if (is_unreadable_snap(s - 1)) {
-      return true;
+    snapid_t snap = is_unreadable_snap(s - 1);
+    if (snap != snapid_t()) {
+      return snap;
     }
   }
   if (s != snapset.clones.end()) {
-    if (is_unreadable_snap(s + 1)) {
-      return true;
+    snapid_t snap = is_unreadable_snap(s + 1);
+    if (snap != snapid_t()) {
+      return snap;
     }
   }
-  return false;
+  return snapid_t();
+}
+
+bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
+{
+  if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
+    return false;
+  }
+  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+  bool has_manifest_op = false;
+  for (auto& osd_op : m->ops) {
+    if (osd_op.op.op == CEPH_OSD_OP_ROLLBACK) {
+      return false;
+    } else if (osd_op.op.op == CEPH_OSD_OP_SET_CHUNK) {
+      has_manifest_op = true;
+      break;
+    }
+  }
+  if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
+    return false;
+  }
+  return do_recover_adjacent_clones(obc, op) != snapid_t();
 }
 
 ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
@@ -8289,6 +8320,40 @@ int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op)
     block_write_on_degraded_snap(missing_oid, ctx->op);
     return ret;
   }
+  /*
+   * In rollback, if the head object is not manifest and the rollback_to is manifest,
+   * the head object will become the manifest object. At this point, we need to
+   * check the clones adjacent to the head object to calculate the correct
+   * reference count for deduped chunks, because the head object is now manifest.
+   * The reverse is also true: the head object is manifest, but the rollback_to
+   * is not.
+   * Therefore, if either adjacent clone is unreadable, the following lines insert
+   * the op into the waiting queue, so that it waits until the unreadable object
+   * is recovered before the chunk references are calculated.
+   */
+  auto block_write_if_unreadable = [this](ObjectContextRef obc, OpRequestRef op) {
+    snapid_t sid = do_recover_adjacent_clones(obc, op);
+    if (sid != snapid_t()) {
+      hobject_t oid = obc->obs.oi.soid;
+      oid.snap = sid;
+      block_write_on_unreadable_snap(oid, op);
+      return -EAGAIN;
+    }
+    return 0;
+  };
+  if (oi.has_manifest() && oi.manifest.is_chunked()) {
+    int r = block_write_if_unreadable(ctx->obc, ctx->op);
+    if (r < 0) {
+      return r;
+    }
+  }
+  if (rollback_to && rollback_to->obs.oi.has_manifest() &&
+      rollback_to->obs.oi.manifest.is_chunked()) {
+    int r = block_write_if_unreadable(rollback_to, ctx->op);
+    if (r < 0) {
+      return r;
+    }
+  }
   {
     ObjectContextRef promote_obc;
     cache_result_t tier_mode_result;
@@ -12431,6 +12496,16 @@ void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
   objects_blocked_on_degraded_snap.erase(i);
 }
 
+void PrimaryLogPG::finish_unreadable_object(const hobject_t oid)
+{
+  dout(10) << __func__ << " " << oid << dendl;
+  map<hobject_t, snapid_t>::iterator i = objects_blocked_on_unreadable_snap.find(
+    oid.get_head());
+  if (i != objects_blocked_on_unreadable_snap.end() &&
+      i->second == oid.snap)
+    objects_blocked_on_unreadable_snap.erase(i);
+}
+
 void PrimaryLogPG::_committed_pushed_object(
   epoch_t epoch, eversion_t last_complete)
 {
@@ -13172,6 +13247,7 @@ void PrimaryLogPG::cancel_pull(const hobject_t &soid)
   if (is_missing_object(soid))
     recovery_state.set_last_requested(0);
   finish_degraded_object(soid);
+  finish_unreadable_object(soid);
 }
 
 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
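Taken together, the PrimaryLogPG.cc changes give a blocked rollback roughly this lifecycle (a hypothetical trace for illustration, not actual log output):

    1. _rollback_to() calls do_recover_adjacent_clones(); an adjacent clone is
       unreadable, so the op is queued in waiting_for_unreadable_object, the head
       is recorded in objects_blocked_on_unreadable_snap, and -EAGAIN is returned.
    2. Later ops on the same head hit the new check in do_op() and are queued
       behind the same unreadable clone, preserving transaction order.
    3. When the clone recovers (on_local_recover()/on_global_recover()) or the
       pull is cancelled (cancel_pull()), the waiters are requeued and
       finish_unreadable_object() clears the per-head block.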

src/osd/PrimaryLogPG.h

Lines changed: 3 additions & 0 deletions

@@ -1451,6 +1451,7 @@ class PrimaryLogPG : public PG, public PGBackend::Listener {
   void dec_refcount_by_dirty(OpContext* ctx);
   ObjectContextRef get_prev_clone_obc(ObjectContextRef obc);
   bool recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op);
+  snapid_t do_recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op);
   void get_adjacent_clones(ObjectContextRef src_obc,
                            ObjectContextRef& _l, ObjectContextRef& _g);
   bool inc_refcount_by_set(OpContext* ctx, object_manifest_t& tgt,
@@ -1833,6 +1834,7 @@ class PrimaryLogPG : public PG, public PGBackend::Listener {
   }
   void maybe_kick_recovery(const hobject_t &soid);
   void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
+  void finish_unreadable_object(const hobject_t oid);
 
   int get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op);
 
@@ -1863,6 +1865,7 @@ class PrimaryLogPG : public PG, public PGBackend::Listener {
   void block_write_on_snap_rollback(
     const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
   void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
+  void block_write_on_unreadable_snap(const hobject_t& snap, OpRequestRef op);
 
   bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op);
   void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
