Skip to content

Commit 80be0ae

Browse files
xxhdx1985126Matan-B
authored andcommitted
crimson/osd/osd_operations/client_request: hang client requests when the
object is missing in the whole cluster Fixes: https://tracker.ceph.com/issues/65696 Signed-off-by: Xuehan Xu <[email protected]>
1 parent a4c3e58 commit 80be0ae

File tree

6 files changed

+54
-8
lines changed

6 files changed

+54
-8
lines changed

src/crimson/osd/osd_operations/client_request.cc

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ ClientRequest::recover_missing_snaps(
286286
ObjectContextRef head,
287287
std::set<snapid_t> &snaps)
288288
{
289+
LOG_PREFIX(ClientRequest::process_op);
289290
co_await ihref.enter_stage<interruptor>(
290291
client_pp(*pg).recover_missing_snaps, *this);
291292
for (auto &snap : snaps) {
@@ -299,7 +300,12 @@ ClientRequest::recover_missing_snaps(
299300
* we skip the oid as there is no corresponding clone to recover.
300301
* See https://tracker.ceph.com/issues/63821 */
301302
if (oid) {
302-
co_await do_recover_missing(pg, *oid, m->get_reqid());
303+
auto unfound = co_await do_recover_missing(pg, *oid, m->get_reqid());
304+
if (unfound) {
305+
DEBUGDPP("{} unfound, hang it for now", *pg, m->get_hobj().get_head());
306+
co_await interruptor::make_interruptible(
307+
pg->get_recovery_backend()->add_unfound(m->get_hobj().get_head()));
308+
}
303309
}
304310
}
305311
}
@@ -317,7 +323,14 @@ ClientRequest::process_op(
317323
"Skipping recover_missings on non primary pg for soid {}",
318324
*pg, m->get_hobj());
319325
} else {
320-
co_await do_recover_missing(pg, m->get_hobj().get_head(), m->get_reqid());
326+
auto unfound = co_await do_recover_missing(
327+
pg, m->get_hobj().get_head(), m->get_reqid());
328+
if (unfound) {
329+
DEBUGDPP("{} unfound, hang it for now", *pg, m->get_hobj().get_head());
330+
co_await interruptor::make_interruptible(
331+
pg->get_recovery_backend()->add_unfound(m->get_hobj().get_head()));
332+
}
333+
321334
std::set<snapid_t> snaps = snaps_need_to_recover();
322335
if (!snaps.empty()) {
323336
// call with_obc() in order, but wait concurrently for loading.

src/crimson/osd/osd_operations/client_request_common.cc

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ namespace {
1313

1414
namespace crimson::osd {
1515

16-
typename InterruptibleOperation::template interruptible_future<>
16+
typename InterruptibleOperation::template interruptible_future<bool>
1717
CommonClientRequest::do_recover_missing(
1818
Ref<PG> pg,
1919
const hobject_t& soid,
@@ -45,22 +45,29 @@ CommonClientRequest::do_recover_missing(
4545
if (!needs_recovery_or_backfill) {
4646
logger().debug("{} reqid {} nothing to recover {}",
4747
__func__, reqid, soid);
48-
return seastar::now();
48+
return seastar::make_ready_future<bool>(false);
4949
}
5050

51+
if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) {
52+
return seastar::make_ready_future<bool>(true);
53+
}
5154
logger().debug("{} reqid {} need to wait for recovery, {} version {}",
5255
__func__, reqid, soid, ver);
5356
if (pg->get_recovery_backend()->is_recovering(soid)) {
5457
logger().debug("{} reqid {} object {} version {}, already recovering",
5558
__func__, reqid, soid, ver);
56-
return pg->get_recovery_backend()->get_recovering(soid).wait_for_recovered();
59+
return pg->get_recovery_backend()->get_recovering(
60+
soid).wait_for_recovered(
61+
).then([] {
62+
return seastar::make_ready_future<bool>(false);
63+
});
5764
} else {
5865
logger().debug("{} reqid {} object {} version {}, starting recovery",
5966
__func__, reqid, soid, ver);
6067
auto [op, fut] =
6168
pg->get_shard_services().start_operation<UrgentRecovery>(
6269
soid, ver, pg, pg->get_shard_services(), pg->get_osdmap_epoch());
63-
return std::move(fut);
70+
return fut.then([] { return seastar::make_ready_future<bool>(false); });
6471
}
6572
}
6673

src/crimson/osd/osd_operations/client_request_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ namespace crimson::osd {
1111

1212
struct CommonClientRequest {
1313

14-
static InterruptibleOperation::template interruptible_future<>
14+
static InterruptibleOperation::template interruptible_future<bool>
1515
do_recover_missing(
1616
Ref<PG> pg,
1717
const hobject_t& soid,

src/crimson/osd/osd_operations/internal_client_request.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,12 @@ seastar::future<> InternalClientRequest::start()
7070
client_pp().recover_missing);
7171
}).then_interruptible([this] {
7272
return do_recover_missing(pg, get_target_oid(), osd_reqid_t());
73-
}).then_interruptible([this] {
73+
}).then_interruptible([this](bool unfound) {
74+
if (unfound) {
75+
throw std::system_error(
76+
std::make_error_code(std::errc::operation_canceled),
77+
fmt::format("{} is unfound, drop it!", get_target_oid()));
78+
}
7479
return enter_stage<interruptor>(
7580
client_pp().get_obc);
7681
}).then_interruptible([this] () -> PG::load_obc_iertr::future<> {
@@ -128,6 +133,9 @@ seastar::future<> InternalClientRequest::start()
128133
}, pg, start_epoch);
129134
}).then([this] {
130135
track_event<CompletionEvent>();
136+
}).handle_exception_type([](std::system_error &error) {
137+
logger().debug("error {}, message: {}", error.code(), error.what());
138+
return seastar::now();
131139
}).finally([this] {
132140
logger().debug("{}: exit", *this);
133141
handle.exit();

src/crimson/osd/pg_recovery.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ void PGRecovery::on_global_recover (
431431
auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
432432
recovery_waiter.set_recovered();
433433
pg->get_recovery_backend()->remove_recovering(soid);
434+
pg->get_recovery_backend()->found_and_remove(soid);
434435
}
435436

436437
void PGRecovery::on_failed_recover(

src/crimson/osd/recovery_backend.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@ class RecoveryBackend {
5050
assert(it->second);
5151
return {*(it->second), added};
5252
}
53+
seastar::future<> add_unfound(const hobject_t &soid) {
54+
auto [it, added] = unfound.emplace(soid, seastar::shared_promise());
55+
return it->second.get_shared_future();
56+
}
57+
void found_and_remove(const hobject_t &soid) {
58+
auto it = unfound.find(soid);
59+
if (it != unfound.end()) {
60+
auto &found_promise = it->second;
61+
found_promise.set_value();
62+
unfound.erase(it);
63+
}
64+
}
5365
WaitForObjectRecovery& get_recovering(const hobject_t& soid) {
5466
assert(is_recovering(soid));
5567
return *(recovering.at(soid));
@@ -91,6 +103,10 @@ class RecoveryBackend {
91103
for (auto& [soid, recovery_waiter] : recovering) {
92104
recovery_waiter->stop();
93105
}
106+
for (auto& [soid, promise] : unfound) {
107+
promise.set_exception(
108+
crimson::common::system_shutdown_exception());
109+
}
94110
return on_stop();
95111
}
96112
protected:
@@ -236,6 +252,7 @@ class RecoveryBackend {
236252
using WaitForObjectRecoveryRef = boost::intrusive_ptr<WaitForObjectRecovery>;
237253
protected:
238254
std::map<hobject_t, WaitForObjectRecoveryRef> recovering;
255+
std::map<hobject_t, seastar::shared_promise<>> unfound;
239256
hobject_t get_temp_recovery_object(
240257
const hobject_t& target,
241258
eversion_t version) const;

0 commit comments

Comments
 (0)