Skip to content

Commit 11ce348

Browse files
authored
Merge pull request ceph#62837 from athanatos/sjust/wip-crimson-stuck-backfilling
crimson: fix several bugs causing stuck backfills Reviewed-by: Matan Breizman <[email protected]>
2 parents cd82bfc + cd9dcf6 commit 11ce348

File tree

7 files changed

+61
-63
lines changed

7 files changed

+61
-63
lines changed

src/crimson/osd/osd_operation.h

Lines changed: 21 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -300,34 +300,6 @@ class OperationThrottler : public BlockerT<OperationThrottler>,
300300
friend BlockerT<OperationThrottler>;
301301
static constexpr const char* type_name = "OperationThrottler";
302302

303-
template <typename OperationT, typename F>
304-
auto with_throttle(
305-
OperationT* op,
306-
crimson::osd::scheduler::params_t params,
307-
F &&f) {
308-
if (!max_in_progress) return f();
309-
return acquire_throttle(params)
310-
.then(std::forward<F>(f))
311-
.then([this](auto x) {
312-
release_throttle();
313-
return x;
314-
});
315-
}
316-
317-
template <typename OperationT, typename F>
318-
seastar::future<> with_throttle_while(
319-
OperationT* op,
320-
crimson::osd::scheduler::params_t params,
321-
F &&f) {
322-
return with_throttle(op, params, f).then([this, params, op, f](bool cont) {
323-
return cont
324-
? seastar::yield().then([params, op, f, this] {
325-
return with_throttle_while(op, params, f); })
326-
: seastar::now();
327-
});
328-
}
329-
330-
331303
public:
332304
OperationThrottler(ConfigProxy &conf);
333305

@@ -340,24 +312,28 @@ class OperationThrottler : public BlockerT<OperationThrottler>,
340312
return !max_in_progress || in_progress < max_in_progress;
341313
}
342314

343-
template <typename F>
344-
auto with_throttle(
345-
crimson::osd::scheduler::params_t params,
346-
F &&f) {
347-
if (!max_in_progress) return f();
348-
return acquire_throttle(params)
349-
.then(std::forward<F>(f))
350-
.finally([this] {
351-
release_throttle();
352-
});
353-
}
315+
class ThrottleReleaser {
316+
OperationThrottler *parent = nullptr;
317+
public:
318+
ThrottleReleaser(OperationThrottler *parent) : parent(parent) {}
319+
ThrottleReleaser(const ThrottleReleaser &) = delete;
320+
ThrottleReleaser(ThrottleReleaser &&rhs) noexcept {
321+
std::swap(parent, rhs.parent);
322+
}
354323

355-
template <class OpT, class... Args>
356-
seastar::future<> with_throttle_while(
357-
BlockingEvent::Trigger<OpT>&& trigger,
358-
Args&&... args) {
359-
return trigger.maybe_record_blocking(
360-
with_throttle_while(std::forward<Args>(args)...), *this);
324+
~ThrottleReleaser() {
325+
if (parent) {
326+
parent->release_throttle();
327+
}
328+
}
329+
};
330+
331+
auto get_throttle(crimson::osd::scheduler::params_t params) {
332+
return acquire_throttle(
333+
params
334+
).then([this] {
335+
return ThrottleReleaser{this};
336+
});
361337
}
362338

363339
private:

src/crimson/osd/pg.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,9 @@ class PG : public boost::intrusive_ref_counter<
734734
ShardServices& get_shard_services() final {
735735
return shard_services;
736736
}
737+
DoutPrefixProvider& get_dpp() final {
738+
return *this;
739+
}
737740
seastar::future<> stop();
738741
private:
739742
class C_PG_FinishRecovery : public Context {

src/crimson/osd/pg_recovery.cc

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <fmt/ostream.h>
66
#include <fmt/ranges.h>
77

8+
#include "crimson/common/coroutine.h"
9+
#include "crimson/common/log.h"
810
#include "crimson/common/type_helpers.h"
911
#include "crimson/osd/backfill_facades.h"
1012
#include "crimson/osd/osd_operations/background_recovery.h"
@@ -16,6 +18,8 @@
1618
#include "osd/osd_types.h"
1719
#include "osd/PeeringState.h"
1820

21+
SET_SUBSYS(osd);
22+
1923
namespace {
2024
seastar::logger& logger() {
2125
return crimson::get_logger(ceph_subsys_osd);
@@ -522,21 +526,19 @@ void PGRecovery::request_primary_scan(
522526

523527
PGRecovery::interruptible_future<>
524528
PGRecovery::recover_object_with_throttle(
525-
const hobject_t &soid,
529+
hobject_t soid,
526530
eversion_t need)
527531
{
528-
crimson::osd::scheduler::params_t params =
529-
{1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort};
530-
auto &ss = pg->get_shard_services();
531-
logger().debug("{} {}", soid, need);
532-
return ss.with_throttle(
533-
std::move(params),
534-
[this, soid, need] {
535-
logger().debug("got throttle: {} {}", soid, need);
536-
auto backend = pg->get_recovery_backend();
537-
assert(backend);
538-
return backend->recover_object(soid, need);
539-
});
532+
LOG_PREFIX(PGRecovery::recover_object_with_throttle);
533+
DEBUGDPP("{} {}", pg->get_dpp(), soid, need);
534+
auto releaser = co_await interruptor::make_interruptible(
535+
pg->get_shard_services().get_throttle(
536+
crimson::osd::scheduler::params_t{
537+
1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort
538+
}));
539+
DEBUGDPP("got throttle: {} {}", pg->get_dpp(), soid, need);
540+
co_await pg->get_recovery_backend()->recover_object(soid, need);
541+
co_return;
540542
}
541543

542544
void PGRecovery::enqueue_push(

src/crimson/osd/pg_recovery.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ class PGBackend;
2525

2626
class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
2727
public:
28+
using interruptor =
29+
::crimson::interruptible::interruptor<
30+
::crimson::osd::IOInterruptCondition>;
2831
template <typename T = void>
2932
using interruptible_future = RecoveryBackend::interruptible_future<T>;
3033
PGRecovery(PGRecoveryListener* pg) : pg(pg) {}
@@ -100,7 +103,7 @@ class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
100103
friend class crimson::osd::UrgentRecovery;
101104

102105
interruptible_future<> recover_object_with_throttle(
103-
const hobject_t &soid,
106+
hobject_t soid,
104107
eversion_t need);
105108

106109
interruptible_future<> recover_object(

src/crimson/osd/pg_recovery_listener.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class PGRecovery;
2020
class PGRecoveryListener {
2121
public:
2222
virtual crimson::osd::ShardServices& get_shard_services() = 0;
23+
virtual DoutPrefixProvider& get_dpp() = 0;
2324
virtual PGRecovery* get_recovery_handler() = 0;
2425
virtual epoch_t get_osdmap_epoch() const = 0;
2526
virtual bool is_primary() const = 0;

src/crimson/osd/recovery_backend.cc

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,12 +235,25 @@ RecoveryBackend::scan_for_backfill(
235235
co_await interruptor::parallel_for_each(objects, seastar::coroutine::lambda([FNAME, this, version_map]
236236
(const hobject_t& object) -> interruptible_future<> {
237237
DEBUGDPP("querying obj:{}", pg, object);
238-
auto obc_manager = pg.obc_loader.get_obc_manager(object);
239-
co_await pg.obc_loader.load_and_lock(
238+
auto obc_manager = pg.obc_loader.get_obc_manager(
239+
object, /* resolve_clone = */ false);
240+
241+
auto found = co_await pg.obc_loader.load_and_lock(
240242
obc_manager, RWState::RWREAD
241-
).handle_error_interruptible(
243+
).si_then([] {
244+
return true;
245+
}).handle_error_interruptible(
246+
crimson::ct_error::enoent::handle([](auto) {
247+
return false;
248+
}),
242249
crimson::ct_error::assert_all("unexpected error")
243250
);
251+
if (!found) {
252+
// if the object does not exist here, it must have been removed
253+
// between the collection_list_partial and here. This can happen
254+
// for the first item in the range, which is usually last_backfill.
255+
co_return;
256+
}
244257

245258
if (obc_manager.get_obc()->obs.exists) {
246259
auto version = obc_manager.get_obc()->obs.oi.version;

src/crimson/osd/shard_services.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,7 @@ class ShardServices : public OSDMapService {
593593
}
594594

595595
FORWARD_TO_OSD_SINGLETON(get_pool_info)
596-
FORWARD(with_throttle, with_throttle, local_state.throttler)
596+
FORWARD(get_throttle, get_throttle, local_state.throttler)
597597

598598
FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg)
599599
FORWARD_TO_OSD_SINGLETON(send_incremental_map)

0 commit comments

Comments
 (0)