Skip to content

Commit f689bb8

Browse files
authored
Merge pull request ceph#59368 from xxhdx1985126/wip-crimson-backfill-cancel-part2
crimson/osd: support backfill cancellation ---- part2 Reviewed-by: Matan Breizman <[email protected]> Reviewed-by: Radosław Zarzyński <[email protected]>
2 parents 73c1934 + d7e6896 commit f689bb8

File tree

6 files changed

+165
-27
lines changed

6 files changed

+165
-27
lines changed

src/crimson/osd/backfill_state.cc

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,21 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt)
101101
}
102102
}
103103

104+
boost::statechart::result
105+
BackfillState::Cancelled::react(const BackfillState::Triggered& evt)
106+
{
107+
logger().debug("{}: backfill re-triggered", __func__);
108+
ceph_assert(peering_state().is_backfilling());
109+
if (Enqueuing::all_enqueued(peering_state(),
110+
backfill_state().backfill_info,
111+
backfill_state().peer_backfill_info)) {
112+
logger().debug("{}: switching to Done state", __func__);
113+
return transit<BackfillState::Done>();
114+
} else {
115+
logger().debug("{}: switching to Enqueuing state", __func__);
116+
return transit<BackfillState::Enqueuing>();
117+
}
118+
}
104119

105120
// -- Enqueuing
106121
void BackfillState::Enqueuing::maybe_update_range()
@@ -451,6 +466,15 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
451466
return discard_event();
452467
}
453468

469+
boost::statechart::result
470+
BackfillState::ReplicasScanning::react(CancelBackfill evt)
471+
{
472+
logger().debug("{}: cancelled within ReplicasScanning",
473+
__func__);
474+
waiting_on_backfill.clear();
475+
return transit<Cancelled>();
476+
}
477+
454478
boost::statechart::result
455479
BackfillState::ReplicasScanning::react(ObjectPushed evt)
456480
{
@@ -499,11 +523,10 @@ BackfillState::Crashed::Crashed()
499523
}
500524

501525
// -- Cancelled
502-
BackfillState::Cancelled::Cancelled()
526+
BackfillState::Cancelled::Cancelled(my_context ctx)
527+
: my_base(ctx)
503528
{
504-
backfill_state().backfill_info.clear();
505-
backfill_state().peer_backfill_info.clear();
506-
backfill_state().progress_tracker.reset();
529+
ceph_assert(peering_state().get_backfill_targets().size());
507530
}
508531

509532
// ProgressTracker is an intermediary between the BackfillListener and

src/crimson/osd/backfill_state.h

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,27 @@ struct BackfillState {
135135
explicit Crashed();
136136
};
137137

138-
struct Cancelled : sc::simple_state<Cancelled, BackfillMachine>,
139-
StateHelper<Cancelled> {
140-
explicit Cancelled();
138+
struct Cancelled : sc::state<Cancelled, BackfillMachine>,
139+
StateHelper<Cancelled> {
140+
using reactions = boost::mpl::list<
141+
sc::custom_reaction<Triggered>,
142+
sc::custom_reaction<PrimaryScanned>,
143+
sc::custom_reaction<ReplicaScanned>,
144+
sc::custom_reaction<ObjectPushed>,
145+
sc::transition<sc::event_base, Crashed>>;
146+
explicit Cancelled(my_context);
147+
// resume after triggering backfill by on_activate_complete().
148+
// transit to Enqueuing.
149+
sc::result react(const Triggered&);
150+
sc::result react(const PrimaryScanned&) {
151+
return discard_event();
152+
}
153+
sc::result react(const ReplicaScanned&) {
154+
return discard_event();
155+
}
156+
sc::result react(const ObjectPushed&) {
157+
return discard_event();
158+
}
141159
};
142160

143161
struct Initial : sc::state<Initial, BackfillMachine>,
@@ -159,6 +177,8 @@ struct BackfillState {
159177
sc::transition<RequestPrimaryScanning, PrimaryScanning>,
160178
sc::transition<RequestReplicasScanning, ReplicasScanning>,
161179
sc::transition<RequestWaiting, Waiting>,
180+
sc::transition<RequestDone, Done>,
181+
sc::transition<CancelBackfill, Cancelled>,
162182
sc::transition<sc::event_base, Crashed>>;
163183
explicit Enqueuing(my_context);
164184

@@ -229,14 +249,15 @@ struct BackfillState {
229249
using reactions = boost::mpl::list<
230250
sc::custom_reaction<ObjectPushed>,
231251
sc::custom_reaction<ReplicaScanned>,
252+
sc::custom_reaction<CancelBackfill>,
232253
sc::transition<RequestDone, Done>,
233-
sc::transition<CancelBackfill, Cancelled>,
234254
sc::transition<sc::event_base, Crashed>>;
235255
explicit ReplicasScanning(my_context);
236256
// collect scanning result; if all results are collected, transition
237257
// to Enqueuing will happen.
238258
sc::result react(ObjectPushed);
239259
sc::result react(ReplicaScanned);
260+
sc::result react(CancelBackfill);
240261

241262
// indicate whether a particular peer should be scanned to retrieve
242263
// BackfillInterval for new range of hobject_t namespace.

src/crimson/osd/pg.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ void PG::on_activate_complete()
425425
PeeringState::AllReplicasRecovered{});
426426
}
427427
publish_stats_to_osd();
428+
recovery_handler->on_activate_complete();
428429
}
429430

430431
void PG::prepare_write(pg_info_t &info,

src/crimson/osd/pg_recovery.cc

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -623,42 +623,51 @@ void PGRecovery::backfill_cancelled()
623623
using BackfillState = crimson::osd::BackfillState;
624624
backfill_state->process_event(
625625
BackfillState::CancelBackfill{}.intrusive_from_this());
626-
backfill_state.reset();
627626
}
628627

629628
void PGRecovery::dispatch_backfill_event(
630629
boost::intrusive_ptr<const boost::statechart::event_base> evt)
631630
{
632631
logger().debug("{}", __func__);
633-
if (backfill_state) {
634-
backfill_state->process_event(evt);
635-
} else {
636-
// TODO: Do we need to worry about cases in which the pg has
637-
// been through both backfill cancellations and backfill
638-
// restarts between the sendings and replies of
639-
// ReplicaScan/ObjectPush requests? Seems classic OSDs
640-
// doesn't handle these cases.
641-
logger().debug("{}, backfill cancelled, dropping evt");
642-
}
632+
assert(backfill_state);
633+
backfill_state->process_event(evt);
634+
// TODO: Do we need to worry about cases in which the pg has
635+
// been through both backfill cancellations and backfill
636+
// restarts between the sendings and replies of
637+
// ReplicaScan/ObjectPush requests? Seems classic OSDs
638+
// doesn't handle these cases.
639+
}
640+
641+
void PGRecovery::on_activate_complete()
642+
{
643+
logger().debug("{} backfill_state={}",
644+
__func__, fmt::ptr(backfill_state.get()));
645+
backfill_state.reset();
643646
}
644647

645648
void PGRecovery::on_backfill_reserved()
646649
{
647650
logger().debug("{}", __func__);
648-
// PIMP and depedency injection for the sake unittestability.
649-
// I'm not afraid about the performance here.
650-
using BackfillState = crimson::osd::BackfillState;
651-
backfill_state = std::make_unique<BackfillState>(
652-
*this,
653-
std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
654-
std::make_unique<crimson::osd::PGFacade>(
655-
*static_cast<crimson::osd::PG*>(pg)));
656651
// yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
657652
// will be set after on_backfill_reserved() returns.
658653
// Backfill needs to take this into consideration when scheduling
659654
// events -- they must be mutually exclusive with PeeringEvent
660655
// instances. Otherwise the execution might begin without having
661656
// the state updated.
662657
ceph_assert(!pg->get_peering_state().is_backfilling());
658+
// let's be lazy with creating the backfill stuff
659+
using BackfillState = crimson::osd::BackfillState;
660+
if (!backfill_state) {
661+
// PIMP and depedency injection for the sake of unittestability.
662+
// I'm not afraid about the performance here.
663+
backfill_state = std::make_unique<BackfillState>(
664+
*this,
665+
std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
666+
std::make_unique<crimson::osd::PGFacade>(
667+
*static_cast<crimson::osd::PG*>(pg)));
668+
}
669+
// it may be we either start a completely new backfill (first
670+
// event since last on_activate_complete()) or to resume already
671+
// (but stopped one).
663672
start_backfill_recovery(BackfillState::Triggered{});
664673
}

src/crimson/osd/pg_recovery.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
3333
interruptible_future<bool> start_recovery_ops(
3434
RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
3535
size_t max_to_start);
36+
void on_activate_complete();
3637
void on_backfill_reserved();
3738
void dispatch_backfill_event(
3839
boost::intrusive_ptr<const boost::statechart::event_base> evt);

src/test/crimson/test_backfill.cc

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,18 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener {
154154
}
155155
}
156156

157+
template <class EventT>
158+
void next_round2() {
159+
ceph_assert(events_to_dispatch.size());
160+
// workaround for Clang's `-Wpotentially-evaluated-expression`. See:
161+
// * https://gitlab.cern.ch/gaudi/Gaudi/-/merge_requests/970
162+
// * https://stackoverflow.com/q/46494928
163+
const auto& front_event = *events_to_dispatch.front();
164+
ceph_assert(typeid(front_event) == typeid(EventT));
165+
backfill_state.process_event(std::move(events_to_dispatch.front()));
166+
events_to_dispatch.pop_front();
167+
}
168+
157169
void next_till_done() {
158170
while (!events_to_dispatch.empty()) {
159171
next_round();
@@ -171,6 +183,15 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener {
171183

172184
struct PeeringFacade;
173185
struct PGFacade;
186+
187+
void cancel() {
188+
events_to_dispatch.clear();
189+
schedule_event(crimson::osd::BackfillState::CancelBackfill{});
190+
}
191+
192+
void resume() {
193+
schedule_event(crimson::osd::BackfillState::Triggered{});
194+
}
174195
};
175196

176197
struct BackfillFixture::PeeringFacade
@@ -411,6 +432,68 @@ TEST(backfill, two_empty_replicas)
411432
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
412433
}
413434

435+
TEST(backfill, cancel_resume)
436+
{
437+
const auto reference_store = FakeStore{ {
438+
{ "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
439+
{ "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
440+
{ "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
441+
}};
442+
auto cluster_fixture = BackfillFixtureBuilder::add_source(
443+
reference_store.objs
444+
).add_target(
445+
{ /* nothing 1 */ }
446+
).add_target(
447+
{ /* nothing 2 */ }
448+
).get_result();
449+
450+
EXPECT_CALL(cluster_fixture, backfilled);
451+
cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
452+
cluster_fixture.cancel();
453+
cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
454+
cluster_fixture.resume();
455+
cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
456+
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
457+
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
458+
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
459+
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
460+
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
461+
cluster_fixture.next_till_done();
462+
463+
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
464+
}
465+
466+
TEST(backfill, cancel_resume_middle_of_scan)
467+
{
468+
const auto reference_store = FakeStore{ {
469+
{ "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
470+
{ "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
471+
{ "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
472+
}};
473+
auto cluster_fixture = BackfillFixtureBuilder::add_source(
474+
reference_store.objs
475+
).add_target(
476+
{ /* nothing 1 */ }
477+
).add_target(
478+
{ /* nothing 2 */ }
479+
).get_result();
480+
481+
EXPECT_CALL(cluster_fixture, backfilled);
482+
cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
483+
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
484+
cluster_fixture.cancel();
485+
cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
486+
cluster_fixture.resume();
487+
cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
488+
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
489+
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
490+
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
491+
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
492+
cluster_fixture.next_till_done();
493+
494+
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
495+
}
496+
414497
namespace StoreRandomizer {
415498
// FIXME: copied & pasted from test/test_snap_mapper.cc. We need to
416499
// find a way to avoid code duplication in test. A static library?

0 commit comments

Comments
 (0)