Skip to content

Commit a34b0ce

Browse files
committed
crimson/osd/backfill_state: treat Cancelled as a pause of the ongoing backfilling
Fixes: https://tracker.ceph.com/issues/67888
Signed-off-by: Xuehan Xu <[email protected]>
1 parent 892cf56 commit a34b0ce

File tree

2 files changed

+110
-7
lines changed

2 files changed

+110
-7
lines changed

src/crimson/osd/backfill_state.cc

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,34 @@ BackfillState::PrimaryScanning::react(PrimaryScanned evt)
407407
LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned);
408408
DEBUGDPP("", pg());
409409
backfill_state().backfill_info = std::move(evt.result);
410-
return transit<Enqueuing>();
410+
if (!backfill_state().is_suspended()) {
411+
return transit<Enqueuing>();
412+
} else {
413+
DEBUGDPP("backfill suspended, not going Enqueuing", pg());
414+
backfill_state().go_enqueuing_on_resume();
415+
}
416+
return discard_event();
417+
}
418+
419+
// CancelBackfill received while the primary scan is in flight.
//
// The backfill is only flagged as suspended (on_suspended() asserts that it
// was not suspended already); the event is then discarded, so no state
// transition is requested from here.
boost::statechart::result BackfillState::PrimaryScanning::react(CancelBackfill evt)
{
  LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill);
  DEBUGDPP("suspended within PrimaryScanning", pg());

  // Record the suspension; reactions that would otherwise enter Enqueuing
  // consult this flag before transiting.
  backfill_state().on_suspended();

  return discard_event();
}
427+
428+
// Triggered received while in PrimaryScanning: resume a suspended backfill.
//
// Asserts that the backfill is currently suspended, clears the suspension,
// and transits to Enqueuing only if such a transition was postponed while
// suspended (see go_enqueuing_on_resume()). Otherwise the event is discarded.
boost::statechart::result BackfillState::PrimaryScanning::react(Triggered evt)
{
  LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered);
  ceph_assert(backfill_state().is_suspended());

  // on_resumed() resets the suspend bookkeeping and reports whether a move
  // to Enqueuing had been deferred.
  const bool enqueuing_was_deferred = backfill_state().on_resumed();
  if (!enqueuing_was_deferred) {
    // Nothing was postponed; presumably the scan result is still
    // outstanding -- verify against the PrimaryScanned reaction.
    return discard_event();
  }

  DEBUGDPP("Backfill resumed, going Enqueuing", pg());
  return transit<Enqueuing>();
}
412439

413440
boost::statechart::result
@@ -470,21 +497,40 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
470497
if (waiting_on_backfill.empty()) {
471498
ceph_assert(backfill_state().peer_backfill_info.size() == \
472499
peering_state().get_backfill_targets().size());
473-
return transit<Enqueuing>();
500+
if (!backfill_state().is_suspended()) {
501+
return transit<Enqueuing>();
502+
} else {
503+
DEBUGDPP("backfill suspended, not going Enqueuing", pg());
504+
backfill_state().go_enqueuing_on_resume();
505+
}
474506
}
475507
} else {
476-
// we canceled backfill for a while due to a too full, and this
508+
// we suspended backfill for a while due to a too full, and this
477509
// is an extra response from a non-too-full peer
478-
DEBUGDPP("canceled backfill (too full?)", pg());
510+
DEBUGDPP("suspended backfill (too full?)", pg());
479511
}
480512
return discard_event();
481513
}
482514

483515
// CancelBackfill received while replica scans are in flight.
//
// The backfill is only flagged as suspended (on_suspended() asserts that it
// was not suspended already); the event is then discarded, so no state
// transition is requested from here.
boost::statechart::result BackfillState::ReplicasScanning::react(CancelBackfill evt)
{
  LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill);
  DEBUGDPP("suspended within ReplicasScanning", pg());

  // Record the suspension; reactions that would otherwise enter Enqueuing
  // consult this flag before transiting.
  backfill_state().on_suspended();

  return discard_event();
}
523+
524+
// Triggered received while in ReplicasScanning: resume a suspended backfill.
//
// Asserts that the backfill is currently suspended, clears the suspension,
// and transits to Enqueuing only if such a transition was postponed while
// suspended (see go_enqueuing_on_resume()). Otherwise the event is discarded.
boost::statechart::result BackfillState::ReplicasScanning::react(Triggered evt)
{
  LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered);
  ceph_assert(backfill_state().is_suspended());

  // on_resumed() resets the suspend bookkeeping and reports whether a move
  // to Enqueuing had been deferred.
  const bool enqueuing_was_deferred = backfill_state().on_resumed();
  if (!enqueuing_was_deferred) {
    // Nothing was postponed; presumably replica scan replies are still
    // outstanding -- verify against the ReplicaScanned reaction.
    return discard_event();
  }

  DEBUGDPP("Backfill resumed, going Enqueuing", pg());
  return transit<Enqueuing>();
}
489535

490536
boost::statechart::result
@@ -510,7 +556,34 @@ BackfillState::Waiting::react(ObjectPushed evt)
510556
LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed);
511557
DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object);
512558
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false);
513-
return transit<Enqueuing>();
559+
if (!backfill_state().is_suspended()) {
560+
return transit<Enqueuing>();
561+
} else {
562+
DEBUGDPP("backfill suspended, not going Enqueuing", pg());
563+
backfill_state().go_enqueuing_on_resume();
564+
}
565+
return discard_event();
566+
}
567+
568+
// CancelBackfill received while in the Waiting state.
//
// The backfill is only flagged as suspended (on_suspended() asserts that it
// was not suspended already); the event is then discarded, so no state
// transition is requested from here.
boost::statechart::result BackfillState::Waiting::react(CancelBackfill evt)
{
  LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill);
  DEBUGDPP("suspended within Waiting", pg());

  // Record the suspension; reactions that would otherwise enter Enqueuing
  // consult this flag before transiting.
  backfill_state().on_suspended();

  return discard_event();
}
576+
577+
// Triggered received while in Waiting: resume a suspended backfill.
//
// Asserts that the backfill is currently suspended, clears the suspension,
// and transits to Enqueuing only if such a transition was postponed while
// suspended (see go_enqueuing_on_resume()). Otherwise the event is discarded.
boost::statechart::result BackfillState::Waiting::react(Triggered evt)
{
  LOG_PREFIX(BackfillState::Waiting::react::Triggered);
  ceph_assert(backfill_state().is_suspended());

  // on_resumed() resets the suspend bookkeeping and reports whether a move
  // to Enqueuing had been deferred.
  const bool enqueuing_was_deferred = backfill_state().on_resumed();
  if (!enqueuing_was_deferred) {
    // Nothing was postponed; presumably an ObjectPushed is still
    // outstanding -- verify against the ObjectPushed reaction.
    return discard_event();
  }

  DEBUGDPP("Backfill resumed, going Enqueuing", pg());
  return transit<Enqueuing>();
}
515588

516589
// -- Done

src/crimson/osd/backfill_state.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,11 +210,15 @@ struct BackfillState {
210210
sc::custom_reaction<ObjectPushed>,
211211
sc::custom_reaction<PrimaryScanned>,
212212
sc::transition<RequestDone, Done>,
213+
sc::custom_reaction<CancelBackfill>,
214+
sc::custom_reaction<Triggered>,
213215
sc::transition<sc::event_base, Crashed>>;
214216
explicit PrimaryScanning(my_context);
215217
sc::result react(ObjectPushed);
216218
// collect scanning result and transit to Enqueuing.
217219
sc::result react(PrimaryScanned);
220+
sc::result react(CancelBackfill);
221+
sc::result react(Triggered);
218222
};
219223

220224
struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
@@ -223,6 +227,7 @@ struct BackfillState {
223227
sc::custom_reaction<ObjectPushed>,
224228
sc::custom_reaction<ReplicaScanned>,
225229
sc::custom_reaction<CancelBackfill>,
230+
sc::custom_reaction<Triggered>,
226231
sc::transition<RequestDone, Done>,
227232
sc::transition<sc::event_base, Crashed>>;
228233
explicit ReplicasScanning(my_context);
@@ -231,6 +236,7 @@ struct BackfillState {
231236
sc::result react(ObjectPushed);
232237
sc::result react(ReplicaScanned);
233238
sc::result react(CancelBackfill);
239+
sc::result react(Triggered);
234240

235241
// indicate whether a particular peer should be scanned to retrieve
236242
// BackfillInterval for new range of hobject_t namespace.
@@ -249,9 +255,13 @@ struct BackfillState {
249255
using reactions = boost::mpl::list<
250256
sc::custom_reaction<ObjectPushed>,
251257
sc::transition<RequestDone, Done>,
258+
sc::custom_reaction<CancelBackfill>,
259+
sc::custom_reaction<Triggered>,
252260
sc::transition<sc::event_base, Crashed>>;
253261
explicit Waiting(my_context);
254262
sc::result react(ObjectPushed);
263+
sc::result react(CancelBackfill);
264+
sc::result react(Triggered);
255265
};
256266

257267
struct Done : sc::state<Done, BackfillMachine>,
@@ -296,6 +306,26 @@ struct BackfillState {
296306
}
297307
}
298308
private:
309+
struct backfill_suspend_state_t {
310+
bool suspended = false;
311+
bool should_go_enqueuing = false;
312+
} backfill_suspend_state;
313+
bool is_suspended() const {
314+
return backfill_suspend_state.suspended;
315+
}
316+
void on_suspended() {
317+
ceph_assert(!is_suspended());
318+
backfill_suspend_state = {true, false};
319+
}
320+
bool on_resumed() {
321+
auto go_enqueuing = backfill_suspend_state.should_go_enqueuing;
322+
backfill_suspend_state = {false, false};
323+
return go_enqueuing;
324+
}
325+
void go_enqueuing_on_resume() {
326+
ceph_assert(is_suspended());
327+
backfill_suspend_state.should_go_enqueuing = true;
328+
}
299329
hobject_t last_backfill_started;
300330
BackfillInterval backfill_info;
301331
std::map<pg_shard_t, BackfillInterval> peer_backfill_info;

0 commit comments

Comments
 (0)