Skip to content

Commit bde52eb

Browse files
committed
osd/scrub: add a "clean primary" base state
... to the scrubber state machine. Similar to ReplicaActive, this state is entered after the peering is concluded and the PG is set to be Primary, active & clean. Signed-off-by: Ronen Friedman <[email protected]>
1 parent c31b795 commit bde52eb

File tree

7 files changed

+193
-48
lines changed

7 files changed

+193
-48
lines changed

src/osd/PG.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1861,6 +1861,16 @@ void PG::on_active_exit()
18611861
agent_stop();
18621862
}
18631863

1864+
Context* PG::on_clean()
1865+
{
1866+
if (is_active()) {
1867+
kick_snap_trim();
1868+
}
1869+
m_scrubber->on_primary_active_clean();
1870+
requeue_ops(waiting_for_clean_to_primary_repair);
1871+
return finish_recovery();
1872+
}
1873+
18641874
void PG::on_active_advmap(const OSDMapRef &osdmap)
18651875
{
18661876
const auto& new_removed_snaps = osdmap->get_new_removed_snaps();

src/osd/PG.h

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -608,13 +608,7 @@ class PG : public DoutPrefixProvider,
608608

609609
void on_active_exit() override;
610610

611-
Context *on_clean() override {
612-
if (is_active()) {
613-
kick_snap_trim();
614-
}
615-
requeue_ops(waiting_for_clean_to_primary_repair);
616-
return finish_recovery();
617-
}
611+
Context *on_clean() override;
618612

619613
void on_activate(interval_set<snapid_t> snaps) override;
620614

src/osd/scrubber/pg_scrubber.cc

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
152152

153153
// if we were not aware of the abort before - kill the scrub.
154154
if (epoch_to_verify >= m_last_aborted) {
155-
scrub_clear_state();
155+
m_fsm->process_event(FullReset{});
156156
m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
157157
}
158158
return false;
@@ -407,7 +407,7 @@ void PgScrubber::reset_epoch(epoch_t epoch_queued)
407407
{
408408
dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB)
409409
<< dendl;
410-
m_fsm->assert_not_active();
410+
m_fsm->assert_not_in_session();
411411

412412
m_epoch_start = epoch_queued;
413413
ceph_assert(m_is_deep == state_test(PG_STATE_DEEP_SCRUB));
@@ -462,6 +462,11 @@ void PgScrubber::on_new_interval()
462462
<< dendl;
463463

464464
m_fsm->process_event(IntervalChanged{});
465+
// the following asserts were added due to a past bug, where PG flags were
466+
// left set in some scenarios.
467+
ceph_assert(!is_queued_or_active());
468+
ceph_assert(!state_test(PG_STATE_SCRUBBING));
469+
ceph_assert(!state_test(PG_STATE_DEEP_SCRUB));
465470
rm_from_osd_scrubbing();
466471
}
467472

@@ -517,6 +522,14 @@ void PgScrubber::on_pg_activate(const requested_scrub_t& request_flags)
517522
<< dendl;
518523
}
519524

525+
void PgScrubber::on_primary_active_clean()
526+
{
527+
dout(10) << fmt::format(
528+
"{}: reg state: {}", __func__, m_scrub_job->state_desc())
529+
<< dendl;
530+
m_fsm->process_event(PrimaryActivate{});
531+
}
532+
520533
/*
521534
* A note re the call to publish_stats_to_osd() below:
522535
* - we are called from either request_rescrubbing() or scrub_requested().

src/osd/scrubber/pg_scrubber.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,8 @@ class PgScrubber : public ScrubPgIF,
314314

315315
void on_new_interval() final;
316316

317+
void on_primary_active_clean() final;
318+
317319
void on_replica_activate() final;
318320

319321
void scrub_clear_state() final;

src/osd/scrubber/scrub_machine.cc

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@ void on_event_discard(std::string_view nm)
4747
dout(20) << " event: --^^^^---- " << nm << dendl;
4848
}
4949

50-
void ScrubMachine::assert_not_active() const
50+
void ScrubMachine::assert_not_in_session() const
5151
{
52-
ceph_assert(state_cast<const NotActive*>());
52+
ceph_assert(!state_cast<const Session*>());
5353
}
5454

5555
bool ScrubMachine::is_reserving() const
@@ -114,27 +114,64 @@ NotActive::NotActive(my_context ctx)
114114
scrbr->clear_queued_or_active();
115115
}
116116

117-
sc::result NotActive::react(const StartScrub&)
117+
118+
// ----------------------- PrimaryActive --------------------------------
119+
120+
PrimaryActive::PrimaryActive(my_context ctx)
121+
: my_base(ctx)
122+
, NamedSimply(context<ScrubMachine>().m_scrbr, "PrimaryActive")
123+
{
124+
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
125+
dout(10) << "-- state -->> PrimaryActive" << dendl;
126+
}
127+
128+
PrimaryActive::~PrimaryActive()
129+
{
130+
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
131+
// we may have set some PG state flags without reaching Session.
132+
// And we may be holding a 'local resource'.
133+
scrbr->clear_pgscrub_state();
134+
scrbr->rm_from_osd_scrubbing();
135+
}
136+
137+
138+
// ---------------- PrimaryActive/PrimaryIdle ---------------------------
139+
140+
PrimaryIdle::PrimaryIdle(my_context ctx)
141+
: my_base(ctx)
142+
, NamedSimply(context<ScrubMachine>().m_scrbr, "PrimaryActive/PrimaryIdle")
143+
{
144+
dout(10) << "-- state -->> PrimaryActive/PrimaryIdle" << dendl;
145+
}
146+
147+
sc::result PrimaryIdle::react(const StartScrub&)
118148
{
119-
dout(10) << "NotActive::react(const StartScrub&)" << dendl;
149+
dout(10) << "PrimaryIdle::react(const StartScrub&)" << dendl;
120150
DECLARE_LOCALS;
121151
return transit<ReservingReplicas>();
122152
}
123153

124-
sc::result NotActive::react(const AfterRepairScrub&)
154+
sc::result PrimaryIdle::react(const AfterRepairScrub&)
125155
{
126-
dout(10) << "NotActive::react(const AfterRepairScrub&)" << dendl;
156+
dout(10) << "PrimaryIdle::react(const AfterRepairScrub&)" << dendl;
127157
DECLARE_LOCALS;
128158
return transit<ReservingReplicas>();
129159
}
130160

161+
void PrimaryIdle::clear_state(const FullReset&) {
162+
dout(10) << "PrimaryIdle::react(const FullReset&): clearing state flags"
163+
<< dendl;
164+
DECLARE_LOCALS;
165+
scrbr->clear_pgscrub_state();
166+
}
167+
131168
// ----------------------- Session -----------------------------------------
132169

133170
Session::Session(my_context ctx)
134171
: my_base(ctx)
135-
, NamedSimply(context<ScrubMachine>().m_scrbr, "Session")
172+
, NamedSimply(context<ScrubMachine>().m_scrbr, "PrimaryActive/Session")
136173
{
137-
dout(10) << "-- state -->> Session" << dendl;
174+
dout(10) << "-- state -->> PrimaryActive/Session" << dendl;
138175
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
139176

140177
// while we've checked the 'someone is reserving' flag before queueing
@@ -242,7 +279,7 @@ sc::result ReservingReplicas::react(const ReplicaReject& ev)
242279
scrbr->flag_reservations_failure();
243280

244281
// 'Session' state dtor stops the scrubber
245-
return transit<NotActive>();
282+
return transit<PrimaryIdle>();
246283
}
247284

248285
sc::result ReservingReplicas::react(const ReservationTimeout&)
@@ -261,7 +298,7 @@ sc::result ReservingReplicas::react(const ReservationTimeout&)
261298
// cause the scrubber to stop the scrub session, marking 'reservation
262299
// failure' as the cause (affecting future scheduling)
263300
scrbr->flag_reservations_failure();
264-
return transit<NotActive>();
301+
return transit<PrimaryIdle>();
265302
}
266303

267304
// ----------------------- ActiveScrubbing -----------------------------------
@@ -678,7 +715,7 @@ sc::result WaitDigestUpdate::react(const ScrubFinished&)
678715
session.m_session_started_at = ScrubTimePoint{};
679716

680717
scrbr->scrub_finish();
681-
return transit<NotActive>();
718+
return transit<PrimaryIdle>();
682719
}
683720

684721
ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
@@ -813,6 +850,11 @@ ReplicaIdle::ReplicaIdle(my_context ctx)
813850
dout(10) << "-- state -->> ReplicaActive/ReplicaIdle" << dendl;
814851
}
815852

853+
void ReplicaIdle::reset_ignored(const FullReset&)
854+
{
855+
dout(10) << "ReplicaIdle::react(const FullReset&): FullReset ignored"
856+
<< dendl;
857+
}
816858

817859
// ------------- ReplicaActive/ReplicaActiveOp --------------------------
818860

0 commit comments

Comments
 (0)