Skip to content

Commit 699dd28

Browse files
committed
osd/scrub: fix job requeue conditions
Previous commits handled the following two cases correctly: - requeueing a scrub job while the OSD is still the primary, and - not restoring the scrub job to the queue if the PG is not there; Here we handle the missed scenario: the PG is there (we were able to lock it), but is no longer the primary. Also - a configuration change must not cause a re-queue of a scrub-job for a PG that is in the middle of scrubbing. Signed-off-by: Ronen Friedman <[email protected]>
1 parent 2187c49 commit 699dd28

File tree

2 files changed

+34
-6
lines changed

2 files changed

+34
-6
lines changed

src/osd/PG.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,7 +1363,7 @@ double PG::next_deepscrub_interval() const
13631363

13641364
void PG::on_scrub_schedule_input_change(Scrub::delay_ready_t delay_ready)
13651365
{
1366-
if (is_active() && is_primary()) {
1366+
if (is_active() && is_primary() && !is_scrub_queued_or_active()) {
13671367
dout(10) << fmt::format(
13681368
"{}: active/primary. delay_ready={:c}", __func__,
13691369
(delay_ready == Scrub::delay_ready_t::delay_ready) ? 't'
@@ -1372,7 +1372,10 @@ void PG::on_scrub_schedule_input_change(Scrub::delay_ready_t delay_ready)
13721372
ceph_assert(m_scrubber);
13731373
m_scrubber->update_scrub_job(delay_ready);
13741374
} else {
1375-
dout(10) << fmt::format("{}: inactive or non-primary", __func__) << dendl;
1375+
dout(10) << fmt::format(
1376+
"{}: inactive, non-primary - or already scrubbing",
1377+
__func__)
1378+
<< dendl;
13761379
}
13771380
}
13781381

src/osd/scrubber/pg_scrubber.cc

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2093,6 +2093,15 @@ void PgScrubber::on_digest_updates()
20932093
*/
20942094
void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
20952095
{
2096+
if (!m_scrub_job->is_registered()) {
2097+
dout(10) << fmt::format(
2098+
"{}: PG not registered for scrubbing on this OSD. Won't "
2099+
"requeue!",
2100+
__func__)
2101+
<< dendl;
2102+
return;
2103+
}
2104+
20962105
// assuming we can still depend on the 'scrubbing' flag being set;
20972106
// Also on Queued&Active.
20982107

@@ -2113,7 +2122,6 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
21132122

21142123
m_scrub_job->merge_and_delay(
21152124
m_active_target->schedule, issue, m_planned_scrub, ceph_clock_now());
2116-
ceph_assert(m_scrub_job->is_registered());
21172125
ceph_assert(!m_scrub_job->target_queued);
21182126
m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
21192127
m_scrub_job->target_queued = true;
@@ -2122,9 +2130,16 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
21222130

21232131
void PgScrubber::requeue_penalized(Scrub::delay_cause_t cause)
21242132
{
2133+
if (!m_scrub_job->is_registered()) {
2134+
dout(10) << fmt::format(
2135+
"{}: PG not registered for scrubbing on this OSD. Won't "
2136+
"requeue!",
2137+
__func__)
2138+
<< dendl;
2139+
return;
2140+
}
21252141
/// \todo fix the 5s' to use a cause-specific delay parameter
21262142
m_scrub_job->delay_on_failure(5s, cause, ceph_clock_now());
2127-
ceph_assert(m_scrub_job->is_registered());
21282143
ceph_assert(!m_scrub_job->target_queued);
21292144
m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
21302145
m_scrub_job->target_queued = true;
@@ -2148,9 +2163,19 @@ Scrub::schedule_result_t PgScrubber::start_scrub_session(
21482163

21492164
m_active_target = std::move(candidate);
21502165

2166+
if (!is_primary() || !m_pg->is_active()) {
2167+
// the PG is not expected to be 'registered' in this state. And we should
2168+
// not attempt to queue it.
2169+
dout(10) << __func__ << ": cannot scrub (not an active primary)"
2170+
<< dendl;
2171+
return schedule_result_t::target_specific_failure;
2172+
}
2173+
21512174
// for all other failures - we must reinstate our entry in the Scrub Queue
2152-
if (!is_primary() || !m_pg->is_active() || !m_pg->is_clean()) {
2153-
dout(10) << __func__ << ": cannot scrub (not a clean and active primary)"
2175+
if (!m_pg->is_clean()) {
2176+
dout(10) << fmt::format(
2177+
"{}: cannot scrub (not clean). Registered?{:c}", __func__,
2178+
m_scrub_job->is_registered() ? 'Y' : 'n')
21542179
<< dendl;
21552180
requeue_penalized(Scrub::delay_cause_t::pg_state);
21562181
return schedule_result_t::target_specific_failure;

0 commit comments

Comments
 (0)