Skip to content

Commit ee8e415

Browse files
authored
Merge pull request ceph#55107 from ronen-fr/wip-rf-rm-penaltyq
osd/scrub: remove the 'penalty queue' from the scrubber
Reviewed-by: Samuel Just <[email protected]>
2 parents 27a0fc5 + 375d01a commit ee8e415

File tree

15 files changed

+238
-188
lines changed

15 files changed

+238
-188
lines changed

src/osd/PG.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,9 +1342,12 @@ Scrub::schedule_result_t PG::start_scrubbing(
13421342
<< dendl;
13431343
ceph_assert(ceph_mutex_is_locked(_lock));
13441344

1345+
// recheck PG status (as the PG was unlocked for a time after being selected
1346+
// for scrubbing)
13451347
if (!is_primary() || !is_active() || !is_clean()) {
13461348
dout(10) << __func__ << ": cannot scrub (not a clean and active primary)"
13471349
<< dendl;
1350+
m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::pg_state);
13481351
return schedule_result_t::target_specific_failure;
13491352
}
13501353

@@ -1361,6 +1364,7 @@ Scrub::schedule_result_t PG::start_scrubbing(
13611364
<< ": skipping this PG as repairing was not explicitly "
13621365
"requested for it"
13631366
<< dendl;
1367+
m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::scrub_params);
13641368
return schedule_result_t::target_specific_failure;
13651369
}
13661370

@@ -1369,6 +1373,7 @@ Scrub::schedule_result_t PG::start_scrubbing(
13691373
// (on the transition from NotTrimming to Trimming/WaitReservation),
13701374
// i.e. some time before setting 'snaptrim'.
13711375
dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl;
1376+
m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::pg_state);
13721377
return schedule_result_t::target_specific_failure;
13731378
}
13741379

@@ -1381,13 +1386,15 @@ Scrub::schedule_result_t PG::start_scrubbing(
13811386
// (due to configuration or priority issues)
13821387
// The reason was already reported by the callee.
13831388
dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
1389+
m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::scrub_params);
13841390
return schedule_result_t::target_specific_failure;
13851391
}
13861392

13871393
// try to reserve the local OSD resources. If failing: no harm. We will
13881394
// be retried by the OSD later on.
13891395
if (!m_scrubber->reserve_local()) {
13901396
dout(10) << __func__ << ": failed to reserve locally" << dendl;
1397+
m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::local_resources);
13911398
return schedule_result_t::osd_wide_failure;
13921399
}
13931400

src/osd/PG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ class PG : public DoutPrefixProvider,
281281
recovery_state.update_stats(
282282
[t](auto &history, auto &stats) {
283283
set_last_deep_scrub_stamp(t, history, stats);
284+
set_last_scrub_stamp(t, history, stats);
284285
return true;
285286
});
286287
on_scrub_schedule_input_change();

src/osd/osd_types.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ struct pg_t {
403403
uint32_t m_seed;
404404

405405
pg_t() : m_pool(0), m_seed(0) {}
406-
pg_t(ps_t seed, uint64_t pool) :
406+
constexpr pg_t(ps_t seed, uint64_t pool) :
407407
m_pool(pool), m_seed(seed) {}
408408
// cppcheck-suppress noExplicitConstructor
409409
pg_t(const ceph_pg& cpg) :
@@ -521,7 +521,7 @@ struct spg_t {
521521
pg_t pgid;
522522
shard_id_t shard;
523523
spg_t() : shard(shard_id_t::NO_SHARD) {}
524-
spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
524+
constexpr spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
525525
explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
526526
auto operator<=>(const spg_t&) const = default;
527527
unsigned get_split_bits(unsigned pg_num) const {

src/osd/scrubber/osd_scrub.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -429,11 +429,22 @@ Scrub::sched_params_t OsdScrub::determine_scrub_time(
429429

430430
void OsdScrub::update_job(
431431
Scrub::ScrubJobRef sjob,
432-
const Scrub::sched_params_t& suggested)
432+
const Scrub::sched_params_t& suggested,
433+
bool reset_notbefore)
433434
{
434-
m_queue.update_job(sjob, suggested);
435+
m_queue.update_job(sjob, suggested, reset_notbefore);
435436
}
436437

438+
void OsdScrub::delay_on_failure(
439+
Scrub::ScrubJobRef sjob,
440+
std::chrono::seconds delay,
441+
Scrub::delay_cause_t delay_cause,
442+
utime_t now_is)
443+
{
444+
m_queue.delay_on_failure(sjob, delay, delay_cause, now_is);
445+
}
446+
447+
437448
void OsdScrub::register_with_osd(
438449
Scrub::ScrubJobRef sjob,
439450
const Scrub::sched_params_t& suggested)

src/osd/scrubber/osd_scrub.h

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,21 +90,23 @@ class OsdScrub {
9090
* the registration will be with "beginning of time" target, making the
9191
* scrub-job eligible to immediate scrub (given that external conditions
9292
* do not prevent scrubbing)
93-
*
9493
* - 'must' is asserted, and the suggested time is 'now':
9594
* This happens if our stats are unknown. The results are similar to the
9695
* previous scenario.
97-
*
9896
* - not a 'must': we take the suggested time as a basis, and add to it some
9997
* configuration / random delays.
100-
*
10198
* ('must' is Scrub::sched_params_t.is_must)
10299
*
100+
* 'reset_notbefore' is used to reset the 'not_before' time to the updated
101+
* 'scheduled_at' time. This is used whenever the scrub-job schedule is
102+
* updated for any reason other than a scrub attempt failure.
103+
*
103104
* locking: not using the jobs_lock
104105
*/
105106
void update_job(
106107
Scrub::ScrubJobRef sjob,
107-
const Scrub::sched_params_t& suggested);
108+
const Scrub::sched_params_t& suggested,
109+
bool reset_notbefore);
108110

109111
/**
110112
* Add the scrub job to the list of jobs (i.e. list of PGs) to be periodically
@@ -147,6 +149,17 @@ class OsdScrub {
147149

148150
void clear_reserving_now(spg_t reserving_id);
149151

152+
/**
153+
* push the 'not_before' time out by 'delay' seconds, so that this scrub target
154+
* would not be retried before 'delay' seconds have passed.
155+
*/
156+
void delay_on_failure(
157+
Scrub::ScrubJobRef sjob,
158+
std::chrono::seconds delay,
159+
Scrub::delay_cause_t delay_cause,
160+
utime_t now_is);
161+
162+
150163
/**
151164
* \returns true if the current time is within the scrub time window
152165
*/

0 commit comments

Comments
 (0)