Skip to content

Commit 0de916d

Browse files
committed
osd/scrub: modify deep scrub interval randomization
The interaction between the various configuration parameters controlling the scheduling of deep scrubs is neither clearly defined nor clearly documented. The existing set of parameters creates unnecessary code complexity, is surprising to operators, and does not provide the level of control desired by Ceph users. This is a proposed change to the deep scrub interval randomization. Prior to this PR, deep scrub scheduling is controlled by the following set of parameters: The desired interval between deep scrubs is determined by osd_deep_scrub_interval. To prevent a "thundering herd" problem if multiple PGs were created at the same time, a randomization effect was added: at a configurable frequency, a shallow scrub is "upgraded" to a deep scrub. As mentioned above, the interaction between these parameters isn't always clear to operators. But the main issue is its effect on code complexity and design choices (as it is never known in advance whether the next scrub will be deep or shallow). Here we change the randomization method, decoupling it from shallow scrub scheduling. In the new method, deep scrubs are scheduled, on average, at the desired interval. The actual time is drawn from a normal distribution with a coefficient of variation (CV) of osd_deep_scrub_interval_cv (clamped to reasonable values). Signed-off-by: Ronen Friedman <[email protected]>
1 parent d3a1626 commit 0de916d

File tree

4 files changed

+53
-24
lines changed

4 files changed

+53
-24
lines changed

src/common/options/osd.yaml.in

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -440,17 +440,32 @@ options:
440440
type: float
441441
level: advanced
442442
desc: Deep scrub each PG (i.e., verify data checksums) at least this often
443-
fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
444-
``osd_scrub_load_threshold`` does not affect this setting.
443+
fmt_desc: The interval for "deep" scrubbing (fully reading all data).
445444
default: 7_day
446445
with_legacy: true
446+
- name: osd_deep_scrub_interval_cv
447+
type: float
448+
level: advanced
449+
desc: Determines the amount of variation in the deep scrub interval
450+
long_desc: deep scrub intervals are varied by a random amount to prevent
451+
stampedes. This parameter determines the amount of variation.
452+
Technically - osd_deep_scrub_interval_cv is the coefficient of variation for
453+
the deep scrub interval.
454+
fmt_desc: The coefficient of variation for the deep scrub interval, specified as a
455+
ratio. On average, the next deep scrub for a PG is scheduled osd_deep_scrub_interval
456+
after the last deep scrub. The actual time is randomized to a normal distribution
457+
with a standard deviation of osd_deep_scrub_interval * osd_deep_scrub_interval_cv
458+
(clamped to within 2 standard deviations).
459+
The default value means that about 68% of the deep scrubs will be scheduled in the range
460+
[0.8 * osd_deep_scrub_interval, 1.2 * osd_deep_scrub_interval], and all of them within [0.6 * osd_deep_scrub_interval, 1.4 * osd_deep_scrub_interval].
461+
min: 0
462+
max: 0.4
463+
default: 0.2
464+
with_legacy: false
447465
- name: osd_deep_scrub_randomize_ratio
448466
type: float
449467
level: advanced
450-
desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
451-
are deep)
452-
long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
453-
are uniformly distributed over the week
468+
desc: deprecated. Has no effect.
454469
default: 0.15
455470
with_legacy: true
456471
- name: osd_deep_scrub_stride

src/osd/scrubber/pg_scrubber.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,7 @@ bool PgScrubber::reserve_local(const Scrub::SchedTarget& trgt)
753753
Scrub::sched_conf_t PgScrubber::populate_config_params() const
754754
{
755755
const pool_opts_t& pool_conf = m_pg->get_pgpool().info.opts;
756-
auto& conf = get_pg_cct()->_conf; // for brevity
756+
const auto& conf = get_pg_cct()->_conf; // for brevity
757757
Scrub::sched_conf_t configs;
758758

759759
// deep-scrub optimal interval
@@ -792,7 +792,7 @@ Scrub::sched_conf_t PgScrubber::populate_config_params() const
792792
std::max(configs.max_shallow.value_or(0.0), configs.deep_interval);
793793

794794
configs.interval_randomize_ratio = conf->osd_scrub_interval_randomize_ratio;
795-
configs.deep_randomize_ratio = conf->osd_deep_scrub_randomize_ratio;
795+
configs.deep_randomize_ratio = conf.get_val<double>("osd_deep_scrub_interval_cv");
796796
configs.mandatory_on_invalid = conf->osd_scrub_invalid_stats;
797797

798798
dout(15) << fmt::format("{}: updated config:{}", __func__, configs) << dendl;
@@ -2608,9 +2608,7 @@ PgScrubber::PgScrubber(PG* pg)
26082608
{
26092609
m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
26102610
m_fsm->initiate();
2611-
2612-
m_scrub_job = std::make_optional<Scrub::ScrubJob>(
2613-
m_osds->cct, m_pg->pg_id, m_osds->get_nodeid());
2611+
m_scrub_job.emplace(m_osds->cct, m_pg->pg_id, m_osds->get_nodeid());
26142612
}
26152613

26162614
void PgScrubber::set_scrub_duration(std::chrono::milliseconds duration)

src/osd/scrubber/scrub_job.cc

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, int node_id)
6565
, shallow_target{pg, scrub_level_t::shallow}
6666
, deep_target{pg, scrub_level_t::deep}
6767
, cct{cct}
68+
, random_gen{random_dev()}
6869
, log_msg_prefix{fmt::format("osd.{} scrub-job:pg[{}]:", node_id, pgid)}
6970
{}
7071

@@ -240,6 +241,7 @@ utime_t ScrubJob::get_sched_time() const
240241
return earliest_target().sched_info.schedule.not_before;
241242
}
242243

244+
243245
void ScrubJob::adjust_deep_schedule(
244246
utime_t last_deep,
245247
const Scrub::sched_conf_t& app_conf,
@@ -256,13 +258,7 @@ void ScrubJob::adjust_deep_schedule(
256258

257259
auto& dp_times = deep_target.sched_info.schedule; // shorthand
258260

259-
if (!ScrubJob::requires_randomization(deep_target.urgency())) {
260-
// the target time is already set. Make sure to reset the n.b. and
261-
// the (irrelevant) deadline
262-
dp_times.not_before = dp_times.scheduled_at;
263-
dp_times.deadline = dp_times.scheduled_at;
264-
265-
} else {
261+
if (ScrubJob::requires_randomization(deep_target.urgency())) {
266262
utime_t adj_not_before = last_deep;
267263
utime_t adj_target = last_deep;
268264
dp_times.deadline = adj_target;
@@ -271,10 +267,18 @@ void ScrubJob::adjust_deep_schedule(
271267
// scrubs that are not already eligible for scrubbing.
272268
if ((modify_ready_targets == delay_ready_t::delay_ready) ||
273269
adj_not_before > scrub_clock_now) {
274-
adj_target += app_conf.deep_interval;
275-
double r = rand() / (double)RAND_MAX;
276-
adj_target += app_conf.deep_interval * app_conf.interval_randomize_ratio *
277-
r; // RRR fix
270+
double sdv = app_conf.deep_interval * app_conf.deep_randomize_ratio;
271+
std::normal_distribution<double> normal_dist{app_conf.deep_interval, sdv};
272+
auto next_delay = std::clamp(
273+
normal_dist(random_gen), app_conf.deep_interval - 2 * sdv,
274+
app_conf.deep_interval + 2 * sdv);
275+
adj_target += next_delay;
276+
dout(20) << fmt::format(
277+
"deep scrubbing: next_delay={:.0f} (interval={:.0f}, "
278+
"ratio={:.3f}), adjusted:{:s}",
279+
next_delay, app_conf.deep_interval,
280+
app_conf.deep_randomize_ratio, adj_target)
281+
<< dendl;
278282
}
279283

280284
// the deadline can be updated directly into the scrub-job
@@ -288,6 +292,11 @@ void ScrubJob::adjust_deep_schedule(
288292
}
289293
dp_times.scheduled_at = adj_target;
290294
dp_times.not_before = adj_not_before;
295+
} else {
296+
// the target time is already set. Make sure to reset the n.b. and
297+
// the (irrelevant) deadline
298+
dp_times.not_before = dp_times.scheduled_at;
299+
dp_times.deadline = dp_times.scheduled_at;
291300
}
292301

293302
dout(10) << fmt::format(

src/osd/scrubber/scrub_job.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <compare>
77
#include <iostream>
88
#include <memory>
9+
#include <random>
910
#include <vector>
1011

1112
#include "common/ceph_atomic.h"
@@ -65,8 +66,9 @@ struct sched_conf_t {
6566

6667
/**
6768
* a randomization factor aimed at preventing 'thundering herd' problems
68-
* upon deep-scrubs common intervals. If polling a random number smaller
69-
* than that percentage, the next shallow scrub is upgraded to deep.
69+
* upon deep-scrubs common intervals. The actual deep scrub interval will
70+
* be selected with a normal distribution around the configured interval,
71+
* with a standard deviation of <deep_randomize_ratio> * <interval>.
7072
*/
7173
double deep_randomize_ratio{0.0};
7274

@@ -168,6 +170,11 @@ class ScrubJob {
168170

169171
CephContext* cct;
170172

173+
/// random generator for the randomization of the scrub times
174+
/// \todo consider using one common generator in the OSD service
175+
std::random_device random_dev;
176+
std::mt19937 random_gen;
177+
171178
ScrubJob(CephContext* cct, const spg_t& pg, int node_id);
172179

173180
/**

0 commit comments

Comments
 (0)