Skip to content

Commit 6db9011

Browse files
authored
Merge pull request ceph#46293 from ronen-fr/wip-rf-sched-test
test/osd: unit-tests for the scrubber scheduler Reviewed-by: Samuel Just <[email protected]> Reviewed-by: Nitzan Mordechai <[email protected]> Reviewed-by: Neha Ojha <[email protected]>
2 parents 9476907 + 30a2e86 commit 6db9011

File tree

7 files changed

+563
-87
lines changed

7 files changed

+563
-87
lines changed

src/osd/OSD.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class MMonGetPurgedSnapsReply;
9494

9595
class OSD;
9696

97-
class OSDService {
97+
class OSDService : public Scrub::ScrubSchedListener {
9898
using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
9999
public:
100100
OSD *osd;
@@ -147,7 +147,7 @@ class OSDService {
147147
superblock = block;
148148
}
149149

150-
int get_nodeid() const { return whoami; }
150+
int get_nodeid() const final { return whoami; }
151151

152152
std::atomic<epoch_t> max_oldest_map;
153153
private:
@@ -290,7 +290,9 @@ class OSDService {
290290
* @param allow_requested_repair_only
291291
* @return a Scrub::schedule_result_t detailing either a success, or the failure reason.
292292
*/
293-
Scrub::schedule_result_t initiate_a_scrub(spg_t pgid, bool allow_requested_repair_only);
293+
Scrub::schedule_result_t initiate_a_scrub(
294+
spg_t pgid,
295+
bool allow_requested_repair_only) final;
294296

295297

296298
private:

src/osd/scrubber/osd_scrub_sched.cc

Lines changed: 76 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
// vim: ts=8 sw=2 smarttab
33
#include "./osd_scrub_sched.h"
44

5-
#include "include/utime_fmt.h"
65
#include "osd/OSD.h"
7-
#include "osd/osd_types_fmt.h"
86

97
#include "pg_scrubber.h"
108

@@ -79,11 +77,12 @@ std::string ScrubQueue::ScrubJob::scheduling_state(utime_t now_is,
7977
#undef dout_context
8078
#define dout_context (cct)
8179
#undef dout_prefix
82-
#define dout_prefix \
83-
*_dout << "osd." << osd_service.whoami << " scrub-queue::" << __func__ << " "
80+
#define dout_prefix \
81+
*_dout << "osd." << osd_service.get_nodeid() << " scrub-queue::" << __func__ \
82+
<< " "
8483

8584

86-
ScrubQueue::ScrubQueue(CephContext* cct, OSDService& osds)
85+
ScrubQueue::ScrubQueue(CephContext* cct, Scrub::ScrubSchedListener& osds)
8786
: cct{cct}
8887
, osd_service{osds}
8988
{
@@ -98,7 +97,7 @@ ScrubQueue::ScrubQueue(CephContext* cct, OSDService& osds)
9897

9998
std::optional<double> ScrubQueue::update_load_average()
10099
{
101-
int hb_interval = cct->_conf->osd_heartbeat_interval;
100+
int hb_interval = conf()->osd_heartbeat_interval;
102101
int n_samples = 60 * 24 * 24;
103102
if (hb_interval > 1) {
104103
n_samples /= hb_interval;
@@ -220,6 +219,45 @@ void ScrubQueue::update_job(ScrubJobRef scrub_job,
220219
scrub_job->update_schedule(adjusted);
221220
}
222221

222+
/**
 * Derive the scheduling parameters for a PG's next scrub.
 *
 * @param request_flags  the scrub-request flags; 'must_scrub' or 'need_auto'
 *                       make the scrub mandatory.
 * @param pg_info        source of the last-scrub stamp and of the
 *                       'stats_invalid' flag.
 * @param pool_conf      pool options, consulted for the pool-specific
 *                       min/max scrub intervals.
 * @return the proposed scrub time, the 'mandatory' marker and - for
 *         non-mandatory scrubs only - the pool's min/max intervals
 *         (0.0 when the pool does not set them).
 *
 * NOTE(review): 'pool_conf' is received by value; a const reference would
 * avoid the copy, but that requires changing the declaration too.
 */
ScrubQueue::sched_params_t ScrubQueue::determine_scrub_time(
  const requested_scrub_t& request_flags,
  const pg_info_t& pg_info,
  const pool_opts_t pool_conf) const
{
  ScrubQueue::sched_params_t res;

  // the flags are streamed (not fmt-formatted), so the message must not
  // carry a "{}" placeholder - it would be printed verbatim
  dout(15) << ": requested_scrub_t: " << request_flags << dendl;

  if (request_flags.must_scrub || request_flags.need_auto) {

    // Set the smallest time that isn't utime_t()
    res.proposed_time = PgScrubber::scrub_must_stamp();
    res.is_must = ScrubQueue::must_scrub_t::mandatory;
    // we do not need the interval data in this case

  } else if (pg_info.stats.stats_invalid &&
	     conf()->osd_scrub_invalid_stats) {

    // invalid stats (and the configuration asks for a scrub in that case):
    // a mandatory scrub, proposed for 'now'
    res.proposed_time = time_now();
    res.is_must = ScrubQueue::must_scrub_t::mandatory;

  } else {

    // a regular (periodic) scrub: start from the last scrub stamp, and
    // hand over the pool-specific intervals (0.0 if not set for the pool)
    res.proposed_time = pg_info.history.last_scrub_stamp;
    res.min_interval = pool_conf.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
    res.max_interval = pool_conf.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
  }

  dout(15) << fmt::format(
		": suggested: {} hist: {} v: {}/{} must: {} pool-min: {}",
		res.proposed_time,
		pg_info.history.last_scrub_stamp,
		static_cast<bool>(pg_info.stats.stats_invalid),
		conf()->osd_scrub_invalid_stats,
		(res.is_must == must_scrub_t::mandatory ? "y" : "n"),
		res.min_interval)
	   << dendl;
  return res;
}
259+
260+
223261
// used under jobs_lock
224262
void ScrubQueue::move_failed_pgs(utime_t now_is)
225263
{
@@ -237,7 +275,7 @@ void ScrubQueue::move_failed_pgs(utime_t now_is)
237275

238276
// determine the penalty time, after which the job should be reinstated
239277
utime_t after = now_is;
240-
after += cct->_conf->osd_scrub_sleep * 2 + utime_t{300'000ms};
278+
after += conf()->osd_scrub_sleep * 2 + utime_t{300'000ms};
241279

242280
// note: currently - not taking 'deadline' into account when determining
243281
// 'penalty_timeout'.
@@ -309,7 +347,7 @@ Scrub::schedule_result_t ScrubQueue::select_pg_and_scrub(
309347
dout(10) << " reg./pen. sizes: " << to_scrub.size() << " / "
310348
<< penalized.size() << dendl;
311349

312-
utime_t now_is = ceph_clock_now();
350+
utime_t now_is = time_now();
313351

314352
preconds.time_permit = scrub_time_permit(now_is);
315353
preconds.load_is_low = scrub_load_below_threshold();
@@ -489,28 +527,28 @@ ScrubQueue::scrub_schedule_t ScrubQueue::adjust_target_time(
489527

490528
if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
491529
dout(20) << "min t: " << times.min_interval
492-
<< " osd: " << cct->_conf->osd_scrub_min_interval
530+
<< " osd: " << conf()->osd_scrub_min_interval
493531
<< " max t: " << times.max_interval
494-
<< " osd: " << cct->_conf->osd_scrub_max_interval << dendl;
532+
<< " osd: " << conf()->osd_scrub_max_interval << dendl;
495533

496534
dout(20) << "at " << sched_n_dead.scheduled_at << " ratio "
497-
<< cct->_conf->osd_scrub_interval_randomize_ratio << dendl;
535+
<< conf()->osd_scrub_interval_randomize_ratio << dendl;
498536
}
499537

500538
if (times.is_must == ScrubQueue::must_scrub_t::not_mandatory) {
501539

502540
// unless explicitly requested, postpone the scrub with a random delay
503541
double scrub_min_interval = times.min_interval > 0
504542
? times.min_interval
505-
: cct->_conf->osd_scrub_min_interval;
543+
: conf()->osd_scrub_min_interval;
506544
double scrub_max_interval = times.max_interval > 0
507545
? times.max_interval
508-
: cct->_conf->osd_scrub_max_interval;
546+
: conf()->osd_scrub_max_interval;
509547

510548
sched_n_dead.scheduled_at += scrub_min_interval;
511549
double r = rand() / (double)RAND_MAX;
512550
sched_n_dead.scheduled_at +=
513-
scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
551+
scrub_min_interval * conf()->osd_scrub_interval_randomize_ratio * r;
514552

515553
if (scrub_max_interval <= 0) {
516554
sched_n_dead.deadline = utime_t{};
@@ -526,15 +564,15 @@ ScrubQueue::scrub_schedule_t ScrubQueue::adjust_target_time(
526564

527565
double ScrubQueue::scrub_sleep_time(bool must_scrub) const
528566
{
529-
double regular_sleep_period = cct->_conf->osd_scrub_sleep;
567+
double regular_sleep_period = conf()->osd_scrub_sleep;
530568

531-
if (must_scrub || scrub_time_permit(ceph_clock_now())) {
569+
if (must_scrub || scrub_time_permit(time_now())) {
532570
return regular_sleep_period;
533571
}
534572

535573
// relevant if scrubbing started during allowed time, but continued into
536574
// forbidden hours
537-
double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
575+
double extended_sleep = conf()->osd_scrub_extended_sleep;
538576
dout(20) << "w/ extended sleep (" << extended_sleep << ")" << dendl;
539577
return std::max(extended_sleep, regular_sleep_period);
540578
}
@@ -550,9 +588,9 @@ bool ScrubQueue::scrub_load_below_threshold() const
550588
// allow scrub if below configured threshold
551589
long cpus = sysconf(_SC_NPROCESSORS_ONLN);
552590
double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
553-
if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
591+
if (loadavg_per_cpu < conf()->osd_scrub_load_threshold) {
554592
dout(20) << "loadavg per cpu " << loadavg_per_cpu << " < max "
555-
<< cct->_conf->osd_scrub_load_threshold << " = yes" << dendl;
593+
<< conf()->osd_scrub_load_threshold << " = yes" << dendl;
556594
return true;
557595
}
558596

@@ -565,7 +603,7 @@ bool ScrubQueue::scrub_load_below_threshold() const
565603
}
566604

567605
dout(20) << "loadavg " << loadavgs[0] << " >= max "
568-
<< cct->_conf->osd_scrub_load_threshold << " and ( >= daily_loadavg "
606+
<< conf()->osd_scrub_load_threshold << " and ( >= daily_loadavg "
569607
<< daily_loadavg << " or >= 15m avg " << loadavgs[2] << ") = no"
570608
<< dendl;
571609
return false;
@@ -616,22 +654,22 @@ bool ScrubQueue::scrub_time_permit(utime_t now) const
616654
time_t tt = now.sec();
617655
localtime_r(&tt, &bdt);
618656

619-
bool day_permit = isbetween_modulo(cct->_conf->osd_scrub_begin_week_day,
620-
cct->_conf->osd_scrub_end_week_day,
657+
bool day_permit = isbetween_modulo(conf()->osd_scrub_begin_week_day,
658+
conf()->osd_scrub_end_week_day,
621659
bdt.tm_wday);
622660
if (!day_permit) {
623661
dout(20) << "should run between week day "
624-
<< cct->_conf->osd_scrub_begin_week_day << " - "
625-
<< cct->_conf->osd_scrub_end_week_day << " now " << bdt.tm_wday
662+
<< conf()->osd_scrub_begin_week_day << " - "
663+
<< conf()->osd_scrub_end_week_day << " now " << bdt.tm_wday
626664
<< " - no" << dendl;
627665
return false;
628666
}
629667

630-
bool time_permit = isbetween_modulo(cct->_conf->osd_scrub_begin_hour,
631-
cct->_conf->osd_scrub_end_hour,
668+
bool time_permit = isbetween_modulo(conf()->osd_scrub_begin_hour,
669+
conf()->osd_scrub_end_hour,
632670
bdt.tm_hour);
633-
dout(20) << "should run between " << cct->_conf->osd_scrub_begin_hour << " - "
634-
<< cct->_conf->osd_scrub_end_hour << " now (" << bdt.tm_hour
671+
dout(20) << "should run between " << conf()->osd_scrub_begin_hour << " - "
672+
<< conf()->osd_scrub_end_hour << " now (" << bdt.tm_hour
635673
<< ") = " << (time_permit ? "yes" : "no") << dendl;
636674
return time_permit;
637675
}
@@ -694,34 +732,34 @@ bool ScrubQueue::can_inc_scrubs() const
694732
// inc_scrubs_local() failures
695733
std::lock_guard lck{resource_lock};
696734

697-
if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
735+
if (scrubs_local + scrubs_remote < conf()->osd_max_scrubs) {
698736
return true;
699737
}
700738

701739
dout(20) << " == false. " << scrubs_local << " local + " << scrubs_remote
702-
<< " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
740+
<< " remote >= max " << conf()->osd_max_scrubs << dendl;
703741
return false;
704742
}
705743

706744
bool ScrubQueue::inc_scrubs_local()
707745
{
708746
std::lock_guard lck{resource_lock};
709747

710-
if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
748+
if (scrubs_local + scrubs_remote < conf()->osd_max_scrubs) {
711749
++scrubs_local;
712750
return true;
713751
}
714752

715753
dout(20) << ": " << scrubs_local << " local + " << scrubs_remote
716-
<< " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
754+
<< " remote >= max " << conf()->osd_max_scrubs << dendl;
717755
return false;
718756
}
719757

720758
void ScrubQueue::dec_scrubs_local()
721759
{
722760
std::lock_guard lck{resource_lock};
723761
dout(20) << ": " << scrubs_local << " -> " << (scrubs_local - 1) << " (max "
724-
<< cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")"
762+
<< conf()->osd_max_scrubs << ", remote " << scrubs_remote << ")"
725763
<< dendl;
726764

727765
--scrubs_local;
@@ -732,24 +770,24 @@ bool ScrubQueue::inc_scrubs_remote()
732770
{
733771
std::lock_guard lck{resource_lock};
734772

735-
if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
773+
if (scrubs_local + scrubs_remote < conf()->osd_max_scrubs) {
736774
dout(20) << ": " << scrubs_remote << " -> " << (scrubs_remote + 1)
737-
<< " (max " << cct->_conf->osd_max_scrubs << ", local "
775+
<< " (max " << conf()->osd_max_scrubs << ", local "
738776
<< scrubs_local << ")" << dendl;
739777
++scrubs_remote;
740778
return true;
741779
}
742780

743781
dout(20) << ": " << scrubs_local << " local + " << scrubs_remote
744-
<< " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
782+
<< " remote >= max " << conf()->osd_max_scrubs << dendl;
745783
return false;
746784
}
747785

748786
void ScrubQueue::dec_scrubs_remote()
749787
{
750788
std::lock_guard lck{resource_lock};
751789
dout(20) << ": " << scrubs_remote << " -> " << (scrubs_remote - 1) << " (max "
752-
<< cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")"
790+
<< conf()->osd_max_scrubs << ", local " << scrubs_local << ")"
753791
<< dendl;
754792
--scrubs_remote;
755793
ceph_assert(scrubs_remote >= 0);
@@ -760,5 +798,5 @@ void ScrubQueue::dump_scrub_reservations(ceph::Formatter* f) const
760798
std::lock_guard lck{resource_lock};
761799
f->dump_int("scrubs_local", scrubs_local);
762800
f->dump_int("scrubs_remote", scrubs_remote);
763-
f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
801+
f->dump_int("osd_max_scrubs", conf()->osd_max_scrubs);
764802
}

0 commit comments

Comments
 (0)