Skip to content

Commit ce19a4e

Browse files
authored
Merge pull request ceph#62818 from ronen-fr/wip-rf-iocnt-plus
osd/scrub: performance counters: count I/Os, use unlabeled counters Reviewed-by: Alex Ainscow <[email protected]> Reviewed-by: Radoslaw Zarzynski <[email protected]> Reviewed-by: Bill Scales <[email protected]> Reviewed-by: Samuel Just <[email protected]>
2 parents 7a2fecf + 62f9b36 commit ce19a4e

File tree

10 files changed

+329
-79
lines changed

10 files changed

+329
-79
lines changed

src/osd/osd_perf_counters.cc

Lines changed: 116 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,122 @@ PerfCounters *build_osd_logger(CephContext *cct) {
356356
osd_plb.add_u64_counter(
357357
l_osd_watch_timeouts, "watch_timeouts",
358358
"Number of watches that timed out or were blocklisted",
359-
NULL, PerfCountersBuilder::PRIO_USEFUL);
359+
nullptr, PerfCountersBuilder::PRIO_USEFUL);
360+
361+
// scrub I/O (no EC vs. replicated differentiation)
362+
osd_plb.add_u64_counter(l_osd_scrub_omapgetheader_cnt, "scrub_omapgetheader_cnt", "scrub omap get header calls count");
363+
osd_plb.add_u64_counter(l_osd_scrub_omapgetheader_bytes, "scrub_omapgetheader_bytes", "scrub omap get header bytes read");
364+
osd_plb.add_u64_counter(l_osd_scrub_omapget_cnt, "scrub_omapget_cnt", "scrub omap get calls count");
365+
osd_plb.add_u64_counter(l_osd_scrub_omapget_bytes, "scrub_omapget_bytes", "scrub omap get bytes read");
366+
// scrub I/O performed for replicated pools
367+
osd_plb.add_u64_counter(l_osd_scrub_rppool_getattr_cnt, "scrub_replicated_getattr_cnt", "scrub replicated pool getattr calls count");
368+
osd_plb.add_u64_counter(l_osd_scrub_rppool_stats_cnt, "scrub_replicated_stats_cnt", "scrub replicated pool stats calls count");
369+
osd_plb.add_u64_counter(l_osd_scrub_rppool_read_cnt, "scrub_replicated_read_cnt", "scrub replicated pool read calls count");
370+
osd_plb.add_u64_counter(l_osd_scrub_rppool_read_bytes, "scrub_replicated_read_bytes", "scrub replicated pool read bytes read");
371+
// scrub I/O performed for EC pools
372+
osd_plb.add_u64_counter(l_osd_scrub_ec_getattr_cnt, "scrub_ec_getattr_cnt", "scrub EC getattr calls count");
373+
osd_plb.add_u64_counter(l_osd_scrub_ec_stats_cnt, "scrub_ec_stats_cnt", "scrub EC stats calls count");
374+
osd_plb.add_u64_counter(l_osd_scrub_ec_read_cnt, "scrub_ec_read_cnt", "scrub EC read calls count");
375+
osd_plb.add_u64_counter(l_osd_scrub_ec_read_bytes, "scrub_ec_read_bytes", "scrub EC read bytes read");
376+
377+
// scrub - replicated pools
378+
osd_plb.add_u64_counter(
379+
l_osd_scrub_rppool_started,
380+
"num_scrubs_started_replicated",
381+
"replicated scrubs attempted count");
382+
osd_plb.add_u64_counter(
383+
l_osd_scrub_rppool_active_started,
384+
"num_scrubs_past_reservation_replicated",
385+
"replicated scrubs count");
386+
osd_plb.add_u64_counter(
387+
l_osd_scrub_rppool_successful,
388+
"successful_scrubs_replicated",
389+
"successful replicated scrubs count");
390+
osd_plb.add_time_avg(
391+
l_osd_scrub_rppool_successful_elapsed,
392+
"successful_scrubs_replicated_elapsed",
393+
"time to complete a successful replicated scrub");
394+
osd_plb.add_u64_counter(
395+
l_osd_scrub_rppool_failed, "failed_scrubs_replicated",
396+
"failed replicated scrubs count");
397+
osd_plb.add_time_avg(
398+
l_osd_scrub_rppool_failed_elapsed,
399+
"failed_scrubs_replicated_elapsed",
400+
"time to scrub failure replicated");
401+
402+
// the replica reservation process - replicated pool
403+
osd_plb.add_u64_counter(
404+
l_osd_scrub_rppool_reserv_success,
405+
"scrub_replicated_scrub_reservations_completed",
406+
"successfully completed reservation processes");
407+
osd_plb.add_time_avg(
408+
l_osd_scrub_rppool_reserv_successful_elapsed,
409+
"scrub_replicated_successful_reservations_elapsed",
410+
"time to scrub reservation completion");
411+
osd_plb.add_u64_counter(
412+
l_osd_scrub_rppool_reserv_aborted,
413+
"scrub_replicated_reservation_process_aborted",
414+
"scrub replicated pool reservation was aborted");
415+
osd_plb.add_u64_counter(
416+
l_osd_scrub_rppool_reserv_rejected,
417+
"scrub_replicated_reservation_process_failure",
418+
"scrub replicated pool reservation failed due to replica denial");
419+
osd_plb.add_u64_counter(
420+
l_osd_scrub_rppool_reserv_skipped,
421+
"scrub_replicated_reservation_process_skipped",
422+
"scrub replicated pool reservation skipped for high priority scrub");
423+
osd_plb.add_time_avg(
424+
l_osd_scrub_rppool_reserv_failed_elapsed,
425+
"scrub_replicated_failed_reservations_elapsed",
426+
"scrub replicated pool time for scrub reservation to fail");
427+
osd_plb.add_u64(
428+
l_osd_scrub_rppool_reserv_secondaries_num,
429+
"scrub_replicated_replicas_in_reservation",
430+
"scrub replicated pool number of replicas to reserve");
431+
432+
// scrub - EC
433+
osd_plb.add_u64_counter(
434+
l_osd_scrub_ec_started, "num_scrubs_started_ec",
435+
"EC scrubs attempted count");
436+
osd_plb.add_u64_counter(
437+
l_osd_scrub_ec_active_started, "num_scrubs_past_reservation_ec",
438+
"EC scrubs count");
439+
osd_plb.add_u64_counter(
440+
l_osd_scrub_ec_successful, "successful_scrubs_ec",
441+
"successful EC scrubs count");
442+
osd_plb.add_time_avg(
443+
l_osd_scrub_ec_successful_elapsed, "successful_scrubs_ec_elapsed",
444+
"time to complete a successful EC scrub");
445+
osd_plb.add_u64_counter(
446+
l_osd_scrub_ec_failed, "failed_scrubs_ec", "failed scrubs count EC");
447+
osd_plb.add_time_avg(
448+
l_osd_scrub_ec_failed_elapsed, "failed_scrubs_ec_elapsed",
449+
"time to scrub failure ec");
450+
451+
// the replica reservation process - EC
452+
osd_plb.add_u64_counter(
453+
l_osd_scrub_ec_reserv_success, "scrub_ec_reservations_completed",
454+
"successfully completed reservation processes EC");
455+
osd_plb.add_time_avg(
456+
l_osd_scrub_ec_reserv_successful_elapsed,
457+
"scrub_ec_successful_reservations_elapsed",
458+
"time to EC scrub reservation completion");
459+
osd_plb.add_u64_counter(
460+
l_osd_scrub_ec_reserv_aborted, "scrub_ec_reservation_process_aborted",
461+
"scrub reservation was aborted EC");
462+
osd_plb.add_u64_counter(
463+
l_osd_scrub_ec_reserv_rejected, "scrub_ec_reservation_process_failure",
464+
"scrub reservation failed due to replica denial EC");
465+
osd_plb.add_u64_counter(
466+
l_osd_scrub_ec_reserv_skipped, "scrub_ec_reservation_process_skipped",
467+
"scrub reservation skipped for high priority scrub EC");
468+
osd_plb.add_time_avg(
469+
l_osd_scrub_ec_reserv_failed_elapsed,
470+
"scrub_ec_failed_reservations_elapsed",
471+
"time for scrub reservation to fail EC");
472+
osd_plb.add_u64(
473+
l_osd_scrub_ec_reserv_secondaries_num, "scrub_ec_replicas_in_reservation",
474+
"number of replicas to reserve EC");
360475

361476
return osd_plb.create_perf_counters();
362477
}
@@ -409,27 +524,12 @@ PerfCounters *build_scrub_labeled_perf(CephContext *cct, std::string label)
409524

410525
scrub_perf.set_prio_default(PerfCountersBuilder::PRIO_INTERESTING);
411526

412-
scrub_perf.add_u64_counter(scrbcnt_started, "num_scrubs_started", "scrubs attempted count");
413-
scrub_perf.add_u64_counter(scrbcnt_active_started, "num_scrubs_past_reservation", "scrubs count");
414-
scrub_perf.add_u64_counter(scrbcnt_failed, "failed_scrubs", "failed scrubs count");
415-
scrub_perf.add_u64_counter(scrbcnt_successful, "successful_scrubs", "successful scrubs count");
416-
scrub_perf.add_time_avg(scrbcnt_failed_elapsed, "failed_scrubs_elapsed", "time to scrub failure");
417-
scrub_perf.add_time_avg(scrbcnt_successful_elapsed, "successful_scrubs_elapsed", "time to scrub completion");
418-
419527
scrub_perf.add_u64_counter(scrbcnt_preempted, "preemptions", "preemptions on scrubs");
420528
scrub_perf.add_u64_counter(scrbcnt_chunks_selected, "chunk_selected", "chunk selection during scrubs");
421529
scrub_perf.add_u64_counter(scrbcnt_chunks_busy, "chunk_busy", "chunk busy during scrubs");
422530
scrub_perf.add_u64_counter(scrbcnt_blocked, "locked_object", "waiting on locked object events");
423531
scrub_perf.add_u64_counter(scrbcnt_write_blocked, "write_blocked_by_scrub", "write blocked by scrub");
424532

425-
// the replica reservation process
426-
scrub_perf.add_u64_counter(scrbcnt_resrv_success, "scrub_reservations_completed", "successfully completed reservation processes");
427-
scrub_perf.add_time_avg(scrbcnt_resrv_successful_elapsed, "successful_reservations_elapsed", "time to scrub reservation completion");
428-
scrub_perf.add_u64_counter(scrbcnt_resrv_aborted, "reservation_process_aborted", "scrub reservation was aborted");
429-
scrub_perf.add_u64_counter(scrbcnt_resrv_rejected, "reservation_process_failure", "scrub reservation failed due to replica denial");
430-
scrub_perf.add_u64_counter(scrbcnt_resrv_skipped, "reservation_process_skipped", "scrub reservation skipped for high priority scrub");
431-
scrub_perf.add_time_avg(scrbcnt_resrv_failed_elapsed, "failed_reservations_elapsed", "time for scrub reservation to fail");
432-
scrub_perf.add_u64(scrbcnt_resrv_replicas_num, "replicas_in_reservation", "number of replicas in reservation");
433533

434534
return scrub_perf.create_perf_counters();
435535
}

src/osd/osd_perf_counters.h

Lines changed: 70 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "common/perf_counters.h"
88
#include "common/perf_counters_key.h"
99

10-
enum {
10+
enum osd_counter_idx_t {
1111
l_osd_first = 10000,
1212
l_osd_op_wip,
1313
l_osd_op,
@@ -143,6 +143,75 @@ enum {
143143

144144
l_osd_watch_timeouts,
145145

146+
// scrub I/O (no EC vs. replicated differentiation)
147+
l_osd_scrub_omapgetheader_cnt, ///< omap get header calls count
148+
l_osd_scrub_omapgetheader_bytes, ///< bytes read by omap get header
149+
l_osd_scrub_omapget_cnt, ///< omap get calls count
150+
l_osd_scrub_omapget_bytes, ///< total bytes read by omap get
151+
152+
// ---- scrub I/O - replicated pools
153+
l_osd_scrub_rppool_getattr_cnt, ///< get_attr calls count
154+
l_osd_scrub_rppool_stats_cnt, ///< stats calls count
155+
l_osd_scrub_rppool_read_cnt, ///< read calls count
156+
l_osd_scrub_rppool_read_bytes, ///< total bytes read
157+
158+
// ---- scrub I/O - EC
159+
l_osd_scrub_ec_getattr_cnt, ///< get_attr calls count
160+
l_osd_scrub_ec_stats_cnt, ///< stats calls count
161+
l_osd_scrub_ec_read_cnt, ///< read calls count
162+
l_osd_scrub_ec_read_bytes, ///< total bytes read
163+
164+
// ---- scrub - replicated pools
165+
l_osd_scrub_rppool_started, ///< scrubs that got started
166+
l_osd_scrub_rppool_active_started, ///< scrubs that got past replicas reservation
167+
l_osd_scrub_rppool_successful, ///< successful scrubs count
168+
l_osd_scrub_rppool_successful_elapsed, ///< time to complete a successful scrub
169+
l_osd_scrub_rppool_failed, ///< failed scrubs count
170+
l_osd_scrub_rppool_failed_elapsed, ///< time from start to failure
171+
172+
// ---- scrub reservation process - replicated pools
173+
174+
/// successful replicas reservation count
175+
l_osd_scrub_rppool_reserv_success,
176+
/// time to complete a successful replicas reservation
177+
l_osd_scrub_rppool_reserv_successful_elapsed,
178+
/// failed attempt to reserve replicas due to an abort
179+
l_osd_scrub_rppool_reserv_aborted,
180+
/// reservation failed due to a 'rejected' response
181+
l_osd_scrub_rppool_reserv_rejected,
182+
/// reservation skipped for high-priority scrubs
183+
l_osd_scrub_rppool_reserv_skipped,
184+
/// time for a replicas reservation process to fail
185+
l_osd_scrub_rppool_reserv_failed_elapsed,
186+
/// number of replicas
187+
l_osd_scrub_rppool_reserv_secondaries_num,
188+
189+
190+
// ---- scrub - EC
191+
l_osd_scrub_ec_started, ///< scrubs that got started
192+
l_osd_scrub_ec_active_started, ///< scrubs that got past secondaries reservation
193+
l_osd_scrub_ec_successful, ///< successful scrubs count
194+
l_osd_scrub_ec_successful_elapsed, ///< time to complete a successful scrub
195+
l_osd_scrub_ec_failed, ///< failed scrubs count
196+
l_osd_scrub_ec_failed_elapsed, ///< time from start to failure
197+
198+
// ---- scrub reservation process - EC
199+
200+
/// successful replicas reservation count
201+
l_osd_scrub_ec_reserv_success,
202+
/// time to complete a successful replicas reservation
203+
l_osd_scrub_ec_reserv_successful_elapsed,
204+
/// failed attempt to reserve replicas due to an abort
205+
l_osd_scrub_ec_reserv_aborted,
206+
/// reservation failed due to a 'rejected' response
207+
l_osd_scrub_ec_reserv_rejected,
208+
/// reservation skipped for high-priority scrubs
209+
l_osd_scrub_ec_reserv_skipped,
210+
/// time for a replicas reservation process to fail
211+
l_osd_scrub_ec_reserv_failed_elapsed,
212+
/// number of replicas
213+
l_osd_scrub_ec_reserv_secondaries_num,
214+
146215
l_osd_last,
147216
};
148217

@@ -192,20 +261,6 @@ PerfCounters *build_recoverystate_perf(CephContext *cct);
192261
enum {
193262
scrbcnt_first = 20500,
194263

195-
// -- basic statistics --
196-
/// The number of times we started a scrub
197-
scrbcnt_started,
198-
/// # scrubs that got past replicas reservation
199-
scrbcnt_active_started,
200-
/// # successful scrubs
201-
scrbcnt_successful,
202-
/// time to complete a successful scrub
203-
scrbcnt_successful_elapsed,
204-
/// # failed scrubs
205-
scrbcnt_failed,
206-
/// time for a scrub to fail
207-
scrbcnt_failed_elapsed,
208-
209264
// -- interruptions of various types
210265
/// # preemptions
211266
scrbcnt_preempted,
@@ -218,22 +273,6 @@ enum {
218273
/// # write blocked by the scrub
219274
scrbcnt_write_blocked,
220275

221-
// -- replicas reservation
222-
/// # successfully completed reservation steps
223-
scrbcnt_resrv_success,
224-
/// time to complete a successful replicas reservation
225-
scrbcnt_resrv_successful_elapsed,
226-
/// # failed attempt to reserve replicas due to an abort
227-
scrbcnt_resrv_aborted,
228-
/// # reservation failed due to a 'rejected' response
229-
scrbcnt_resrv_rejected,
230-
/// # reservation skipped for high-priority scrubs
231-
scrbcnt_resrv_skipped,
232-
/// time for a replicas reservation process to fail
233-
scrbcnt_resrv_failed_elapsed,
234-
/// # number of replicas
235-
scrbcnt_resrv_replicas_num,
236-
237276
scrbcnt_last,
238277
};
239278

src/osd/scrubber/pg_scrubber.cc

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -999,7 +999,7 @@ std::optional<uint64_t> PgScrubber::select_range()
999999

10001000
void PgScrubber::select_range_n_notify()
10011001
{
1002-
get_counters_set().inc(scrbcnt_chunks_selected);
1002+
get_labeled_counters()->inc(scrbcnt_chunks_selected);
10031003
auto num_chunk_objects = select_range();
10041004
if (num_chunk_objects.has_value()) {
10051005
// the next chunk to handle is not blocked
@@ -1010,7 +1010,7 @@ void PgScrubber::select_range_n_notify()
10101010
// we will wait for the objects range to become available for scrubbing
10111011
dout(10) << __func__ << ": selected chunk is busy" << dendl;
10121012
m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
1013-
get_counters_set().inc(scrbcnt_chunks_busy);
1013+
get_labeled_counters()->inc(scrbcnt_chunks_busy);
10141014
}
10151015
}
10161016

@@ -1042,7 +1042,7 @@ bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
10421042
return false;
10431043
}
10441044

1045-
get_counters_set().inc(scrbcnt_write_blocked);
1045+
get_labeled_counters()->inc(scrbcnt_write_blocked);
10461046
dout(20) << __func__ << " " << soid << " can preempt? "
10471047
<< preemption_data.is_preemptable() << " already preempted? "
10481048
<< preemption_data.was_preempted() << dendl;
@@ -2525,11 +2525,22 @@ void PgScrubber::set_scrub_duration(std::chrono::milliseconds duration)
25252525
});
25262526
}
25272527

2528-
PerfCounters& PgScrubber::get_counters_set() const
2528+
PerfCounters* PgScrubber::get_osd_perf_counters() const
25292529
{
2530-
return *m_osds->get_scrub_services().get_perf_counters(
2530+
return m_osds->logger;
2531+
}
2532+
2533+
const Scrub::ScrubCounterSet& PgScrubber::get_unlabeled_counters() const
2534+
{
2535+
return m_pg->pool.info.is_replicated() ? io_counters_replicated
2536+
: io_counters_ec;
2537+
}
2538+
2539+
PerfCounters* PgScrubber::get_labeled_counters() const
2540+
{
2541+
return m_osds->get_scrub_services().get_perf_counters(
25312542
(m_pg->pool.info.is_replicated() ? pg_pool_t::TYPE_REPLICATED
2532-
: pg_pool_t::TYPE_ERASURE),
2543+
: pg_pool_t::TYPE_ERASURE),
25332544
(m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow));
25342545
}
25352546

0 commit comments

Comments
 (0)