Skip to content

Commit 62f9b36

Browse files
committed
osd/scrub: fixing scrub reservation process counters
Using regular (unlabeled) OSD performance counters for tracking the scrub reservation performance. Signed-off-by: Ronen Friedman <[email protected]>
1 parent 16cf250 commit 62f9b36

File tree

7 files changed

+176
-59
lines changed

7 files changed

+176
-59
lines changed

src/osd/osd_perf_counters.cc

Lines changed: 99 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -369,27 +369,109 @@ PerfCounters *build_osd_logger(CephContext *cct) {
369369
osd_plb.add_u64_counter(l_osd_scrub_rppool_read_cnt, "scrub_replicated_read_cnt", "scrub replicated pool read calls count");
370370
osd_plb.add_u64_counter(l_osd_scrub_rppool_read_bytes, "scrub_replicated_read_bytes", "scrub replicated pool read bytes read");
371371
// scrub I/O performed for EC pools
372-
osd_plb.add_u64_counter(l_osd_scrub_ec_getattr_cnt, "scrub_ec_getattr_cnt", "scrub ec getattr calls count");
373-
osd_plb.add_u64_counter(l_osd_scrub_ec_stats_cnt, "scrub_ec_stats_cnt", "scrub ec stats calls count");
374-
osd_plb.add_u64_counter(l_osd_scrub_ec_read_cnt, "scrub_ec_read_cnt", "scrub ec read calls count");
375-
osd_plb.add_u64_counter(l_osd_scrub_ec_read_bytes, "scrub_ec_read_bytes", "scrub ec read bytes read");
372+
osd_plb.add_u64_counter(l_osd_scrub_ec_getattr_cnt, "scrub_ec_getattr_cnt", "scrub EC getattr calls count");
373+
osd_plb.add_u64_counter(l_osd_scrub_ec_stats_cnt, "scrub_ec_stats_cnt", "scrub EC stats calls count");
374+
osd_plb.add_u64_counter(l_osd_scrub_ec_read_cnt, "scrub_ec_read_cnt", "scrub EC read calls count");
375+
osd_plb.add_u64_counter(l_osd_scrub_ec_read_bytes, "scrub_ec_read_bytes", "scrub EC read bytes read");
376376

377-
// scrub (no EC vs. replicated differentiation)
378377
// scrub - replicated pools
379-
osd_plb.add_u64_counter(l_osd_scrub_rppool_started, "num_scrubs_started_replicated", "replicated scrubs attempted count");
380-
osd_plb.add_u64_counter(l_osd_scrub_rppool_active_started, "num_scrubs_past_reservation_replicated", "replicated scrubs count");
381-
osd_plb.add_u64_counter(l_osd_scrub_rppool_successful, "successful_scrubs_replicated", "successful replicated scrubs count");
382-
osd_plb.add_time_avg(l_osd_scrub_rppool_successful_elapsed, "successful_scrubs_replicated_elapsed", "time to complete a successful replicated scrub");
383-
osd_plb.add_u64_counter(l_osd_scrub_rppool_failed, "failed_scrubs_replicated", "failed replicated scrubs count");
384-
osd_plb.add_time_avg(l_osd_scrub_rppool_failed_elapsed, "failed_scrubs_replicated_elapsed", "time to scrub failure replicated");
378+
osd_plb.add_u64_counter(
379+
l_osd_scrub_rppool_started,
380+
"num_scrubs_started_replicated",
381+
"replicated scrubs attempted count");
382+
osd_plb.add_u64_counter(
383+
l_osd_scrub_rppool_active_started,
384+
"num_scrubs_past_reservation_replicated",
385+
"replicated scrubs count");
386+
osd_plb.add_u64_counter(
387+
l_osd_scrub_rppool_successful,
388+
"successful_scrubs_replicated",
389+
"successful replicated scrubs count");
390+
osd_plb.add_time_avg(
391+
l_osd_scrub_rppool_successful_elapsed,
392+
"successful_scrubs_replicated_elapsed",
393+
"time to complete a successful replicated scrub");
394+
osd_plb.add_u64_counter(
395+
l_osd_scrub_rppool_failed, "failed_scrubs_replicated",
396+
"failed replicated scrubs count");
397+
osd_plb.add_time_avg(
398+
l_osd_scrub_rppool_failed_elapsed,
399+
"failed_scrubs_replicated_elapsed",
400+
"time to scrub failure replicated");
401+
402+
// the replica reservation process - replicated pool
403+
osd_plb.add_u64_counter(
404+
l_osd_scrub_rppool_reserv_success,
405+
"scrub_replicated_scrub_reservations_completed",
406+
"successfully completed reservation processes");
407+
osd_plb.add_time_avg(
408+
l_osd_scrub_rppool_reserv_successful_elapsed,
409+
"scrub_replicated_successful_reservations_elapsed",
410+
"time to scrub reservation completion");
411+
osd_plb.add_u64_counter(
412+
l_osd_scrub_rppool_reserv_aborted,
413+
"scrub_replicated_reservation_process_aborted",
414+
"scrub replicated pool reservation was aborted");
415+
osd_plb.add_u64_counter(
416+
l_osd_scrub_rppool_reserv_rejected,
417+
"scrub_replicated_reservation_process_failure",
418+
"scrub replicated pool reservation failed due to replica denial");
419+
osd_plb.add_u64_counter(
420+
l_osd_scrub_rppool_reserv_skipped,
421+
"scrub_replicated_reservation_process_skipped",
422+
"scrub replicated pool reservation skipped for high priority scrub");
423+
osd_plb.add_time_avg(
424+
l_osd_scrub_rppool_reserv_failed_elapsed,
425+
"scrub_replicated_failed_reservations_elapsed",
426+
"scrub replicated pool time for scrub reservation to fail");
427+
osd_plb.add_u64(
428+
l_osd_scrub_rppool_reserv_secondaries_num,
429+
"scrub_replicated_replicas_in_reservation",
430+
"scrub replicated pool number of replicas to reserve");
385431

386432
// scrub - EC
387-
osd_plb.add_u64_counter(l_osd_scrub_ec_started, "num_scrubs_started_ec", "scrubs attempted count ec");
388-
osd_plb.add_u64_counter(l_osd_scrub_ec_active_started, "num_scrubs_past_reservation_ec", "scrubs count ec");
389-
osd_plb.add_u64_counter(l_osd_scrub_ec_successful, "successful_scrubs_ec", "successful scrubs count ec");
390-
osd_plb.add_time_avg(l_osd_scrub_ec_successful_elapsed, "successful_scrubs_ec_elapsed", "time to complete a successful ec scrub");
391-
osd_plb.add_u64_counter(l_osd_scrub_ec_failed, "failed_scrubs_ec", "failed scrubs count ec");
392-
osd_plb.add_time_avg(l_osd_scrub_ec_failed_elapsed, "failed_scrubs_ec_elapsed", "time to scrub failure ec");
433+
osd_plb.add_u64_counter(
434+
l_osd_scrub_ec_started, "num_scrubs_started_ec",
435+
"EC scrubs attempted count");
436+
osd_plb.add_u64_counter(
437+
l_osd_scrub_ec_active_started, "num_scrubs_past_reservation_ec",
438+
"EC scrubs count");
439+
osd_plb.add_u64_counter(
440+
l_osd_scrub_ec_successful, "successful_scrubs_ec",
441+
"successful EC scrubs count");
442+
osd_plb.add_time_avg(
443+
l_osd_scrub_ec_successful_elapsed, "successful_scrubs_ec_elapsed",
444+
"time to complete a successful EC scrub");
445+
osd_plb.add_u64_counter(
446+
l_osd_scrub_ec_failed, "failed_scrubs_ec", "failed scrubs count EC");
447+
osd_plb.add_time_avg(
448+
l_osd_scrub_ec_failed_elapsed, "failed_scrubs_ec_elapsed",
449+
"time to scrub failure ec");
450+
451+
// the replica reservation process - EC
452+
osd_plb.add_u64_counter(
453+
l_osd_scrub_ec_reserv_success, "scrub_ec_reservations_completed",
454+
"successfully completed reservation processes EC");
455+
osd_plb.add_time_avg(
456+
l_osd_scrub_ec_reserv_successful_elapsed,
457+
"scrub_ec_successful_reservations_elapsed",
458+
"time to EC scrub reservation completion");
459+
osd_plb.add_u64_counter(
460+
l_osd_scrub_ec_reserv_aborted, "scrub_ec_reservation_process_aborted",
461+
"scrub reservation was aborted EC");
462+
osd_plb.add_u64_counter(
463+
l_osd_scrub_ec_reserv_rejected, "scrub_ec_reservation_process_failure",
464+
"scrub reservation failed due to replica denial EC");
465+
osd_plb.add_u64_counter(
466+
l_osd_scrub_ec_reserv_skipped, "scrub_ec_reservation_process_skipped",
467+
"scrub reservation skipped for high priority scrub EC");
468+
osd_plb.add_time_avg(
469+
l_osd_scrub_ec_reserv_failed_elapsed,
470+
"scrub_ec_failed_reservations_elapsed",
471+
"time for scrub reservation to fail EC");
472+
osd_plb.add_u64(
473+
l_osd_scrub_ec_reserv_secondaries_num, "scrub_ec_replicas_in_reservation",
474+
"number of replicas to reserve EC");
393475

394476
return osd_plb.create_perf_counters();
395477
}
@@ -448,14 +530,6 @@ PerfCounters *build_scrub_labeled_perf(CephContext *cct, std::string label)
448530
scrub_perf.add_u64_counter(scrbcnt_blocked, "locked_object", "waiting on locked object events");
449531
scrub_perf.add_u64_counter(scrbcnt_write_blocked, "write_blocked_by_scrub", "write blocked by scrub");
450532

451-
// the replica reservation process
452-
scrub_perf.add_u64_counter(scrbcnt_resrv_success, "scrub_reservations_completed", "successfully completed reservation processes");
453-
scrub_perf.add_time_avg(scrbcnt_resrv_successful_elapsed, "successful_reservations_elapsed", "time to scrub reservation completion");
454-
scrub_perf.add_u64_counter(scrbcnt_resrv_aborted, "reservation_process_aborted", "scrub reservation was aborted");
455-
scrub_perf.add_u64_counter(scrbcnt_resrv_rejected, "reservation_process_failure", "scrub reservation failed due to replica denial");
456-
scrub_perf.add_u64_counter(scrbcnt_resrv_skipped, "reservation_process_skipped", "scrub reservation skipped for high priority scrub");
457-
scrub_perf.add_time_avg(scrbcnt_resrv_failed_elapsed, "failed_reservations_elapsed", "time for scrub reservation to fail");
458-
scrub_perf.add_u64(scrbcnt_resrv_replicas_num, "replicas_in_reservation", "number of replicas in reservation");
459533

460534
return scrub_perf.create_perf_counters();
461535
}

src/osd/osd_perf_counters.h

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -169,14 +169,49 @@ enum osd_counter_idx_t {
169169
l_osd_scrub_rppool_failed, ///< failed scrubs count
170170
l_osd_scrub_rppool_failed_elapsed, ///< time from start to failure
171171

172-
// scrub - EC
172+
// ---- scrub reservation process - replicated pools
173+
174+
/// successful replicas reservation count
175+
l_osd_scrub_rppool_reserv_success,
176+
/// time to complete a successful replicas reservation
177+
l_osd_scrub_rppool_reserv_successful_elapsed,
178+
/// failed attempt to reserve replicas due to an abort
179+
l_osd_scrub_rppool_reserv_aborted,
180+
/// reservation failed due to a 'rejected' response
181+
l_osd_scrub_rppool_reserv_rejected,
182+
/// reservation skipped for high-priority scrubs
183+
l_osd_scrub_rppool_reserv_skipped,
184+
/// time for a replicas reservation process to fail
185+
l_osd_scrub_rppool_reserv_failed_elapsed,
186+
/// number of replicas
187+
l_osd_scrub_rppool_reserv_secondaries_num,
188+
189+
190+
// ---- scrub - EC
173191
l_osd_scrub_ec_started, ///< scrubs that got started
174192
l_osd_scrub_ec_active_started, ///< scrubs that got past secondaries reservation
175193
l_osd_scrub_ec_successful, ///< successful scrubs count
176194
l_osd_scrub_ec_successful_elapsed, ///< time to complete a successful scrub
177195
l_osd_scrub_ec_failed, ///< failed scrubs count
178196
l_osd_scrub_ec_failed_elapsed, ///< time from start to failure
179197

198+
// ---- scrub reservation process - EC
199+
200+
/// successful replicas reservation count
201+
l_osd_scrub_ec_reserv_success,
202+
/// time to complete a successful replicas reservation
203+
l_osd_scrub_ec_reserv_successful_elapsed,
204+
/// failed attempt to reserve replicas due to an abort
205+
l_osd_scrub_ec_reserv_aborted,
206+
/// reservation failed due to a 'rejected' response
207+
l_osd_scrub_ec_reserv_rejected,
208+
/// reservation skipped for high-priority scrubs
209+
l_osd_scrub_ec_reserv_skipped,
210+
/// time for a replicas reservation process to fail
211+
l_osd_scrub_ec_reserv_failed_elapsed,
212+
/// number of replicas
213+
l_osd_scrub_ec_reserv_secondaries_num,
214+
180215
l_osd_last,
181216
};
182217

@@ -238,22 +273,6 @@ enum {
238273
/// # write blocked by the scrub
239274
scrbcnt_write_blocked,
240275

241-
// -- replicas reservation
242-
/// # successfully completed reservation steps
243-
scrbcnt_resrv_success,
244-
/// time to complete a successful replicas reservation
245-
scrbcnt_resrv_successful_elapsed,
246-
/// # failed attempt to reserve replicas due to an abort
247-
scrbcnt_resrv_aborted,
248-
/// # reservation failed due to a 'rejected' response
249-
scrbcnt_resrv_rejected,
250-
/// # reservation skipped for high-priority scrubs
251-
scrbcnt_resrv_skipped,
252-
/// time for a replicas reservation process to fail
253-
scrbcnt_resrv_failed_elapsed,
254-
/// # number of replicas
255-
scrbcnt_resrv_replicas_num,
256-
257276
scrbcnt_last,
258277
};
259278

src/osd/scrubber/pg_scrubber.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,15 @@ static inline constexpr ScrubCounterSet io_counters_replicated{
149149
.successful_cnt = l_osd_scrub_rppool_successful,
150150
.successful_elapsed = l_osd_scrub_rppool_successful_elapsed,
151151
.failed_cnt = l_osd_scrub_rppool_failed,
152-
.failed_elapsed = l_osd_scrub_rppool_failed_elapsed
152+
.failed_elapsed = l_osd_scrub_rppool_failed_elapsed,
153+
// replica-reservation-related:
154+
.rsv_successful_cnt = l_osd_scrub_rppool_reserv_success,
155+
.rsv_successful_elapsed = l_osd_scrub_rppool_reserv_successful_elapsed,
156+
.rsv_aborted_cnt = l_osd_scrub_rppool_reserv_aborted,
157+
.rsv_rejected_cnt = l_osd_scrub_rppool_reserv_rejected,
158+
.rsv_skipped_cnt = l_osd_scrub_rppool_reserv_skipped,
159+
.rsv_failed_elapsed = l_osd_scrub_rppool_reserv_failed_elapsed,
160+
.rsv_secondaries_num = l_osd_scrub_rppool_reserv_secondaries_num
153161
};
154162

155163
static inline constexpr ScrubCounterSet io_counters_ec{
@@ -166,7 +174,15 @@ static inline constexpr ScrubCounterSet io_counters_ec{
166174
.successful_cnt = l_osd_scrub_ec_successful,
167175
.successful_elapsed = l_osd_scrub_ec_successful_elapsed,
168176
.failed_cnt = l_osd_scrub_ec_failed,
169-
.failed_elapsed = l_osd_scrub_ec_failed_elapsed
177+
.failed_elapsed = l_osd_scrub_ec_failed_elapsed,
178+
// replica-reservation-related:
179+
.rsv_successful_cnt = l_osd_scrub_ec_reserv_success,
180+
.rsv_successful_elapsed = l_osd_scrub_ec_reserv_successful_elapsed,
181+
.rsv_aborted_cnt = l_osd_scrub_ec_reserv_aborted,
182+
.rsv_rejected_cnt = l_osd_scrub_ec_reserv_rejected,
183+
.rsv_skipped_cnt = l_osd_scrub_ec_reserv_skipped,
184+
.rsv_failed_elapsed = l_osd_scrub_ec_reserv_failed_elapsed,
185+
.rsv_secondaries_num = l_osd_scrub_ec_reserv_secondaries_num
170186
};
171187
} // namespace Scrub
172188

src/osd/scrubber/scrub_machine.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
257257
// initiate the reservation process
258258
session.m_reservations.emplace(
259259
*scrbr, context<PrimaryActive>().last_request_sent_nonce,
260-
*session.m_perf_set);
260+
*session.m_counters_idx);
261261

262262
if (!session.m_reservations->get_last_sent()) {
263263
// no replicas to reserve

src/osd/scrubber/scrub_reservations.cc

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ namespace Scrub {
3232
ReplicaReservations::ReplicaReservations(
3333
ScrubMachineListener& scrbr,
3434
reservation_nonce_t& nonce,
35-
PerfCounters& pc)
35+
const ScrubCounterSet& pc)
3636
: m_scrubber{scrbr}
3737
, m_pg{m_scrubber.get_pg()}
3838
, m_pgid{m_scrubber.get_spgid().pgid}
3939
, m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
4040
, m_last_request_sent_nonce{nonce}
41-
, m_perf_set{pc}
41+
, m_perf_indices{pc}
4242
{
4343
// the acting set is sorted by pg_shard_t. The reservations are to be issued
4444
// in this order, so that the OSDs will receive the requests in a consistent
@@ -52,7 +52,8 @@ ReplicaReservations::ReplicaReservations(
5252
[whoami = m_pg->pg_whoami](const pg_shard_t& shard) {
5353
return shard != whoami;
5454
});
55-
m_perf_set.set(scrbcnt_resrv_replicas_num, m_sorted_secondaries.size());
55+
m_osds->logger->set(
56+
m_perf_indices.rsv_secondaries_num, m_sorted_secondaries.size());
5657

5758
m_next_to_request = m_sorted_secondaries.cbegin();
5859
if (m_scrubber.is_reservation_required()) {
@@ -63,7 +64,7 @@ ReplicaReservations::ReplicaReservations(
6364
// for high-priority scrubs (i.e. - user-initiated), no reservations are
6465
// needed. Note: not perf-counted as either success or failure.
6566
dout(10) << "high-priority scrub - no reservations needed" << dendl;
66-
m_perf_set.inc(scrbcnt_resrv_skipped);
67+
m_osds->logger->inc(m_perf_indices.rsv_skipped_cnt);
6768
}
6869
}
6970

@@ -97,8 +98,8 @@ void ReplicaReservations::log_success_and_duration()
9798
{
9899
ceph_assert(m_process_started_at.has_value());
99100
auto logged_duration = ScrubClock::now() - m_process_started_at.value();
100-
m_perf_set.tinc(scrbcnt_resrv_successful_elapsed, logged_duration);
101-
m_perf_set.inc(scrbcnt_resrv_success);
101+
m_osds->logger->tinc(m_perf_indices.rsv_successful_elapsed, logged_duration);
102+
m_osds->logger->inc(m_perf_indices.rsv_successful_cnt);
102103
m_osds->logger->hinc(
103104
l_osd_scrub_reservation_dur_hist, std::ssize(m_sorted_secondaries),
104105
logged_duration.count());
@@ -112,16 +113,16 @@ void ReplicaReservations::log_failure_and_duration(int failure_cause_counter)
112113
return;
113114
}
114115
auto logged_duration = ScrubClock::now() - m_process_started_at.value();
115-
m_perf_set.tinc(scrbcnt_resrv_failed_elapsed, logged_duration);
116+
m_osds->logger->tinc(m_perf_indices.rsv_failed_elapsed, logged_duration);
116117
m_process_started_at.reset();
117118
// note: not counted into l_osd_scrub_reservation_dur_hist
118-
m_perf_set.inc(failure_cause_counter);
119+
m_osds->logger->inc(failure_cause_counter);
119120
}
120121

121122
ReplicaReservations::~ReplicaReservations()
122123
{
123124
release_all();
124-
log_failure_and_duration(scrbcnt_resrv_aborted);
125+
log_failure_and_duration(m_perf_indices.rsv_aborted_cnt);
125126
}
126127

127128
bool ReplicaReservations::is_reservation_response_relevant(
@@ -231,7 +232,7 @@ bool ReplicaReservations::handle_reserve_rejection(
231232
return false;
232233
}
233234

234-
log_failure_and_duration(scrbcnt_resrv_rejected);
235+
log_failure_and_duration(m_perf_indices.rsv_rejected_cnt);
235236

236237
// we should never see a rejection carrying a valid
237238
// reservation nonce - arriving while we have no pending requests

src/osd/scrubber/scrub_reservations.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,8 @@ class ReplicaReservations {
9090
*/
9191
reservation_nonce_t& m_last_request_sent_nonce;
9292

93-
/// access to the performance counters container relevant to this scrub
94-
/// parameters
95-
PerfCounters& m_perf_set;
93+
/// the performance counters relevant to this scrub
94+
const ScrubCounterSet& m_perf_indices;
9695

9796
/// used only for the 'duration of the reservation process' perf counter.
9897
/// discarded once the success or failure are recorded
@@ -102,7 +101,7 @@ class ReplicaReservations {
102101
ReplicaReservations(
103102
ScrubMachineListener& scrubber,
104103
reservation_nonce_t& nonce,
105-
PerfCounters& pc);
104+
const ScrubCounterSet& pc);
106105

107106
~ReplicaReservations();
108107

src/osd/scrubber_common.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,14 @@ struct ScrubCounterSet {
307307
osd_counter_idx_t successful_elapsed; ///< time to complete a successful scrub
308308
osd_counter_idx_t failed_cnt; ///< failed scrubs count
309309
osd_counter_idx_t failed_elapsed; ///< time from start to failure
310+
// reservation process related:
311+
osd_counter_idx_t rsv_successful_cnt; ///< completed reservation processes
312+
osd_counter_idx_t rsv_successful_elapsed; ///< time to all-reserved
313+
osd_counter_idx_t rsv_aborted_cnt; ///< failed due to an abort
314+
osd_counter_idx_t rsv_rejected_cnt; ///< 'rejected' response
315+
osd_counter_idx_t rsv_skipped_cnt; ///< high-priority. No reservation
316+
osd_counter_idx_t rsv_failed_elapsed; ///< time for reservation to fail
317+
osd_counter_idx_t rsv_secondaries_num; ///< number of replicas (EC or rep)
310318
};
311319

312320
} // namespace Scrub

0 commit comments

Comments
 (0)