Skip to content

Commit 34b086e

Browse files
committed
osd: add watch ping timeout count in osd
For example, rbd sends a watch ping to the header object every 5 seconds to keep its watch alive. If the primary OSD is unable to receive the watch ping for the header object because of an rbd network interruption, this means that the rbd client's I/O is already hung. This way, we can quickly detect disconnected rbd clients on the OSD and reflect them in metrics. Signed-off-by: Yite Gu <[email protected]>
1 parent 08d35a8 commit 34b086e

File tree

3 files changed

+10
-0
lines changed

3 files changed

+10
-0
lines changed

src/osd/PrimaryLogPG.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11826,6 +11826,10 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
1182611826
oi.watchers.erase(make_pair(watch->get_cookie(),
1182711827
watch->get_entity()));
1182811828

11829+
osd->logger->inc(l_osd_watch_timeouts);
11830+
dout(3) << __func__ << " watcher " << watch->get_peer_addr()
11831+
<< " object " << obc->obs.oi.soid << dendl;
11832+
1182911833
list<watch_disconnect_t> watch_disconnects = {
1183011834
watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
1183111835
};

src/osd/osd_perf_counters.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,10 @@ PerfCounters *build_osd_logger(CephContext *cct) {
337337
osd_plb.add_u64_counter_histogram(
338338
l_osd_scrub_reservation_dur_hist, "scrub_resrv_repnum_vs_duration",
339339
rsrv_hist_x_axis_config, rsrv_hist_y_axis_config, "Histogram of scrub replicas reservation duration");
340+
osd_plb.add_u64_counter(
341+
l_osd_watch_timeouts, "watch_timeouts",
342+
"Number of watches that timed out or were blocklisted",
343+
NULL, PerfCountersBuilder::PRIO_USEFUL);
340344

341345
return osd_plb.create_perf_counters();
342346
}

src/osd/osd_perf_counters.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ enum {
136136
// are labeled, and histograms do not fully support labels.
137137
l_osd_scrub_reservation_dur_hist,
138138

139+
l_osd_watch_timeouts,
140+
139141
l_osd_last,
140142
};
141143

0 commit comments

Comments
 (0)