
Commit 2375f70

kvserver: add kv.closed_timestamp.policy_latency_info_missing
When a replica refreshes its closed timestamp policies, it looks up its peer replicas' latency info via a map passed in by the PolicyRefresher, which in turn periodically pulls node latency info from RPCContext. If latency data for a node is missing, a hardcoded default max RTT of 150ms is used. Previously, it was hard to tell when this was happening.

This commit adds a metric to track how often the closed timestamp policy refresh falls back to the default RTT due to missing node latency info. A high count may indicate that the latency cache isn't refreshed frequently enough and that kv.closed_timestamp.policy_latency_refresh_interval should be lowered.

Resolves: #143890

Release note: none
1 parent e013969 commit 2375f70
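
For illustration, the sketch below shows the fallback behavior described above in isolation: compute the maximum RTT across a range's peer nodes, substitute a hardcoded default (150ms, per the commit message) whenever a node is missing from the latency map, and report whether the fallback was used. The standalone maxPeerRTT function, the defaultMaxNetworkRTT constant, and the plain int32 node IDs are hypothetical simplifications; the real change lives in Replica.RefreshPolicy and uses closedts.DefaultMaxNetworkRTT and roachpb.NodeID (see the replica.go diff below).

package main

import (
	"fmt"
	"time"
)

// Hypothetical stand-in for closedts.DefaultMaxNetworkRTT (150ms per the commit message).
const defaultMaxNetworkRTT = 150 * time.Millisecond

// maxPeerRTT returns the largest known RTT across the given peer nodes, falling
// back to defaultMaxNetworkRTT for any node absent from latencies. The boolean
// reports whether the fallback was used, which is what the new counter tracks.
func maxPeerRTT(peers []int32, latencies map[int32]time.Duration) (time.Duration, bool) {
	maxLatency := time.Duration(-1)
	missing := false
	for _, nodeID := range peers {
		peerLatency := defaultMaxNetworkRTT
		if l, ok := latencies[nodeID]; ok {
			peerLatency = l
		} else {
			missing = true
		}
		if peerLatency > maxLatency {
			maxLatency = peerLatency
		}
	}
	return maxLatency, missing
}

func main() {
	latencies := map[int32]time.Duration{1: 2 * time.Millisecond, 2: 40 * time.Millisecond}
	rtt, missing := maxPeerRTT([]int32{1, 2, 3}, latencies)
	fmt.Println(rtt, missing) // prints "150ms true": node 3 has no latency info
}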

File tree

docs/generated/metrics/metrics.html
pkg/kv/kvserver/metrics.go
pkg/kv/kvserver/replica.go

3 files changed: +19 −2 lines

docs/generated/metrics/metrics.html

Lines changed: 1 addition & 0 deletions
@@ -212,6 +212,7 @@
 <tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_80ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_80MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_with_no_latency_info</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.closed_timestamp.policy_change</td><td>Number of times closed timestamp policy change occurred on ranges</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy_latency_info_missing</td><td>Number of times closed timestamp policy refresh had to use hardcoded network RTT due to missing node latency info for one or more replicas</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>kv.concurrency.avg_lock_hold_duration_nanos</td><td>Average lock hold duration across locks currently held in lock tables. Does not include replicated locks (intents) that are not held in memory</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.concurrency.avg_lock_wait_duration_nanos</td><td>Average lock wait duration across requests currently waiting in lock wait-queues</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.concurrency.latch_conflict_wait_durations</td><td>Durations in nanoseconds spent on latch acquisition waiting for conflicts with other latches</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 12 additions & 2 deletions
@@ -2442,6 +2442,14 @@ throttled they do count towards 'delay.total' and 'delay.enginebackpressure'.
 		Unit:        metric.Unit_COUNT,
 	}
 
+	metaClosedTimestampLatencyInfoMissing = metric.Metadata{
+		Name: "kv.closed_timestamp.policy_latency_info_missing",
+		Help: "Number of times closed timestamp policy refresh had to use hardcoded network RTT " +
+			"due to missing node latency info for one or more replicas",
+		Measurement: "Events",
+		Unit:        metric.Unit_COUNT,
+	}
+
 	// Replica circuit breaker.
 	metaReplicaCircuitBreakerCurTripped = metric.Metadata{
 		Name: "kv.replica_circuit_breaker.num_tripped_replicas",
@@ -3045,7 +3053,8 @@ type StoreMetrics struct {
 	ClosedTimestampMaxBehindNanos *metric.Gauge
 
 	// Closed timestamp policy change on ranges metrics.
-	ClosedTimestampPolicyChange *metric.Counter
+	ClosedTimestampPolicyChange       *metric.Counter
+	ClosedTimestampLatencyInfoMissing *metric.Counter
 
 	// Replica circuit breaker.
 	ReplicaCircuitBreakerCurTripped *metric.Gauge
@@ -3865,7 +3874,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		SplitsWithEstimatedStats:     metric.NewCounter(metaSplitEstimatedStats),
 		SplitEstimatedTotalBytesDiff: metric.NewCounter(metaSplitEstimatedTotalBytesDiff),
 
-		ClosedTimestampPolicyChange: metric.NewCounter(metaClosedTimestampPolicyChange),
+		ClosedTimestampPolicyChange:       metric.NewCounter(metaClosedTimestampPolicyChange),
+		ClosedTimestampLatencyInfoMissing: metric.NewCounter(metaClosedTimestampLatencyInfoMissing),
 	}
 	sm.categoryIterMetrics.init(storeRegistry)

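Taken together, the metrics.go hunks follow the package's usual pattern for adding a counter: declare a metric.Metadata, add a *metric.Counter field to StoreMetrics, and construct it with metric.NewCounter in newStoreMetrics. Below is a minimal sketch of that pattern, assuming the util/metric import path; the exampleMetrics struct and constructor are hypothetical stand-ins for the StoreMetrics wiring shown above, not the actual code.

package kvserver

import "github.com/cockroachdb/cockroach/pkg/util/metric"

// Trimmed-down illustration of the wiring in the diff above; names other than
// the metric itself are hypothetical.
var metaExampleLatencyInfoMissing = metric.Metadata{
	Name: "kv.closed_timestamp.policy_latency_info_missing",
	Help: "Number of times closed timestamp policy refresh had to use hardcoded network RTT " +
		"due to missing node latency info for one or more replicas",
	Measurement: "Events",
	Unit:        metric.Unit_COUNT,
}

type exampleMetrics struct {
	ClosedTimestampLatencyInfoMissing *metric.Counter
}

func newExampleMetrics() *exampleMetrics {
	return &exampleMetrics{
		ClosedTimestampLatencyInfoMissing: metric.NewCounter(metaExampleLatencyInfoMissing),
	}
}

// A refresh that had to fall back to the default RTT then bumps the counter once:
//   m.ClosedTimestampLatencyInfoMissing.Inc(1)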
pkg/kv/kvserver/replica.go

Lines changed: 6 additions & 0 deletions
@@ -1430,13 +1430,19 @@ func (r *Replica) RefreshPolicy(latencies map[roachpb.NodeID]time.Duration) {
 		// policy bucket. This then controls how far in the future timestamps will
 		// be closed for the range.
 		maxLatency := time.Duration(-1)
+		replicaLatencyInfoMissing := false
 		for _, peer := range desc.InternalReplicas {
 			peerLatency := closedts.DefaultMaxNetworkRTT
 			if latency, ok := latencies[peer.NodeID]; ok {
 				peerLatency = latency
+			} else {
+				replicaLatencyInfoMissing = true
 			}
 			maxLatency = max(maxLatency, peerLatency)
 		}
+		if replicaLatencyInfoMissing {
+			r.store.metrics.ClosedTimestampLatencyInfoMissing.Inc(1)
+		}
 		return closedts.FindBucketBasedOnNetworkRTTWithDampening(
 			oldPolicy,
 			maxLatency,
