Skip to content

Commit 16fbe0b

Browse files
committed
kvserver: add kv.closed_timestamp.policy_change
Previously, it was difficult to measure how often closed timestamp policies changed for ranges. This matters because such policy changes can trigger additional range updates sent via the side transport. This commit adds a metric that tracks the number of policy changes on replicas. Part of: #143890. Release note: none.
1 parent 683786d commit 16fbe0b

File tree

3 files changed

+22
-3
lines changed

3 files changed

+22
-3
lines changed

docs/generated/metrics/metrics.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@
193193
<tr><td>STORAGE</td><td>kv.allocator.load_based_replica_rebalancing.missing_stats_for_existing_store</td><td>The number times the allocator was missing the qps stats for the existing store</td><td>Attempts</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
194194
<tr><td>STORAGE</td><td>kv.allocator.load_based_replica_rebalancing.should_transfer</td><td>The number times the allocator determined that the replica should be rebalanced to another store for better load distribution</td><td>Attempts</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
195195
<tr><td>STORAGE</td><td>kv.closed_timestamp.max_behind_nanos</td><td>Largest latency between realtime and replica max closed timestamp</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
196+
<tr><td>STORAGE</td><td>kv.closed_timestamp.policy_change</td><td>Number of times closed timestamp policy change occurred on ranges</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
196197
<tr><td>STORAGE</td><td>kv.concurrency.avg_lock_hold_duration_nanos</td><td>Average lock hold duration across locks currently held in lock tables. Does not include replicated locks (intents) that are not held in memory</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
197198
<tr><td>STORAGE</td><td>kv.concurrency.avg_lock_wait_duration_nanos</td><td>Average lock wait duration across requests currently waiting in lock wait-queues</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
198199
<tr><td>STORAGE</td><td>kv.concurrency.latch_conflict_wait_durations</td><td>Durations in nanoseconds spent on latch acquisition waiting for conflicts with other latches</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2432,6 +2432,14 @@ throttled they do count towards 'delay.total' and 'delay.enginebackpressure'.
24322432
Unit: metric.Unit_NANOSECONDS,
24332433
}
24342434

2435+
// Closed timestamp policy change metrics.
2436+
metaClosedTimestampPolicyChange = metric.Metadata{
2437+
Name: "kv.closed_timestamp.policy_change",
2438+
Help: "Number of times closed timestamp policy change occurred on ranges",
2439+
Measurement: "Events",
2440+
Unit: metric.Unit_COUNT,
2441+
}
2442+
24352443
// Replica circuit breaker.
24362444
metaReplicaCircuitBreakerCurTripped = metric.Metadata{
24372445
Name: "kv.replica_circuit_breaker.num_tripped_replicas",
@@ -3033,6 +3041,9 @@ type StoreMetrics struct {
30333041
// Closed timestamp metrics.
30343042
ClosedTimestampMaxBehindNanos *metric.Gauge
30353043

3044+
// Closed timestamp policy change on ranges metrics.
3045+
ClosedTimestampPolicyChange *metric.Counter
3046+
30363047
// Replica circuit breaker.
30373048
ReplicaCircuitBreakerCurTripped *metric.Gauge
30383049
ReplicaCircuitBreakerCumTripped *metric.Counter
@@ -3849,6 +3860,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
38493860
// Estimated MVCC stats in split.
38503861
SplitsWithEstimatedStats: metric.NewCounter(metaSplitEstimatedStats),
38513862
SplitEstimatedTotalBytesDiff: metric.NewCounter(metaSplitEstimatedTotalBytesDiff),
3863+
3864+
ClosedTimestampPolicyChange: metric.NewCounter(metaClosedTimestampPolicyChange),
38523865
}
38533866
sm.categoryIterMetrics.init(storeRegistry)
38543867

pkg/kv/kvserver/replica.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1405,7 +1405,7 @@ func (r *Replica) closedTimestampPolicyRLocked() ctpb.RangeClosedTimestampPolicy
14051405
// RefreshPolicy updates the replica's cached closed timestamp policy based on
14061406
// span configurations and provided node round-trip latencies.
14071407
func (r *Replica) RefreshPolicy(latencies map[roachpb.NodeID]time.Duration) {
1408-
policy := func() ctpb.RangeClosedTimestampPolicy {
1408+
computeNewPolicy := func(oldPolicy ctpb.RangeClosedTimestampPolicy) ctpb.RangeClosedTimestampPolicy {
14091409
desc, conf := r.DescAndSpanConfig()
14101410
// The node liveness range ignores zone configs and always uses a
14111411
// LAG_BY_CLUSTER_SETTING closed timestamp policy. If it was to begin
@@ -1438,12 +1438,17 @@ func (r *Replica) RefreshPolicy(latencies map[roachpb.NodeID]time.Duration) {
14381438
maxLatency = max(maxLatency, peerLatency)
14391439
}
14401440
return closedts.FindBucketBasedOnNetworkRTTWithDampening(
1441-
ctpb.RangeClosedTimestampPolicy(r.cachedClosedTimestampPolicy.Load()),
1441+
oldPolicy,
14421442
maxLatency,
14431443
closedts.PolicySwitchWhenLatencyExceedsBucketFraction.Get(&r.store.GetStoreConfig().Settings.SV),
14441444
)
14451445
}
1446-
r.cachedClosedTimestampPolicy.Store(int32(policy()))
1446+
oldPolicy := ctpb.RangeClosedTimestampPolicy(r.cachedClosedTimestampPolicy.Load())
1447+
newPolicy := computeNewPolicy(oldPolicy)
1448+
if newPolicy != oldPolicy {
1449+
r.store.metrics.ClosedTimestampPolicyChange.Inc(1)
1450+
r.cachedClosedTimestampPolicy.Store(int32(newPolicy))
1451+
}
14471452
}
14481453

14491454
// NodeID returns the ID of the node this replica belongs to.

0 commit comments

Comments
 (0)