
Commit e013969

kvserver: add more metrics for policies
Previously, it was difficult to determine how many ranges fell into each latency-bucket policy. This commit adds 18 new metrics to StoreMetrics to track the number of ranges per policy bucket for every store.

Part of: #143890
Release note: none
1 parent 16fbe0b commit e013969

File tree

4 files changed: +68 -21 lines

docs/generated/metrics/metrics.html
pkg/kv/kvserver/metrics.go
pkg/kv/kvserver/replica_metrics.go
pkg/kv/kvserver/store.go

docs/generated/metrics/metrics.html

Lines changed: 18 additions & 0 deletions
@@ -193,6 +193,24 @@
 <tr><td>STORAGE</td><td>kv.allocator.load_based_replica_rebalancing.missing_stats_for_existing_store</td><td>The number times the allocator was missing the qps stats for the existing store</td><td>Attempts</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>kv.allocator.load_based_replica_rebalancing.should_transfer</td><td>The number times the allocator determined that the replica should be rebalanced to another store for better load distribution</td><td>Attempts</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>kv.closed_timestamp.max_behind_nanos</td><td>Largest latency between realtime and replica max closed timestamp</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lag_by_cluster_setting</td><td>Number of ranges with LAG_BY_CLUSTER_SETTING closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_equal_or_greater_than_300ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_EQUAL_OR_GREATER_THAN_300MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_100ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_100MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_120ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_120MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_140ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_140MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_160ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_160MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_180ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_180MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_200ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_200MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_20ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_20MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_220ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_220MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_240ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_240MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_260ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_260MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_280ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_280MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_300ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_300MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_40ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_60ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_latency_less_than_80ms</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_80MS closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.closed_timestamp.policy.lead_for_global_reads_with_no_latency_info</td><td>Number of ranges with LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO closed timestamp policy</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.closed_timestamp.policy_change</td><td>Number of times closed timestamp policy change occurred on ranges</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>kv.concurrency.avg_lock_hold_duration_nanos</td><td>Average lock hold duration across locks currently held in lock tables. Does not include replicated locks (intents) that are not held in memory</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.concurrency.avg_lock_wait_duration_nanos</td><td>Average lock wait duration across requests currently waiting in lock wait-queues</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
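
For a quick sanity check after picking up this change, the new gauges can be spot-checked on a running node. The sketch below is not part of this commit: the /_status/vars endpoint, the HTTP port, and the dot-to-underscore renaming of exported names (e.g. kv_closed_timestamp_policy_lag_by_cluster_setting) are assumptions about a typical CockroachDB deployment.

```go
// spotcheck.go: standalone sketch, not CockroachDB code. Scrapes one node's
// Prometheus endpoint and prints only the per-policy closed timestamp gauges.
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Hypothetical node address; adjust host and port for your cluster.
	resp, err := http.Get("http://localhost:8080/_status/vars")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		line := sc.Text()
		// Assumed exported form of the kv.closed_timestamp.policy.* metrics.
		if strings.HasPrefix(line, "kv_closed_timestamp_policy_") {
			fmt.Println(line)
		}
	}
	if err := sc.Err(); err != nil {
		panic(err)
	}
}
```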

pkg/kv/kvserver/metrics.go

Lines changed: 28 additions & 10 deletions
@@ -8,11 +8,13 @@ package kvserver
 import (
 	"context"
 	"fmt"
+	"strings"
 	"sync/atomic"
 	"time"

 	"github.com/cockroachdb/cockroach/pkg/kv/kvbase"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/split"
@@ -2672,11 +2674,12 @@ type StoreMetrics struct {
 	RaftFlowStateCounts [tracker.StateCount]*metric.Gauge

 	// Range metrics.
-	RangeCount                *metric.Gauge
-	UnavailableRangeCount     *metric.Gauge
-	UnderReplicatedRangeCount *metric.Gauge
-	OverReplicatedRangeCount  *metric.Gauge
-	DecommissioningRangeCount *metric.Gauge
+	RangeCount                      *metric.Gauge
+	UnavailableRangeCount           *metric.Gauge
+	UnderReplicatedRangeCount       *metric.Gauge
+	OverReplicatedRangeCount        *metric.Gauge
+	DecommissioningRangeCount       *metric.Gauge
+	RangeClosedTimestampPolicyCount [ctpb.MAX_CLOSED_TIMESTAMP_POLICY]*metric.Gauge

 	// Lease request metrics for successful and failed lease requests. These
 	// count proposals (i.e. it does not matter how many replicas apply the
@@ -3385,11 +3388,12 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RaftFlowStateCounts: raftFlowStateGaugeSlice(),

 		// Range metrics.
-		RangeCount:                metric.NewGauge(metaRangeCount),
-		UnavailableRangeCount:     metric.NewGauge(metaUnavailableRangeCount),
-		UnderReplicatedRangeCount: metric.NewGauge(metaUnderReplicatedRangeCount),
-		OverReplicatedRangeCount:  metric.NewGauge(metaOverReplicatedRangeCount),
-		DecommissioningRangeCount: metric.NewGauge(metaDecommissioningRangeCount),
+		RangeCount:                      metric.NewGauge(metaRangeCount),
+		UnavailableRangeCount:           metric.NewGauge(metaUnavailableRangeCount),
+		UnderReplicatedRangeCount:       metric.NewGauge(metaUnderReplicatedRangeCount),
+		OverReplicatedRangeCount:        metric.NewGauge(metaOverReplicatedRangeCount),
+		DecommissioningRangeCount:       metric.NewGauge(metaDecommissioningRangeCount),
+		RangeClosedTimestampPolicyCount: makePolicyRefresherMetrics(),

 		// Lease request metrics.
 		LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount),
@@ -4168,6 +4172,20 @@ func raftFlowStateGaugeSlice() [tracker.StateCount]*metric.Gauge {
 	return gauges
 }

+func makePolicyRefresherMetrics() [ctpb.MAX_CLOSED_TIMESTAMP_POLICY]*metric.Gauge {
+	var policyGauges [ctpb.MAX_CLOSED_TIMESTAMP_POLICY]*metric.Gauge
+	for policy := ctpb.LAG_BY_CLUSTER_SETTING; policy < ctpb.MAX_CLOSED_TIMESTAMP_POLICY; policy++ {
+		meta := metric.Metadata{
+			Name:        fmt.Sprintf("kv.closed_timestamp.policy.%s", strings.ToLower(policy.String())),
+			Help:        fmt.Sprintf("Number of ranges with %s closed timestamp policy", policy.String()),
+			Measurement: "Ranges",
+			Unit:        metric.Unit_COUNT,
+		}
+		policyGauges[policy] = metric.NewGauge(meta)
+	}
+	return policyGauges
+}
+
 func storageLevelMetricMetadata(
 	name, helpTpl, measurement string, unit metric.Unit,
 ) [7]metric.Metadata {
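
The 18 names documented in metrics.html are derived mechanically: makePolicyRefresherMetrics walks the ctpb policy enum up to the MAX_CLOSED_TIMESTAMP_POLICY sentinel and lower-cases each value's String() form. Below is a standalone sketch of that pattern; the rangePolicy type and its constants are illustrative stand-ins, not the real ctpb definitions.

```go
// namingpattern.go: standalone sketch of deriving per-policy metric names
// from an int-backed enum, mirroring makePolicyRefresherMetrics above.
package main

import (
	"fmt"
	"strings"
)

type rangePolicy int

const (
	lagByClusterSetting rangePolicy = iota
	leadForGlobalReadsWithNoLatencyInfo
	maxPolicy // sentinel: one past the last real policy, sizes arrays
)

func (p rangePolicy) String() string {
	switch p {
	case lagByClusterSetting:
		return "LAG_BY_CLUSTER_SETTING"
	case leadForGlobalReadsWithNoLatencyInfo:
		return "LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO"
	default:
		return "UNKNOWN"
	}
}

func main() {
	// One gauge name and help string per enum value, exactly as the
	// commit derives them from ctpb.
	for p := lagByClusterSetting; p < maxPolicy; p++ {
		name := fmt.Sprintf("kv.closed_timestamp.policy.%s", strings.ToLower(p.String()))
		help := fmt.Sprintf("Number of ranges with %s closed timestamp policy", p)
		fmt.Printf("%s -> %s\n", name, help)
	}
}
```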

pkg/kv/kvserver/replica_metrics.go

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
@@ -62,6 +63,7 @@ type ReplicaMetrics struct {
 	PendingRaftProposalCount int64
 	SlowRaftProposalCount    int64
 	RaftFlowStateCounts      [tracker.StateCount]int64
+	ClosedTimestampPolicy    ctpb.RangeClosedTimestampPolicy

 	QuotaPoolPercentUsed int64 // [0,100]

@@ -123,6 +125,7 @@ func (r *Replica) Metrics(
 		paused:                   r.mu.pausedFollowers,
 		pendingRaftProposalCount: r.numPendingProposalsRLocked(),
 		slowRaftProposalCount:    r.mu.slowProposalCount,
+		closedTimestampPolicy:    ctpb.RangeClosedTimestampPolicy(r.cachedClosedTimestampPolicy.Load()),
 	}

 	r.mu.RUnlock()
@@ -154,6 +157,7 @@ type calcReplicaMetricsInput struct {
 	paused                   map[roachpb.ReplicaID]struct{}
 	pendingRaftProposalCount int64
 	slowRaftProposalCount    int64
+	closedTimestampPolicy    ctpb.RangeClosedTimestampPolicy
 }

 func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
@@ -226,6 +230,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
 		QuotaPoolPercentUsed: calcQuotaPoolPercentUsed(d.qpUsed, d.qpCapacity),
 		LatchMetrics:         d.latchMetrics,
 		LockTableMetrics:     d.lockTableMetrics,
+		ClosedTimestampPolicy: d.closedTimestampPolicy,
 	}
 }
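
Note that the policy is read via r.cachedClosedTimestampPolicy.Load(), which suggests the replica keeps its last-computed policy in an atomic so Metrics() can pick it up cheaply while holding only the read lock. A minimal sketch of that caching pattern follows; the replica, rangeClosedTimestampPolicy, and method names are illustrative stand-ins, not CockroachDB's.

```go
// cachedpolicy.go: standalone sketch of caching an enum value in an atomic
// so metrics collection can read it without recomputation.
package main

import (
	"fmt"
	"sync/atomic"
)

type rangeClosedTimestampPolicy int32

type replica struct {
	cachedPolicy atomic.Int32 // stores a rangeClosedTimestampPolicy
}

// setPolicy would be called by whatever recomputes the policy (e.g. a refresher).
func (r *replica) setPolicy(p rangeClosedTimestampPolicy) {
	r.cachedPolicy.Store(int32(p))
}

// currentPolicy is the cheap read used when building per-replica metrics.
func (r *replica) currentPolicy() rangeClosedTimestampPolicy {
	return rangeClosedTimestampPolicy(r.cachedPolicy.Load())
}

func main() {
	var r replica
	r.setPolicy(1)
	fmt.Println("policy:", r.currentPolicy())
}
```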

pkg/kv/kvserver/store.go

Lines changed: 17 additions & 11 deletions
@@ -33,6 +33,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/load"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/policyrefresher"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/sidetransport"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/idalloc"
@@ -3360,17 +3361,18 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 		totalRaftLogSize int64
 		maxRaftLogSize   int64

-		rangeCount                int64
-		unavailableRangeCount     int64
-		underreplicatedRangeCount int64
-		overreplicatedRangeCount  int64
-		decommissioningRangeCount int64
-		behindCount               int64
-		pausedFollowerCount       int64
-		ioOverload                float64
-		pendingRaftProposalCount  int64
-		slowRaftProposalCount     int64
-		raftFlowStateCounts       [tracker.StateCount]int64
+		rangeCount                  int64
+		unavailableRangeCount       int64
+		underreplicatedRangeCount   int64
+		overreplicatedRangeCount    int64
+		decommissioningRangeCount   int64
+		behindCount                 int64
+		pausedFollowerCount         int64
+		ioOverload                  float64
+		pendingRaftProposalCount    int64
+		slowRaftProposalCount       int64
+		raftFlowStateCounts         [tracker.StateCount]int64
+		closedTimestampPolicyCounts [ctpb.MAX_CLOSED_TIMESTAMP_POLICY]int64

 		locks                      int64
 		totalLockHoldDurationNanos int64
@@ -3429,6 +3431,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 		}
 		if metrics.Leaseholder {
 			s.metrics.RaftQuotaPoolPercentUsed.RecordValue(metrics.QuotaPoolPercentUsed)
+			closedTimestampPolicyCounts[metrics.ClosedTimestampPolicy] += 1
 			leaseHolderCount++
 			switch metrics.LeaseType {
 			case roachpb.LeaseNone:
@@ -3531,6 +3534,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 	for state, cnt := range raftFlowStateCounts {
 		s.metrics.RaftFlowStateCounts[state].Update(cnt)
 	}
+	for policy, count := range closedTimestampPolicyCounts {
+		s.metrics.RangeClosedTimestampPolicyCount[policy].Update(count)
+	}
 	s.metrics.RaftLogTotalSize.Update(totalRaftLogSize)
 	s.metrics.RaftLogMaxSize.Update(maxRaftLogSize)
 	s.metrics.AverageQueriesPerSecond.Update(averageQueriesPerSecond)
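
Taken together, updateReplicationGauges counts each leaseholder replica into a fixed-size per-policy array and then publishes one gauge per policy bucket. Below is a simplified, standalone sketch of that aggregation with stand-in types for StoreMetrics and ReplicaMetrics and a hard-coded maxPolicy; it is not the CockroachDB code itself.

```go
// aggregation.go: standalone sketch of the count-then-publish pattern used
// for the per-policy range gauges.
package main

import "fmt"

const maxPolicy = 3 // stand-in for ctpb.MAX_CLOSED_TIMESTAMP_POLICY

type replicaMetrics struct {
	leaseholder bool
	policy      int // stand-in for ctpb.RangeClosedTimestampPolicy
}

type gauge struct{ v int64 }

func (g *gauge) Update(v int64) { g.v = v }

func main() {
	replicas := []replicaMetrics{
		{leaseholder: true, policy: 0},
		{leaseholder: true, policy: 2},
		{leaseholder: false, policy: 1}, // followers are not counted
		{leaseholder: true, policy: 2},
	}

	// Count leaseholder replicas per policy bucket.
	var counts [maxPolicy]int64
	for _, m := range replicas {
		if m.leaseholder {
			counts[m.policy]++
		}
	}

	// Publish one gauge per policy, mirroring RangeClosedTimestampPolicyCount.
	var gauges [maxPolicy]gauge
	for policy, count := range counts {
		gauges[policy].Update(count)
	}
	fmt.Println(gauges) // [{1} {0} {2}]
}
```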
