Skip to content

Commit a4f4cac

Browse files
committed
kvserver: add Store metrics for raft log size
Epic: none. Release note: none.
1 parent a5d7c25 commit a4f4cac

File tree

5 files changed

+30
-0
lines changed

5 files changed

+30
-0
lines changed

docs/generated/metrics/metrics.html

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,8 @@
531531
<tr><td>STORAGE</td><td>raft.transport.sends-dropped</td><td>Number of Raft message sends dropped by the Raft Transport</td><td>Messages</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
532532
<tr><td>STORAGE</td><td>raft.transport.sent</td><td>Number of Raft messages sent by the Raft Transport</td><td>Messages</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
533533
<tr><td>STORAGE</td><td>raftlog.behind</td><td>Number of Raft log entries followers on other stores are behind.<br/><br/>This gauge provides a view of the aggregate number of log entries the Raft leaders<br/>on this node think the followers are behind. Since a raft leader may not always<br/>have a good estimate for this information for all of its followers, and since<br/>followers are expected to be behind (when they are not required as part of a<br/>quorum) *and* the aggregate thus scales like the count of such followers, it is<br/>difficult to meaningfully interpret this metric.</td><td>Log Entries</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
534+
<tr><td>STORAGE</td><td>raftlog.size.max</td><td>Approximate size of the largest Raft log on the store.</td><td>Bytes</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
535+
<tr><td>STORAGE</td><td>raftlog.size.total</td><td>Approximate size of all Raft logs on the store.</td><td>Bytes</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
534536
<tr><td>STORAGE</td><td>raftlog.truncated</td><td>Number of Raft log entries truncated</td><td>Log Entries</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
535537
<tr><td>STORAGE</td><td>range.adds</td><td>Number of range additions</td><td>Range Ops</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
536538
<tr><td>STORAGE</td><td>range.merges</td><td>Number of range merges</td><td>Range Ops</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1799,6 +1799,19 @@ difficult to meaningfully interpret this metric.`,
17991799
Unit: metric.Unit_COUNT,
18001800
}
18011801

1802+
metaRaftLogTotalSize = metric.Metadata{
1803+
Name: "raftlog.size.total",
1804+
Help: "Approximate size of all Raft logs on the store.",
1805+
Measurement: "Bytes",
1806+
Unit: metric.Unit_BYTES,
1807+
}
1808+
metaRaftLogMaxSize = metric.Metadata{
1809+
Name: "raftlog.size.max",
1810+
Help: "Approximate size of the largest Raft log on the store.",
1811+
Measurement: "Bytes",
1812+
Unit: metric.Unit_BYTES,
1813+
}
1814+
18021815
metaRaftFollowerPaused = metric.Metadata{
18031816
Name: "admission.raft.paused_replicas",
18041817
Help: `Number of followers (i.e. Replicas) to which replication is currently paused to help them recover from I/O overload.
@@ -2874,6 +2887,8 @@ type StoreMetrics struct {
28742887
// Raft log metrics.
28752888
RaftLogFollowerBehindCount *metric.Gauge
28762889
RaftLogTruncated *metric.Counter
2890+
RaftLogTotalSize *metric.Gauge
2891+
RaftLogMaxSize *metric.Gauge
28772892

28782893
RaftPausedFollowerCount *metric.Gauge
28792894
RaftPausedFollowerDroppedMsgs *metric.Counter
@@ -3643,6 +3658,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
36433658
// Raft log metrics.
36443659
RaftLogFollowerBehindCount: metric.NewGauge(metaRaftLogFollowerBehindCount),
36453660
RaftLogTruncated: metric.NewCounter(metaRaftLogTruncated),
3661+
RaftLogTotalSize: metric.NewGauge(metaRaftLogTotalSize),
3662+
RaftLogMaxSize: metric.NewGauge(metaRaftLogMaxSize),
36463663

36473664
RaftPausedFollowerCount: metric.NewGauge(metaRaftFollowerPaused),
36483665
RaftPausedFollowerDroppedMsgs: metric.NewCounter(metaRaftPausedFollowerDroppedMsgs),

pkg/kv/kvserver/replica_metrics.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ type ReplicaMetrics struct {
5454
Underreplicated bool
5555
Overreplicated bool
5656
Decommissioning bool
57+
RaftLogSize int64
5758
RaftLogTooLarge bool
5859
RangeTooLarge bool
5960
BehindCount int64
@@ -213,6 +214,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
213214
Underreplicated: underreplicated,
214215
Overreplicated: overreplicated,
215216
Decommissioning: decommissioning,
217+
RaftLogSize: d.raftLogSize,
216218
RaftLogTooLarge: d.raftLogSizeTrusted &&
217219
d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold,
218220
RangeTooLarge: tooLarge,

pkg/kv/kvserver/replica_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9275,6 +9275,7 @@ func TestReplicaMetrics(t *testing.T) {
92759275
Unavailable: false,
92769276
Underreplicated: false,
92779277
BehindCount: 10,
9278+
RaftLogSize: 5 * cfg.RaftLogTruncationThreshold,
92789279
RaftLogTooLarge: true,
92799280
LeaderNotFortified: true,
92809281
RaftFlowStateCounts: [3]int64{1, 0, 0},

pkg/kv/kvserver/store.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3359,6 +3359,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
33593359
averageWriteBytesPerSecond float64
33603360
averageCPUNanosPerSecond float64
33613361

3362+
totalRaftLogSize int64
3363+
maxRaftLogSize int64
3364+
33623365
rangeCount int64
33633366
unavailableRangeCount int64
33643367
underreplicatedRangeCount int64
@@ -3489,6 +3492,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
34893492
s.metrics.RecentReplicaCPUNanosPerSecond.RecordValue(replicaCPUNanosPerSecond)
34903493
s.metrics.RecentReplicaQueriesPerSecond.RecordValue(loadStats.QueriesPerSecond)
34913494

3495+
totalRaftLogSize += metrics.RaftLogSize
3496+
maxRaftLogSize = max(maxRaftLogSize, metrics.RaftLogSize)
3497+
34923498
locks += metrics.LockTableMetrics.Locks
34933499
totalLockHoldDurationNanos += metrics.LockTableMetrics.TotalLockHoldDurationNanos
34943500
locksWithWaitQueues += metrics.LockTableMetrics.LocksWithWaitQueues
@@ -3527,6 +3533,8 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
35273533
for state, cnt := range raftFlowStateCounts {
35283534
s.metrics.RaftFlowStateCounts[state].Update(cnt)
35293535
}
3536+
s.metrics.RaftLogTotalSize.Update(totalRaftLogSize)
3537+
s.metrics.RaftLogMaxSize.Update(maxRaftLogSize)
35303538
s.metrics.AverageQueriesPerSecond.Update(averageQueriesPerSecond)
35313539
s.metrics.AverageRequestsPerSecond.Update(averageRequestsPerSecond)
35323540
s.metrics.AverageWritesPerSecond.Update(averageWritesPerSecond)

0 commit comments

Comments
 (0)