Skip to content

Commit 8c6b50a

Browse files
committed
rpc: add RTT metrics to different RPC classes
This commit adds four more histogram metrics, one for each of our four RPC connection classes (default, system, rangefeed, and raft).

I added four separate metrics rather than a single metric with four labels created via `NewExportedHistogramVec` because, although the different labels showed up in Prometheus, I couldn't find them in the DB Console when I used `NewExportedHistogramVec`.

Also, I think the code needs some refactoring to separate the RTT recording from the clock offset updating.

Fixes: #151701

Release note: None
1 parent 8c2ac17 commit 8c6b50a

File tree

9 files changed

+259
-25
lines changed

9 files changed

+259
-25
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7369,6 +7369,17 @@ layers:
73697369
unit: COUNT
73707370
aggregation: AVG
73717371
derivative: NONE
7372+
- name: round-trip-default-class-latency
7373+
exported_name: round_trip_default_class_latency
7374+
description: |
7375+
Distribution of round-trip latencies with other nodes.
7376+
7377+
Similar to round-trip-latency, but only for default class connections.
7378+
y_axis_label: Round-trip time
7379+
type: HISTOGRAM
7380+
unit: NANOSECONDS
7381+
aggregation: AVG
7382+
derivative: NONE
73727383
- name: round-trip-latency
73737384
exported_name: round_trip_latency
73747385
description: |
@@ -7386,6 +7397,39 @@ layers:
73867397
unit: NANOSECONDS
73877398
aggregation: AVG
73887399
derivative: NONE
7400+
- name: round-trip-raft-class-latency
7401+
exported_name: round_trip_raft_class_latency
7402+
description: |
7403+
Distribution of round-trip latencies with other nodes.
7404+
7405+
Similar to round-trip-latency, but only for raft class connections.
7406+
y_axis_label: Round-trip time
7407+
type: HISTOGRAM
7408+
unit: NANOSECONDS
7409+
aggregation: AVG
7410+
derivative: NONE
7411+
- name: round-trip-rangefeed-class-latency
7412+
exported_name: round_trip_rangefeed_class_latency
7413+
description: |
7414+
Distribution of round-trip latencies with other nodes.
7415+
7416+
Similar to round-trip-latency, but only for rangefeed class connections.
7417+
y_axis_label: Round-trip time
7418+
type: HISTOGRAM
7419+
unit: NANOSECONDS
7420+
aggregation: AVG
7421+
derivative: NONE
7422+
- name: round-trip-system-class-latency
7423+
exported_name: round_trip_system_class_latency
7424+
description: |
7425+
Distribution of round-trip latencies with other nodes.
7426+
7427+
Similar to round-trip-latency, but only for system class connections.
7428+
y_axis_label: Round-trip time
7429+
type: HISTOGRAM
7430+
unit: NANOSECONDS
7431+
aggregation: AVG
7432+
derivative: NONE
73897433
- name: rpc.client.bytes.egress
73907434
exported_name: rpc_client_bytes_egress
73917435
description: Counter of TCP bytes sent via gRPC on connections we initiated.

pkg/ccl/kvccl/kvfollowerreadsccl/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ go_test(
6262
"//pkg/kv/kvtestutils",
6363
"//pkg/roachpb",
6464
"//pkg/rpc",
65+
"//pkg/rpc/rpcbase",
6566
"//pkg/security/securityassets",
6667
"//pkg/security/securitytest",
6768
"//pkg/server",

pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/cockroachdb/cockroach/pkg/kv/kvtestutils"
3030
"github.com/cockroachdb/cockroach/pkg/roachpb"
3131
"github.com/cockroachdb/cockroach/pkg/rpc"
32+
"github.com/cockroachdb/cockroach/pkg/rpc/rpcbase"
3233
"github.com/cockroachdb/cockroach/pkg/server"
3334
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
3435
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
@@ -539,7 +540,7 @@ func TestOracle(t *testing.T) {
539540
// the exponentially-weighted moving average to work properly. See the
540541
// comment on the WARMUP_SAMPLES const in the ewma package for details.
541542
for i := 0; i < 11; i++ {
542-
rpcContext.RemoteClocks.UpdateOffset(ctx, id, rpc.RemoteOffset{}, latency)
543+
rpcContext.RemoteClocks.UpdateOffset(ctx, id, rpc.RemoteOffset{}, latency, rpcbase.DefaultClass)
543544
}
544545
}
545546
setLatency(1, 100*time.Millisecond)

pkg/roachprod/opentelemetry/cockroachdb_metrics.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1481,6 +1481,22 @@ var cockroachdbMetrics = map[string]string{
14811481
"round_trip_latency_bucket": "round_trip_latency.bucket",
14821482
"round_trip_latency_count": "round_trip_latency.count",
14831483
"round_trip_latency_sum": "round_trip_latency.sum",
1484+
"round_trip_default_class_latency": "round_trip.default_class.latency",
1485+
"round_trip_default_class_latency_bucket": "round_trip.default_class.latency.bucket",
1486+
"round_trip_default_class_latency_count": "round_trip.default_class.latency.count",
1487+
"round_trip_default_class_latency_sum": "round_trip.default_class.latency.sum",
1488+
"round_trip_system_class_latency": "round_trip.system_class.latency",
1489+
"round_trip_system_class_latency_bucket": "round_trip.system_class.latency.bucket",
1490+
"round_trip_system_class_latency_count": "round_trip.system_class.latency.count",
1491+
"round_trip_system_class_latency_sum": "round_trip.system_class.latency.sum",
1492+
"round_trip_rangefeed_class_latency": "round_trip.rangefeed_class.latency",
1493+
"round_trip_rangefeed_class_latency_bucket": "round_trip.rangefeed_class.latency.bucket",
1494+
"round_trip_rangefeed_class_latency_count": "round_trip.rangefeed_class.latency.count",
1495+
"round_trip_rangefeed_class_latency_sum": "round_trip.rangefeed_class.latency.sum",
1496+
"round_trip_raft_class_latency": "round_trip.raft_class.latency",
1497+
"round_trip_raft_class_latency_bucket": "round_trip.raft_class.latency.bucket",
1498+
"round_trip_raft_class_latency_count": "round_trip.raft_class.latency.count",
1499+
"round_trip_raft_class_latency_sum": "round_trip.raft_class.latency.sum",
14841500
"rpc_batches_recv": "rpc.batches.recv",
14851501
"rpc_connection_avg_round_trip_latency": "rpc.connection.avg_round_trip_latency",
14861502
"rpc_connection_failures": "rpc.connection.failures",

pkg/rpc/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ go_test(
160160
"//pkg/util/hlc",
161161
"//pkg/util/leaktest",
162162
"//pkg/util/log",
163+
"//pkg/util/metric",
163164
"//pkg/util/netutil",
164165
"//pkg/util/randutil",
165166
"//pkg/util/stop",

pkg/rpc/clock_offset.go

Lines changed: 94 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"github.com/VividCortex/ewma"
1515
"github.com/cockroachdb/cockroach/pkg/roachpb"
16+
"github.com/cockroachdb/cockroach/pkg/rpc/rpcbase"
1617
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
1718
"github.com/cockroachdb/cockroach/pkg/util/hlc"
1819
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -24,11 +25,15 @@ import (
2425

2526
// RemoteClockMetrics is the collection of metrics for the clock monitor.
2627
type RemoteClockMetrics struct {
27-
ClockOffsetMeanNanos *metric.Gauge
28-
ClockOffsetStdDevNanos *metric.Gauge
29-
ClockOffsetMedianNanos *metric.Gauge
30-
ClockOffsetMedianAbsDevNanos *metric.Gauge
31-
RoundTripLatency metric.IHistogram
28+
ClockOffsetMeanNanos *metric.Gauge
29+
ClockOffsetStdDevNanos *metric.Gauge
30+
ClockOffsetMedianNanos *metric.Gauge
31+
ClockOffsetMedianAbsDevNanos *metric.Gauge
32+
RoundTripLatency metric.IHistogram
33+
RoundTripDefaultClassLatency metric.IHistogram
34+
RoundTripSystemClassLatency metric.IHistogram
35+
RoundTripRangefeedClassLatency metric.IHistogram
36+
RoundTripRaftClassLatency metric.IHistogram
3237
}
3338

3439
// avgLatencyMeasurementAge determines how to exponentially weight the
@@ -85,6 +90,43 @@ can similarly elevate this metric. The operator should look towards OS-level
8590
metrics such as packet loss, retransmits, etc, to conclusively diagnose network
8691
issues. Heartbeats are not very frequent (~seconds), so they may not capture
8792
rare or short-lived degradations.
93+
`,
94+
Measurement: "Round-trip time",
95+
Unit: metric.Unit_NANOSECONDS,
96+
}
97+
98+
metaDefaultConnectionRoundTripLatency = metric.Metadata{
99+
Name: "round-trip-default-class-latency",
100+
Help: `Distribution of round-trip latencies with other nodes.
101+
102+
Similar to round-trip-latency, but only for default class connections.
103+
`,
104+
Measurement: "Round-trip time",
105+
Unit: metric.Unit_NANOSECONDS,
106+
}
107+
metaSystemConnectionRoundTripLatency = metric.Metadata{
108+
Name: "round-trip-system-class-latency",
109+
Help: `Distribution of round-trip latencies with other nodes.
110+
111+
Similar to round-trip-latency, but only for system class connections.
112+
`,
113+
Measurement: "Round-trip time",
114+
Unit: metric.Unit_NANOSECONDS,
115+
}
116+
metaRangefeedConnectionRoundTripLatency = metric.Metadata{
117+
Name: "round-trip-rangefeed-class-latency",
118+
Help: `Distribution of round-trip latencies with other nodes.
119+
120+
Similar to round-trip-latency, but only for rangefeed class connections.
121+
`,
122+
Measurement: "Round-trip time",
123+
Unit: metric.Unit_NANOSECONDS,
124+
}
125+
metaRaftConnectionRoundTripLatency = metric.Metadata{
126+
Name: "round-trip-raft-class-latency",
127+
Help: `Distribution of round-trip latencies with other nodes.
128+
129+
Similar to round-trip-latency, but only for raft class connections.
88130
`,
89131
Measurement: "Round-trip time",
90132
Unit: metric.Unit_NANOSECONDS,
@@ -146,6 +188,20 @@ func (r *RemoteClockMonitor) TestingResetLatencyInfos() {
146188
}
147189
}
148190

191+
// createRoundtripLatencyMetricHelper is a helper function to create a histogram
192+
// metric for an RTT metric with the given metadata and histogram window
193+
// interval.
194+
func createRoundtripLatencyMetricHelper(
195+
meta metric.Metadata, histogramWindowInterval time.Duration,
196+
) metric.IHistogram {
197+
return metric.NewHistogram(metric.HistogramOptions{
198+
Mode: metric.HistogramModePreferHdrLatency,
199+
Metadata: meta,
200+
Duration: histogramWindowInterval,
201+
BucketConfig: metric.IOLatencyBuckets,
202+
})
203+
}
204+
149205
// newRemoteClockMonitor returns a monitor with the given server clock. A
150206
// toleratedOffset of 0 disables offset checking and metrics, but still records
151207
// latency metrics.
@@ -171,15 +227,16 @@ func newRemoteClockMonitor(
171227
ClockOffsetStdDevNanos: metric.NewGauge(metaClockOffsetStdDevNanos),
172228
ClockOffsetMedianNanos: metric.NewGauge(metaClockOffsetMedianNanos),
173229
ClockOffsetMedianAbsDevNanos: metric.NewGauge(metaClockOffsetMedianAbsDevNanos),
174-
RoundTripLatency: metric.NewHistogram(metric.HistogramOptions{
175-
Mode: metric.HistogramModePreferHdrLatency,
176-
Metadata: metaConnectionRoundTripLatency,
177-
Duration: histogramWindowInterval,
178-
// NB: the choice of IO over Network buckets is somewhat debatable, but
179-
// it's fine. Heartbeats can take >1s which the IO buckets can represent,
180-
// but the Network buckets top out at 1s.
181-
BucketConfig: metric.IOLatencyBuckets,
182-
}),
230+
RoundTripLatency: createRoundtripLatencyMetricHelper(metaConnectionRoundTripLatency,
231+
histogramWindowInterval),
232+
RoundTripDefaultClassLatency: createRoundtripLatencyMetricHelper(metaDefaultConnectionRoundTripLatency,
233+
histogramWindowInterval),
234+
RoundTripSystemClassLatency: createRoundtripLatencyMetricHelper(metaSystemConnectionRoundTripLatency,
235+
histogramWindowInterval),
236+
RoundTripRangefeedClassLatency: createRoundtripLatencyMetricHelper(metaRangefeedConnectionRoundTripLatency,
237+
histogramWindowInterval),
238+
RoundTripRaftClassLatency: createRoundtripLatencyMetricHelper(metaRaftConnectionRoundTripLatency,
239+
histogramWindowInterval),
183240
}
184241
return &r
185242
}
@@ -251,7 +308,11 @@ func (r *RemoteClockMonitor) OnDisconnect(_ context.Context, nodeID roachpb.Node
251308
//
252309
// Pass a roundTripLatency of 0 or less to avoid recording the latency.
253310
func (r *RemoteClockMonitor) UpdateOffset(
254-
ctx context.Context, id roachpb.NodeID, offset RemoteOffset, roundTripLatency time.Duration,
311+
ctx context.Context,
312+
id roachpb.NodeID,
313+
offset RemoteOffset,
314+
roundTripLatency time.Duration,
315+
rpcClass rpcbase.ConnectionClass,
255316
) {
256317
emptyOffset := offset == RemoteOffset{}
257318
// At startup the remote node's id may not be set. Skip recording latency.
@@ -299,6 +360,22 @@ func (r *RemoteClockMonitor) UpdateOffset(
299360
prevAvg := info.avgNanos.Value()
300361
info.avgNanos.Add(newLatencyf)
301362
r.metrics.RoundTripLatency.RecordValue(roundTripLatency.Nanoseconds())
363+
switch rpcClass {
364+
case rpcbase.DefaultClass:
365+
r.metrics.RoundTripDefaultClassLatency.RecordValue(roundTripLatency.Nanoseconds())
366+
case rpcbase.SystemClass:
367+
r.metrics.RoundTripSystemClassLatency.RecordValue(roundTripLatency.Nanoseconds())
368+
case rpcbase.RangefeedClass:
369+
r.metrics.RoundTripRangefeedClassLatency.RecordValue(roundTripLatency.Nanoseconds())
370+
case rpcbase.RaftClass:
371+
r.metrics.RoundTripRaftClassLatency.RecordValue(roundTripLatency.Nanoseconds())
372+
default:
373+
log.Dev.Warningf(ctx, "unknown RPC class: %s", rpcClass)
374+
if buildutil.CrdbTestBuild {
375+
panic(errors.AssertionFailedf("unknown RPC class: %s", rpcClass))
376+
}
377+
378+
}
302379

303380
// See: https://github.com/cockroachdb/cockroach/issues/96262
304381
// See: https://github.com/cockroachdb/cockroach/issues/98066
@@ -427,6 +504,7 @@ func updateClockOffsetTracking(
427504
nodeID roachpb.NodeID,
428505
sendTime, serverTime, receiveTime time.Time,
429506
toleratedOffset time.Duration,
507+
rpcClass rpcbase.ConnectionClass,
430508
) (time.Duration, RemoteOffset, error) {
431509
pingDuration := receiveTime.Sub(sendTime)
432510
if remoteClocks == nil {
@@ -455,7 +533,7 @@ func updateClockOffsetTracking(
455533
remoteTimeNow := serverTime.Add(pingDuration / 2)
456534
offset.Offset = remoteTimeNow.Sub(receiveTime).Nanoseconds()
457535
}
458-
remoteClocks.UpdateOffset(ctx, nodeID, offset, pingDuration)
536+
remoteClocks.UpdateOffset(ctx, nodeID, offset, pingDuration, rpcClass)
459537
return pingDuration, offset, remoteClocks.VerifyClockOffset(ctx)
460538
}
461539

0 commit comments

Comments
 (0)