@@ -13,6 +13,7 @@ import (
13
13
14
14
"github.com/VividCortex/ewma"
15
15
"github.com/cockroachdb/cockroach/pkg/roachpb"
16
+ "github.com/cockroachdb/cockroach/pkg/rpc/rpcbase"
16
17
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
17
18
"github.com/cockroachdb/cockroach/pkg/util/hlc"
18
19
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -24,11 +25,15 @@ import (
24
25
25
26
// RemoteClockMetrics is the collection of metrics for the clock monitor.
26
27
type RemoteClockMetrics struct {
27
- ClockOffsetMeanNanos * metric.Gauge
28
- ClockOffsetStdDevNanos * metric.Gauge
29
- ClockOffsetMedianNanos * metric.Gauge
30
- ClockOffsetMedianAbsDevNanos * metric.Gauge
31
- RoundTripLatency metric.IHistogram
28
+ ClockOffsetMeanNanos * metric.Gauge
29
+ ClockOffsetStdDevNanos * metric.Gauge
30
+ ClockOffsetMedianNanos * metric.Gauge
31
+ ClockOffsetMedianAbsDevNanos * metric.Gauge
32
+ RoundTripLatency metric.IHistogram
33
+ RoundTripDefaultClassLatency metric.IHistogram
34
+ RoundTripSystemClassLatency metric.IHistogram
35
+ RoundTripRangefeedClassLatency metric.IHistogram
36
+ RoundTripRaftClassLatency metric.IHistogram
32
37
}
33
38
34
39
// avgLatencyMeasurementAge determines how to exponentially weight the
@@ -85,6 +90,43 @@ can similarly elevate this metric. The operator should look towards OS-level
85
90
metrics such as packet loss, retransmits, etc, to conclusively diagnose network
86
91
issues. Heartbeats are not very frequent (~seconds), so they may not capture
87
92
rare or short-lived degradations.
93
+ ` ,
94
+ Measurement : "Round-trip time" ,
95
+ Unit : metric .Unit_NANOSECONDS ,
96
+ }
97
+
98
+ metaDefaultConnectionRoundTripLatency = metric.Metadata {
99
+ Name : "round-trip-default-class-latency" ,
100
+ Help : `Distribution of round-trip latencies with other nodes.
101
+
102
+ Similar to round-trip-latency, but only for default class connections.
103
+ ` ,
104
+ Measurement : "Round-trip time" ,
105
+ Unit : metric .Unit_NANOSECONDS ,
106
+ }
107
+ metaSystemConnectionRoundTripLatency = metric.Metadata {
108
+ Name : "round-trip-system-class-latency" ,
109
+ Help : `Distribution of round-trip latencies with other nodes.
110
+
111
+ Similar to round-trip-latency, but only for system class connections.
112
+ ` ,
113
+ Measurement : "Round-trip time" ,
114
+ Unit : metric .Unit_NANOSECONDS ,
115
+ }
116
+ metaRangefeedConnectionRoundTripLatency = metric.Metadata {
117
+ Name : "round-trip-rangefeed-class-latency" ,
118
+ Help : `Distribution of round-trip latencies with other nodes.
119
+
120
+ Similar to round-trip-latency, but only for rangefeed class connections.
121
+ ` ,
122
+ Measurement : "Round-trip time" ,
123
+ Unit : metric .Unit_NANOSECONDS ,
124
+ }
125
+ metaRaftConnectionRoundTripLatency = metric.Metadata {
126
+ Name : "round-trip-raft-class-latency" ,
127
+ Help : `Distribution of round-trip latencies with other nodes.
128
+
129
+ Similar to round-trip-latency, but only for raft class connections.
88
130
` ,
89
131
Measurement : "Round-trip time" ,
90
132
Unit : metric .Unit_NANOSECONDS ,
@@ -146,6 +188,20 @@ func (r *RemoteClockMonitor) TestingResetLatencyInfos() {
146
188
}
147
189
}
148
190
191
+ // createRoundtripLatencyMetricHelper is a helper function to create a histogram
192
+ // metric for an RTT metric with the given metadata and histogram window
193
+ // interval.
194
+ func createRoundtripLatencyMetricHelper (
195
+ meta metric.Metadata , histogramWindowInterval time.Duration ,
196
+ ) metric.IHistogram {
197
+ return metric .NewHistogram (metric.HistogramOptions {
198
+ Mode : metric .HistogramModePreferHdrLatency ,
199
+ Metadata : meta ,
200
+ Duration : histogramWindowInterval ,
201
+ BucketConfig : metric .IOLatencyBuckets ,
202
+ })
203
+ }
204
+
149
205
// newRemoteClockMonitor returns a monitor with the given server clock. A
150
206
// toleratedOffset of 0 disables offset checking and metrics, but still records
151
207
// latency metrics.
@@ -171,15 +227,16 @@ func newRemoteClockMonitor(
171
227
ClockOffsetStdDevNanos : metric .NewGauge (metaClockOffsetStdDevNanos ),
172
228
ClockOffsetMedianNanos : metric .NewGauge (metaClockOffsetMedianNanos ),
173
229
ClockOffsetMedianAbsDevNanos : metric .NewGauge (metaClockOffsetMedianAbsDevNanos ),
174
- RoundTripLatency : metric .NewHistogram (metric.HistogramOptions {
175
- Mode : metric .HistogramModePreferHdrLatency ,
176
- Metadata : metaConnectionRoundTripLatency ,
177
- Duration : histogramWindowInterval ,
178
- // NB: the choice of IO over Network buckets is somewhat debatable, but
179
- // it's fine. Heartbeats can take >1s which the IO buckets can represent,
180
- // but the Network buckets top out at 1s.
181
- BucketConfig : metric .IOLatencyBuckets ,
182
- }),
230
+ RoundTripLatency : createRoundtripLatencyMetricHelper (metaConnectionRoundTripLatency ,
231
+ histogramWindowInterval ),
232
+ RoundTripDefaultClassLatency : createRoundtripLatencyMetricHelper (metaDefaultConnectionRoundTripLatency ,
233
+ histogramWindowInterval ),
234
+ RoundTripSystemClassLatency : createRoundtripLatencyMetricHelper (metaSystemConnectionRoundTripLatency ,
235
+ histogramWindowInterval ),
236
+ RoundTripRangefeedClassLatency : createRoundtripLatencyMetricHelper (metaRangefeedConnectionRoundTripLatency ,
237
+ histogramWindowInterval ),
238
+ RoundTripRaftClassLatency : createRoundtripLatencyMetricHelper (metaRaftConnectionRoundTripLatency ,
239
+ histogramWindowInterval ),
183
240
}
184
241
return & r
185
242
}
@@ -251,7 +308,11 @@ func (r *RemoteClockMonitor) OnDisconnect(_ context.Context, nodeID roachpb.Node
251
308
//
252
309
// Pass a roundTripLatency of 0 or less to avoid recording the latency.
253
310
func (r * RemoteClockMonitor ) UpdateOffset (
254
- ctx context.Context , id roachpb.NodeID , offset RemoteOffset , roundTripLatency time.Duration ,
311
+ ctx context.Context ,
312
+ id roachpb.NodeID ,
313
+ offset RemoteOffset ,
314
+ roundTripLatency time.Duration ,
315
+ rpcClass rpcbase.ConnectionClass ,
255
316
) {
256
317
emptyOffset := offset == RemoteOffset {}
257
318
// At startup the remote node's id may not be set. Skip recording latency.
@@ -299,6 +360,22 @@ func (r *RemoteClockMonitor) UpdateOffset(
299
360
prevAvg := info .avgNanos .Value ()
300
361
info .avgNanos .Add (newLatencyf )
301
362
r .metrics .RoundTripLatency .RecordValue (roundTripLatency .Nanoseconds ())
363
+ switch rpcClass {
364
+ case rpcbase .DefaultClass :
365
+ r .metrics .RoundTripDefaultClassLatency .RecordValue (roundTripLatency .Nanoseconds ())
366
+ case rpcbase .SystemClass :
367
+ r .metrics .RoundTripSystemClassLatency .RecordValue (roundTripLatency .Nanoseconds ())
368
+ case rpcbase .RangefeedClass :
369
+ r .metrics .RoundTripRangefeedClassLatency .RecordValue (roundTripLatency .Nanoseconds ())
370
+ case rpcbase .RaftClass :
371
+ r .metrics .RoundTripRaftClassLatency .RecordValue (roundTripLatency .Nanoseconds ())
372
+ default :
373
+ log .Dev .Warningf (ctx , "unknown RPC class: %s" , rpcClass )
374
+ if buildutil .CrdbTestBuild {
375
+ panic (errors .AssertionFailedf ("unknown RPC class: %s" , rpcClass ))
376
+ }
377
+
378
+ }
302
379
303
380
// See: https://github.com/cockroachdb/cockroach/issues/96262
304
381
// See: https://github.com/cockroachdb/cockroach/issues/98066
@@ -427,6 +504,7 @@ func updateClockOffsetTracking(
427
504
nodeID roachpb.NodeID ,
428
505
sendTime , serverTime , receiveTime time.Time ,
429
506
toleratedOffset time.Duration ,
507
+ rpcClass rpcbase.ConnectionClass ,
430
508
) (time.Duration , RemoteOffset , error ) {
431
509
pingDuration := receiveTime .Sub (sendTime )
432
510
if remoteClocks == nil {
@@ -455,7 +533,7 @@ func updateClockOffsetTracking(
455
533
remoteTimeNow := serverTime .Add (pingDuration / 2 )
456
534
offset .Offset = remoteTimeNow .Sub (receiveTime ).Nanoseconds ()
457
535
}
458
- remoteClocks .UpdateOffset (ctx , nodeID , offset , pingDuration )
536
+ remoteClocks .UpdateOffset (ctx , nodeID , offset , pingDuration , rpcClass )
459
537
return pingDuration , offset , remoteClocks .VerifyClockOffset (ctx )
460
538
}
461
539
0 commit comments