
Commit 3d8c5ca

rpc: add tcp_rtt and tcp_rtt_var metrics for gRPC
Previously, our only metric for gauging network latency was
rpc.connection.avg_round_trip_latency. This metric was calculated by timing
heartbeats in the RPC layer. However, because these measurements are computed
within cockroach, they can be confounded by CPU-heavy workloads. Through
escalations, we've found that elevated network latencies (outside of CRDB's
control) can severely degrade cluster performance. So, being able to directly
and accurately identify these cases would be helpful.

To address this, this patch introduces two new metrics whose values are
computed by Linux. As kernel-computed metrics, these are less sensitive to CPU
overload:

1. rpc.connection.tcp_rtt: TCP smoothed round-trip time
2. rpc.connection.tcp_rtt_var: TCP round-trip time variance

Since these metrics are internally aggregated by Linux, we only need to sample
them periodically. We update them in the heartbeat loop, at the same cadence
as the original avg_round_trip_latency.

To obtain these metrics, we need access to the underlying *net.TCPConn of our
gRPC peer connection. So, the dial function we pass to gRPC has been modified
to update the tcpConn field of the peer struct on each network dial.

Part of: #149959

Release note: None
1 parent dbc05bb commit 3d8c5ca
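For context on what the kernel exposes here: SRTT and RTTVAR are available
per-socket through the TCP_INFO socket option. The following is a minimal
sketch of reading them from a *net.TCPConn on Linux using
golang.org/x/sys/unix. It is illustrative only; the commit wires this through
pkg/util/sysutil, and that helper's exact name and signature are not shown on
this page.

// Illustrative sketch; the real helper lives in pkg/util/sysutil and may differ.
package sysutil

import (
    "net"
    "time"

    "golang.org/x/sys/unix"
)

// KernelTCPRTT reads the kernel's smoothed round-trip time (SRTT) and RTT
// variance (RTTVAR) for conn via the TCP_INFO socket option. Linux-only.
func KernelTCPRTT(conn *net.TCPConn) (rtt, rttVar time.Duration, err error) {
    raw, rawErr := conn.SyscallConn()
    if rawErr != nil {
        return 0, 0, rawErr
    }
    var info *unix.TCPInfo
    if ctlErr := raw.Control(func(fd uintptr) {
        info, err = unix.GetsockoptTCPInfo(int(fd), unix.IPPROTO_TCP, unix.TCP_INFO)
    }); ctlErr != nil {
        return 0, 0, ctlErr
    }
    if err != nil {
        return 0, 0, err
    }
    // The kernel reports tcpi_rtt and tcpi_rttvar in microseconds.
    return time.Duration(info.Rtt) * time.Microsecond,
        time.Duration(info.Rttvar) * time.Microsecond, nil
}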

File tree

10 files changed: +191 −23 lines


docs/generated/metrics/metrics.yaml

Lines changed: 38 additions & 0 deletions
@@ -124,6 +124,10 @@ layers:
       description: |
         Sum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC.
 
+        Since this metric is based on gRPC RPCs, it is affected by application-level
+        processing delays and CPU overload effects. See rpc.connection.tcp_rtt for a
+        metric that is obtained from the kernel's TCP stack.
+
         Dividing this Gauge by rpc.connection.healthy gives an approximation of average
         latency, but the top-level round-trip-latency histogram is more useful. Instead,
         users should consult the label families of this metric if they are available
@@ -191,6 +195,40 @@ layers:
       derivative: NON_NEGATIVE_DERIVATIVE
       how_to_use: See Description.
       essential: true
+    - name: rpc.connection.tcp_rtt
+      exported_name: rpc_connection_tcp_rtt
+      description: |
+        Kernel-level TCP round-trip time as measured by the Linux TCP stack.
+
+        This metric reports the smoothed round-trip time (SRTT) as maintained by the
+        kernel's TCP implementation. Unlike application-level RPC latency measurements,
+        this reflects pure network latency and is less affected by CPU overload effects.
+
+        This metric is only available on Linux.
+      y_axis_label: Latency
+      type: GAUGE
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NONE
+      how_to_use: High TCP RTT values indicate network issues outside of CockroachDB that could be impacting the user's workload.
+      essential: true
+    - name: rpc.connection.tcp_rtt_var
+      exported_name: rpc_connection_tcp_rtt_var
+      description: |
+        Kernel-level TCP round-trip time variance as measured by the Linux TCP stack.
+
+        This metric reports the smoothed round-trip time variance (RTTVAR) as maintained
+        by the kernel's TCP implementation. This measures the stability of the
+        connection latency.
+
+        This metric is only available on Linux.
+      y_axis_label: Latency Variance
+      type: GAUGE
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NONE
+      how_to_use: High TCP RTT variance values indicate network stability issues outside of CockroachDB that could be impacting the user's workload.
+      essential: true
     - name: rpc.connection.unhealthy
       exported_name: rpc_connection_unhealthy
       description: Gauge of current connections in an unhealthy state (not bidirectionally connected or heartbeating)

pkg/rpc/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ go_library(
         "//pkg/util/netutil/addr",
         "//pkg/util/stop",
         "//pkg/util/syncutil",
+        "//pkg/util/sysutil",
         "//pkg/util/timeutil",
         "//pkg/util/tracing",
         "//pkg/util/tracing/grpcinterceptor",

pkg/rpc/context.go

Lines changed: 42 additions & 7 deletions
@@ -1389,12 +1389,23 @@ func (rpcCtx *Context) GRPCDialOptions(
         // See the explanation on loopbackDialFn for an explanation about this.
         transport = loopbackTransport
     }
-    return rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport)
+    // In other invocations of grpcDialOptionsInternal, we care about having a
+    // hook into each network dial so we can store the most recent TCP
+    // connection that we've dialed.
+    //
+    // Here, though, we don't currently care about the underlying TCP connection
+    // backing a gRPC channel, so onNetworkDial is a no-op.
+    onNetworkDial := func(conn net.Conn) {}
+    return rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport, onNetworkDial)
 }
 
 // grpcDialOptions produces dial options suitable for connecting to the given target and class.
 func (rpcCtx *Context) grpcDialOptionsInternal(
-    ctx context.Context, target string, class rpcbase.ConnectionClass, transport transportType,
+    ctx context.Context,
+    target string,
+    class rpcbase.ConnectionClass,
+    transport transportType,
+    onNetworkDial onDialFunc,
 ) ([]grpc.DialOption, error) {
     dialOpts, err := rpcCtx.dialOptsCommon(ctx, target, class)
     if err != nil {
@@ -1403,7 +1414,7 @@ func (rpcCtx *Context) grpcDialOptionsInternal(
 
     switch transport {
     case tcpTransport:
-        netOpts, err := rpcCtx.dialOptsNetwork(ctx, target, class)
+        netOpts, err := rpcCtx.dialOptsNetwork(ctx, target, class, onNetworkDial)
         if err != nil {
             return nil, err
         }
@@ -1548,10 +1559,27 @@ func (t *statsTracker) HandleConn(ctx context.Context, s stats.ConnStats) {
     }
 }
 
+type onDialFunc func(conn net.Conn)
+
+func (rpcCtx *Context) dialerWithCallback(
+    dialerFunc dialerFunc, onNetworkDial onDialFunc,
+) dialerFunc {
+    return func(ctx context.Context, addr string) (net.Conn, error) {
+        conn, err := dialerFunc(ctx, addr)
+        if err != nil {
+            return nil, err
+        }
+        if onNetworkDial != nil {
+            onNetworkDial(conn)
+        }
+        return conn, nil
+    }
+}
+
 // dialOptsNetwork compute options used only for over-the-network RPC
 // connections.
 func (rpcCtx *Context) dialOptsNetwork(
-    ctx context.Context, target string, class rpcbase.ConnectionClass,
+    ctx context.Context, target string, class rpcbase.ConnectionClass, onNetworkDial onDialFunc,
 ) ([]grpc.DialOption, error) {
     dialOpts, err := rpcCtx.dialOptsNetworkCredentials()
     if err != nil {
@@ -1638,6 +1666,11 @@ func (rpcCtx *Context) dialOptsNetwork(
         }
         dialerFunc = dialer.dial
     }
+    // Wrap the dial function with the callback that's been passed down so we
+    // have a hook into each network dial from higher up.
+    //
+    // This allows us to keep the peer's tcpConn up to date.
+    dialerFunc = rpcCtx.dialerWithCallback(dialerFunc, onNetworkDial)
     dialOpts = append(dialOpts, grpc.WithContextDialer(dialerFunc))
 
     // Don't retry on dial errors either, otherwise the onlyOnceDialer will get
@@ -1981,14 +2014,15 @@ func (rpcCtx *Context) grpcDialRaw(
     ctx context.Context,
     target string,
     class rpcbase.ConnectionClass,
+    onNetworkDial onDialFunc,
     additionalOpts ...grpc.DialOption,
 ) (*grpc.ClientConn, error) {
     transport := tcpTransport
     if rpcCtx.ContextOptions.AdvertiseAddr == target && !rpcCtx.ClientOnly {
         // See the explanation on loopbackDialFn for an explanation about this.
         transport = loopbackTransport
     }
-    dialOpts, err := rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport)
+    dialOpts, err := rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport, onNetworkDial)
     if err != nil {
         return nil, err
     }
@@ -2189,7 +2223,7 @@ type Dialbacker interface {
     GRPCUnvalidatedDial(string, roachpb.Locality) *GRPCConnection
     GRPCDialNode(string, roachpb.NodeID, roachpb.Locality, rpcbase.ConnectionClass) *GRPCConnection
     grpcDialRaw(
-        context.Context, string, rpcbase.ConnectionClass, ...grpc.DialOption,
+        context.Context, string, rpcbase.ConnectionClass, onDialFunc, ...grpc.DialOption,
     ) (*grpc.ClientConn, error)
     wrapCtx(
         ctx context.Context, target string, remoteNodeID roachpb.NodeID, class rpcbase.ConnectionClass,
@@ -2265,7 +2299,8 @@ func VerifyDialback(
     // A throwaway connection keeps it simple.
     ctx := rpcCtx.wrapCtx(ctx, target, request.OriginNodeID, rpcbase.SystemClass)
     ctx = logtags.AddTag(ctx, "dialback", nil)
-    conn, err := rpcCtx.grpcDialRaw(ctx, target, rpcbase.SystemClass, grpc.WithBlock())
+    onNetworkDial := func(conn net.Conn) {}
+    conn, err := rpcCtx.grpcDialRaw(ctx, target, rpcbase.SystemClass, onNetworkDial, grpc.WithBlock())
     if conn != nil { // NB: the nil check simplifies mocking in TestVerifyDialback
         _ = conn.Close() // nolint:grpcconnclose
     }
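The change above leans on a general pattern worth calling out: gRPC never
exposes the net.Conn backing a channel, so the only hook is the dial function
passed via grpc.WithContextDialer, and dialerWithCallback layers the
observation around it. Below is the same pattern in isolation as a
self-contained sketch; the target address is a placeholder, and grpc.NewClient
assumes a recent grpc-go (dialing is lazy, so the hook fires on Connect or the
first RPC).

package main

import (
    "context"
    "fmt"
    "net"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"
)

func main() {
    // Base dialer: gRPC calls this for every network (re)dial.
    base := func(ctx context.Context, addr string) (net.Conn, error) {
        var d net.Dialer
        return d.DialContext(ctx, "tcp", addr)
    }
    // Wrapped dialer: observe each successfully dialed connection,
    // mirroring dialerWithCallback above.
    withHook := func(ctx context.Context, addr string) (net.Conn, error) {
        conn, err := base(ctx, addr)
        if err != nil {
            return nil, err
        }
        fmt.Printf("dialed %s -> %s\n", conn.LocalAddr(), conn.RemoteAddr())
        return conn, nil
    }
    cc, err := grpc.NewClient("localhost:26257", // placeholder address
        grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithContextDialer(withHook))
    if err != nil {
        panic(err)
    }
    defer cc.Close()
    cc.Connect() // trigger the otherwise-lazy dial
}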

pkg/rpc/context_test.go

Lines changed: 6 additions & 6 deletions
@@ -1690,7 +1690,7 @@ func BenchmarkGRPCDial(b *testing.B) {
 
     b.RunParallel(func(pb *testing.PB) {
         for pb.Next() {
-            _, err := rpcCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass)
+            _, err := rpcCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass, nil /* onNetworkDial */)
             if err != nil {
                 b.Fatal(err)
             }
@@ -2052,8 +2052,8 @@ func TestVerifyDialback(t *testing.T) {
         ctx context.Context, _ string, _ roachpb.NodeID, _ rpcbase.ConnectionClass) context.Context {
         return ctx
     })
-    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any()).
-        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, ...grpc.DialOption) (*grpc.ClientConn, error) {
+    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any() /* onDialFunc */, gomock.Any()).
+        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, onDialFunc, ...grpc.DialOption) (*grpc.ClientConn, error) {
             if dialbackOK {
                 return nil, nil
             }
@@ -2088,8 +2088,8 @@ func TestVerifyDialback(t *testing.T) {
         ctx context.Context, _ string, _ roachpb.NodeID, _ rpcbase.ConnectionClass) context.Context {
         return ctx
     })
-    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any()).
-        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, ...grpc.DialOption) (*grpc.ClientConn, error) {
+    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any() /* onDialFunc */, gomock.Any()).
+        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, onDialFunc, ...grpc.DialOption) (*grpc.ClientConn, error) {
             return nil, nil
         })
     require.NoError(t, VerifyDialback(context.Background(), mockRPCCtx, req, &PingResponse{}, roachpb.Locality{}, sv))
@@ -2297,7 +2297,7 @@ func BenchmarkGRPCPing(b *testing.B) {
 
     cliRPCCtx := newTestContext(uuid.MakeV4(), clock, maxOffset, stopper)
     cliRPCCtx.NodeID.Set(ctx, 2)
-    cc, err := cliRPCCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass)
+    cc, err := cliRPCCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass, nil /* onNetworkDial */)
     require.NoError(b, err)
 
     for _, tc := range []struct {

pkg/rpc/grpc.go

Lines changed: 22 additions & 1 deletion
@@ -7,10 +7,12 @@ package rpc
 
 import (
     "context"
+    "net"
 
     "github.com/cockroachdb/cockroach/pkg/kv/kvpb"
     "github.com/cockroachdb/cockroach/pkg/roachpb"
     "github.com/cockroachdb/cockroach/pkg/rpc/rpcbase"
+    "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/cockroach/pkg/util/stop"
     "google.golang.org/grpc"
     "google.golang.org/grpc/connectivity"
@@ -63,7 +65,26 @@ func newGRPCPeerOptions(
         dial: func(ctx context.Context, target string, class rpcbase.ConnectionClass) (*grpc.ClientConn, error) {
             additionalDialOpts := []grpc.DialOption{grpc.WithStatsHandler(&statsTracker{lm})}
             additionalDialOpts = append(additionalDialOpts, rpcCtx.testingDialOpts...)
-            return rpcCtx.grpcDialRaw(ctx, target, class, additionalDialOpts...)
+            // onNetworkDial is a callback that is called after we dial a TCP connection.
+            // It is not called if we use the loopback dialer.
+            // We define it here because we need access to the peer map.
+            onNetworkDial := func(conn net.Conn) {
+                tcpConn, ok := conn.(*net.TCPConn)
+                if !ok {
+                    return
+                }
+
+                rpcCtx.peers.mu.Lock()
+                defer rpcCtx.peers.mu.Unlock()
+                p := rpcCtx.peers.mu.m[k]
+
+                p.mu.Lock()
+                defer p.mu.Unlock()
+                p.mu.tcpConn = tcpConn
+
+                log.VEventf(ctx, 2, "gRPC network dial: laddr=%v", tcpConn.LocalAddr())
+            }
+            return rpcCtx.grpcDialRaw(ctx, target, class, onNetworkDial, additionalDialOpts...)
         },
         connEquals: func(a, b *grpc.ClientConn) bool {
             return a == b
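The consumer of p.mu.tcpConn (the heartbeat loop that actually samples the
kernel counters at the avg_round_trip_latency cadence) lives in a file not
shown on this page, so the following is only a hedged sketch of that step.
The stand-in types and names below are hypothetical; only the Update(int64)
signature matches CockroachDB's gauge types.

package rpcsketch

import (
    "net"
    "sync"
    "time"
)

// gauge mirrors the Update method of CockroachDB's aggmetric.Gauge.
type gauge interface{ Update(int64) }

// peerSketch is a stand-in for the commit's peer struct: the dial hook above
// writes mu.tcpConn, and the heartbeat loop reads it here.
type peerSketch struct {
    mu struct {
        sync.Mutex
        tcpConn *net.TCPConn
    }
    tcpRTT, tcpRTTVar gauge
}

// sampleTCPStats is a hypothetical heartbeat-cadence sampling step. kernelRTT
// stands in for a TCP_INFO reader like the one sketched under the commit
// message above.
func (p *peerSketch) sampleTCPStats(
    kernelRTT func(*net.TCPConn) (rtt, rttVar time.Duration, err error),
) {
    p.mu.Lock()
    conn := p.mu.tcpConn
    p.mu.Unlock()
    if conn == nil {
        return // nothing dialed yet, or the loopback transport is in use
    }
    rtt, rttVar, err := kernelRTT(conn)
    if err != nil {
        return // e.g. TCP_INFO is unavailable on this platform
    }
    p.tcpRTT.Update(rtt.Nanoseconds())
    p.tcpRTTVar.Update(rttVar.Nanoseconds())
}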

pkg/rpc/metrics.go

Lines changed: 50 additions & 0 deletions
@@ -128,6 +128,10 @@ Decommissioned peers are excluded.
         Unit: metric.Unit_NANOSECONDS,
         Help: `Sum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC.
 
+Since this metric is based on gRPC RPCs, it is affected by application-level
+processing delays and CPU overload effects. See rpc.connection.tcp_rtt for a
+metric that is obtained from the kernel's TCP stack.
+
 Dividing this Gauge by rpc.connection.healthy gives an approximation of average
 latency, but the top-level round-trip-latency histogram is more useful. Instead,
 users should consult the label families of this metric if they are available
@@ -142,6 +146,40 @@ is reset to zero.
         Category: metric.Metadata_NETWORKING,
         HowToUse: `This metric is helpful in understanding general network issues outside of CockroachDB that could be impacting the user’s workload.`,
     }
+
+    metaConnectionTCPRTT = metric.Metadata{
+        Name: "rpc.connection.tcp_rtt",
+        Unit: metric.Unit_NANOSECONDS,
+        Help: `Kernel-level TCP round-trip time as measured by the Linux TCP stack.
+
+This metric reports the smoothed round-trip time (SRTT) as maintained by the
+kernel's TCP implementation. Unlike application-level RPC latency measurements,
+this reflects pure network latency and is less affected by CPU overload effects.
+
+This metric is only available on Linux.
+`,
+        Measurement: "Latency",
+        Essential:   true,
+        Category:    metric.Metadata_NETWORKING,
+        HowToUse:    `High TCP RTT values indicate network issues outside of CockroachDB that could be impacting the user's workload.`,
+    }
+
+    metaConnectionTCPRTTVar = metric.Metadata{
+        Name: "rpc.connection.tcp_rtt_var",
+        Unit: metric.Unit_NANOSECONDS,
+        Help: `Kernel-level TCP round-trip time variance as measured by the Linux TCP stack.
+
+This metric reports the smoothed round-trip time variance (RTTVAR) as maintained
+by the kernel's TCP implementation. This measures the stability of the
+connection latency.
+
+This metric is only available on Linux.
+`,
+        Measurement: "Latency Variance",
+        Essential:   true,
+        Category:    metric.Metadata_NETWORKING,
+        HowToUse:    `High TCP RTT variance values indicate network stability issues outside of CockroachDB that could be impacting the user's workload.`,
+    }
     metaConnectionConnected = metric.Metadata{
         Name: "rpc.connection.connected",
         Help: `Counter of TCP level connected connections.
@@ -226,6 +264,8 @@ func newMetrics(locality roachpb.Locality) *Metrics {
         ConnectionBytesSent:           aggmetric.NewCounter(metaNetworkBytesEgress, localityLabels...),
         ConnectionBytesRecv:           aggmetric.NewCounter(metaNetworkBytesIngress, localityLabels...),
         ConnectionAvgRoundTripLatency: aggmetric.NewGauge(metaConnectionAvgRoundTripLatency, childLabels...),
+        ConnectionTCPRTT:              aggmetric.NewGauge(metaConnectionTCPRTT, childLabels...),
+        ConnectionTCPRTTVar:           aggmetric.NewGauge(metaConnectionTCPRTTVar, childLabels...),
     }
     m.mu.peerMetrics = make(map[string]peerMetrics)
     m.mu.localityMetrics = make(map[string]localityMetrics)
@@ -270,6 +310,8 @@ type Metrics struct {
     ConnectionBytesSent           *aggmetric.AggCounter
     ConnectionBytesRecv           *aggmetric.AggCounter
     ConnectionAvgRoundTripLatency *aggmetric.AggGauge
+    ConnectionTCPRTT              *aggmetric.AggGauge
+    ConnectionTCPRTTVar           *aggmetric.AggGauge
     mu struct {
         syncutil.Mutex
         // peerMetrics is a map of peerKey to peerMetrics.
@@ -318,6 +360,12 @@ type peerMetrics struct {
     // Updated on each successful heartbeat, reset (along with roundTripLatency)
     // after runHeartbeatUntilFailure returns.
    AvgRoundTripLatency *aggmetric.Gauge
+    // TCP-level round trip time as measured by the kernel's TCP stack.
+    // This provides network-level latency without application overhead.
+    TCPRTT *aggmetric.Gauge
+    // TCP-level round trip time variance as measured by the kernel's TCP stack.
+    // This indicates connection stability and jitter.
+    TCPRTTVar *aggmetric.Gauge
     // roundTripLatency is the source for the AvgRoundTripLatency gauge. We don't
     // want to maintain a full histogram per peer, so instead on each heartbeat we
     // update roundTripLatency and flush the result into AvgRoundTripLatency.
@@ -353,6 +401,8 @@ func (m *Metrics) acquire(k peerKey, l roachpb.Locality) (peerMetrics, localityM
         ConnectionHeartbeats: m.ConnectionHeartbeats.AddChild(labelVals...),
         ConnectionFailures:   m.ConnectionFailures.AddChild(labelVals...),
         AvgRoundTripLatency:  m.ConnectionAvgRoundTripLatency.AddChild(labelVals...),
+        TCPRTT:               m.ConnectionTCPRTT.AddChild(labelVals...),
+        TCPRTTVar:            m.ConnectionTCPRTTVar.AddChild(labelVals...),
         // We use a SimpleEWMA which uses the zero value to mean "uninitialized"
         // and operates on a ~60s decay rate.
         roundTripLatency: &ThreadSafeMovingAverage{ma: &ewma.SimpleEWMA{}},

pkg/rpc/metrics_test.go

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ func TestMetricsRelease(t *testing.T) {
         return metricFields
     }
 
-    const expectedCount = 11
+    const expectedCount = 13
     k1 := peerKey{NodeID: 5, TargetAddr: "192.168.0.1:1234", Class: rpcbase.DefaultClass}
     k2 := peerKey{NodeID: 6, TargetAddr: "192.168.0.1:1234", Class: rpcbase.DefaultClass}
     l1 := roachpb.Locality{Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}
