
Commit 3d8c5ca

rpc: add tcp_rtt and tcp_rtt_var metrics for gRPC
Previously, our only metric for gauging network latency was
rpc.connection.avg_round_trip_latency. This metric was calculated by timing
heartbeats in the RPC layer. However, because these measurements are computed
within cockroach, they can be confounded by CPU-heavy workloads. Through
escalations, we've found that elevated network latencies (outside of CRDB's
control) can severely degrade cluster performance. So, being able to directly
and accurately identify these cases would be helpful.

To address this, this patch introduces two new metrics whose values are
computed by Linux. As kernel-computed metrics, these are less sensitive to CPU
overload:

1. rpc.connection.tcp_rtt: TCP smoothed round-trip time
2. rpc.connection.tcp_rtt_var: TCP round-trip time variance

Since these metrics are internally aggregated by Linux, we only need to sample
them periodically. We update them in the heartbeat loop, at the same cadence
as the original avg_round_trip_latency.

To obtain these metrics, we need access to the underlying *net.TCPConn of our
gRPC peer connection. So, the dial function we pass to gRPC has been modified
to update the tcpConn field of the peer struct on each network dial.

Part of: #149959

Release note: None
1 parent dbc05bb commit 3d8c5ca
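For context on what the kernel exposes here: SRTT and RTTVAR are available
per-socket through the TCP_INFO socket option. The following is a minimal
sketch of reading them from a *net.TCPConn on Linux using
golang.org/x/sys/unix. It is illustrative only; the commit wires this through
pkg/util/sysutil, and that helper's exact name and signature are not shown on
this page.

// Illustrative sketch; the real helper lives in pkg/util/sysutil and may differ.
package sysutil

import (
    "net"
    "time"

    "golang.org/x/sys/unix"
)

// KernelTCPRTT reads the kernel's smoothed round-trip time (SRTT) and RTT
// variance (RTTVAR) for conn via the TCP_INFO socket option. Linux-only.
func KernelTCPRTT(conn *net.TCPConn) (rtt, rttVar time.Duration, err error) {
    raw, rawErr := conn.SyscallConn()
    if rawErr != nil {
        return 0, 0, rawErr
    }
    var info *unix.TCPInfo
    if ctlErr := raw.Control(func(fd uintptr) {
        info, err = unix.GetsockoptTCPInfo(int(fd), unix.IPPROTO_TCP, unix.TCP_INFO)
    }); ctlErr != nil {
        return 0, 0, ctlErr
    }
    if err != nil {
        return 0, 0, err
    }
    // The kernel reports tcpi_rtt and tcpi_rttvar in microseconds.
    return time.Duration(info.Rtt) * time.Microsecond,
        time.Duration(info.Rttvar) * time.Microsecond, nil
}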

File tree

10 files changed: +191 −23 lines


docs/generated/metrics/metrics.yaml

Lines changed: 38 additions & 0 deletions
@@ -124,6 +124,10 @@ layers:
       description: |
         Sum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC.
 
+        Since this metric is based on gRPC RPCs, it is affected by application-level
+        processing delays and CPU overload effects. See rpc.connection.tcp_rtt for a
+        metric that is obtained from the kernel's TCP stack.
+
         Dividing this Gauge by rpc.connection.healthy gives an approximation of average
         latency, but the top-level round-trip-latency histogram is more useful. Instead,
         users should consult the label families of this metric if they are available
@@ -191,6 +195,40 @@ layers:
       derivative: NON_NEGATIVE_DERIVATIVE
       how_to_use: See Description.
       essential: true
+    - name: rpc.connection.tcp_rtt
+      exported_name: rpc_connection_tcp_rtt
+      description: |
+        Kernel-level TCP round-trip time as measured by the Linux TCP stack.
+
+        This metric reports the smoothed round-trip time (SRTT) as maintained by the
+        kernel's TCP implementation. Unlike application-level RPC latency measurements,
+        this reflects pure network latency and is less affected by CPU overload effects.
+
+        This metric is only available on Linux.
+      y_axis_label: Latency
+      type: GAUGE
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NONE
+      how_to_use: High TCP RTT values indicate network issues outside of CockroachDB that could be impacting the user's workload.
+      essential: true
+    - name: rpc.connection.tcp_rtt_var
+      exported_name: rpc_connection_tcp_rtt_var
+      description: |
+        Kernel-level TCP round-trip time variance as measured by the Linux TCP stack.
+
+        This metric reports the smoothed round-trip time variance (RTTVAR) as maintained
+        by the kernel's TCP implementation. This measures the stability of the
+        connection latency.
+
+        This metric is only available on Linux.
+      y_axis_label: Latency Variance
+      type: GAUGE
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NONE
+      how_to_use: High TCP RTT variance values indicate network stability issues outside of CockroachDB that could be impacting the user's workload.
+      essential: true
     - name: rpc.connection.unhealthy
       exported_name: rpc_connection_unhealthy
       description: Gauge of current connections in an unhealthy state (not bidirectionally connected or heartbeating)

pkg/rpc/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ go_library(
         "//pkg/util/netutil/addr",
         "//pkg/util/stop",
         "//pkg/util/syncutil",
+        "//pkg/util/sysutil",
         "//pkg/util/timeutil",
         "//pkg/util/tracing",
         "//pkg/util/tracing/grpcinterceptor",

pkg/rpc/context.go

Lines changed: 42 additions & 7 deletions
@@ -1389,12 +1389,23 @@ func (rpcCtx *Context) GRPCDialOptions(
         // See the explanation on loopbackDialFn for an explanation about this.
         transport = loopbackTransport
     }
-    return rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport)
+    // In other invocations of grpcDialOptionsInternal, we care about having a
+    // hook into each network dial so we can store the most recent TCP
+    // connection that we've dialed.
+    //
+    // Here, though, we don't currently care about the underlying TCP connection
+    // backing a gRPC channel, so onNetworkDial is a no-op.
+    onNetworkDial := func(conn net.Conn) {}
+    return rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport, onNetworkDial)
 }
 
 // grpcDialOptions produces dial options suitable for connecting to the given target and class.
 func (rpcCtx *Context) grpcDialOptionsInternal(
-    ctx context.Context, target string, class rpcbase.ConnectionClass, transport transportType,
+    ctx context.Context,
+    target string,
+    class rpcbase.ConnectionClass,
+    transport transportType,
+    onNetworkDial onDialFunc,
 ) ([]grpc.DialOption, error) {
     dialOpts, err := rpcCtx.dialOptsCommon(ctx, target, class)
     if err != nil {
@@ -1403,7 +1414,7 @@ func (rpcCtx *Context) grpcDialOptionsInternal(
 
     switch transport {
     case tcpTransport:
-        netOpts, err := rpcCtx.dialOptsNetwork(ctx, target, class)
+        netOpts, err := rpcCtx.dialOptsNetwork(ctx, target, class, onNetworkDial)
         if err != nil {
             return nil, err
         }
@@ -1548,10 +1559,27 @@ func (t *statsTracker) HandleConn(ctx context.Context, s stats.ConnStats) {
     }
 }
 
+type onDialFunc func(conn net.Conn)
+
+func (rpcCtx *Context) dialerWithCallback(
+    dialerFunc dialerFunc, onNetworkDial onDialFunc,
+) dialerFunc {
+    return func(ctx context.Context, addr string) (net.Conn, error) {
+        conn, err := dialerFunc(ctx, addr)
+        if err != nil {
+            return nil, err
+        }
+        if onNetworkDial != nil {
+            onNetworkDial(conn)
+        }
+        return conn, nil
+    }
+}
+
 // dialOptsNetwork compute options used only for over-the-network RPC
 // connections.
 func (rpcCtx *Context) dialOptsNetwork(
-    ctx context.Context, target string, class rpcbase.ConnectionClass,
+    ctx context.Context, target string, class rpcbase.ConnectionClass, onNetworkDial onDialFunc,
 ) ([]grpc.DialOption, error) {
     dialOpts, err := rpcCtx.dialOptsNetworkCredentials()
     if err != nil {
@@ -1638,6 +1666,11 @@ func (rpcCtx *Context) dialOptsNetwork(
         }
         dialerFunc = dialer.dial
     }
+    // Wrap the dial function with the callback that's been passed down so we
+    // have a hook into each network dial from higher up.
+    //
+    // This allows us to keep the peer's tcpConn up to date.
+    dialerFunc = rpcCtx.dialerWithCallback(dialerFunc, onNetworkDial)
     dialOpts = append(dialOpts, grpc.WithContextDialer(dialerFunc))
 
     // Don't retry on dial errors either, otherwise the onlyOnceDialer will get
@@ -1981,14 +2014,15 @@ func (rpcCtx *Context) grpcDialRaw(
     ctx context.Context,
     target string,
     class rpcbase.ConnectionClass,
+    onNetworkDial onDialFunc,
     additionalOpts ...grpc.DialOption,
 ) (*grpc.ClientConn, error) {
     transport := tcpTransport
     if rpcCtx.ContextOptions.AdvertiseAddr == target && !rpcCtx.ClientOnly {
         // See the explanation on loopbackDialFn for an explanation about this.
         transport = loopbackTransport
     }
-    dialOpts, err := rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport)
+    dialOpts, err := rpcCtx.grpcDialOptionsInternal(ctx, target, class, transport, onNetworkDial)
     if err != nil {
         return nil, err
     }
@@ -2189,7 +2223,7 @@ type Dialbacker interface {
     GRPCUnvalidatedDial(string, roachpb.Locality) *GRPCConnection
     GRPCDialNode(string, roachpb.NodeID, roachpb.Locality, rpcbase.ConnectionClass) *GRPCConnection
     grpcDialRaw(
-        context.Context, string, rpcbase.ConnectionClass, ...grpc.DialOption,
+        context.Context, string, rpcbase.ConnectionClass, onDialFunc, ...grpc.DialOption,
     ) (*grpc.ClientConn, error)
     wrapCtx(
         ctx context.Context, target string, remoteNodeID roachpb.NodeID, class rpcbase.ConnectionClass,
@@ -2265,7 +2299,8 @@ func VerifyDialback(
     // A throwaway connection keeps it simple.
     ctx := rpcCtx.wrapCtx(ctx, target, request.OriginNodeID, rpcbase.SystemClass)
     ctx = logtags.AddTag(ctx, "dialback", nil)
-    conn, err := rpcCtx.grpcDialRaw(ctx, target, rpcbase.SystemClass, grpc.WithBlock())
+    onNetworkDial := func(conn net.Conn) {}
+    conn, err := rpcCtx.grpcDialRaw(ctx, target, rpcbase.SystemClass, onNetworkDial, grpc.WithBlock())
     if conn != nil { // NB: the nil check simplifies mocking in TestVerifyDialback
         _ = conn.Close() // nolint:grpcconnclose
     }
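The change above leans on a general pattern worth calling out: gRPC never
exposes the net.Conn backing a channel, so the only hook is the dial function
passed via grpc.WithContextDialer, and dialerWithCallback layers the
observation around it. Below is the same pattern in isolation as a
self-contained sketch; the target address is a placeholder, and grpc.NewClient
assumes a recent grpc-go (dialing is lazy, so the hook fires on Connect or the
first RPC).

package main

import (
    "context"
    "fmt"
    "net"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"
)

func main() {
    // Base dialer: gRPC calls this for every network (re)dial.
    base := func(ctx context.Context, addr string) (net.Conn, error) {
        var d net.Dialer
        return d.DialContext(ctx, "tcp", addr)
    }
    // Wrapped dialer: observe each successfully dialed connection,
    // mirroring dialerWithCallback above.
    withHook := func(ctx context.Context, addr string) (net.Conn, error) {
        conn, err := base(ctx, addr)
        if err != nil {
            return nil, err
        }
        fmt.Printf("dialed %s -> %s\n", conn.LocalAddr(), conn.RemoteAddr())
        return conn, nil
    }
    cc, err := grpc.NewClient("localhost:26257", // placeholder address
        grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithContextDialer(withHook))
    if err != nil {
        panic(err)
    }
    defer cc.Close()
    cc.Connect() // trigger the otherwise-lazy dial
}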

pkg/rpc/context_test.go

Lines changed: 6 additions & 6 deletions
@@ -1690,7 +1690,7 @@ func BenchmarkGRPCDial(b *testing.B) {
 
     b.RunParallel(func(pb *testing.PB) {
         for pb.Next() {
-            _, err := rpcCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass)
+            _, err := rpcCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass, nil /* onNetworkDial */)
             if err != nil {
                 b.Fatal(err)
             }
@@ -2052,8 +2052,8 @@ func TestVerifyDialback(t *testing.T) {
         ctx context.Context, _ string, _ roachpb.NodeID, _ rpcbase.ConnectionClass) context.Context {
         return ctx
     })
-    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any()).
-        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, ...grpc.DialOption) (*grpc.ClientConn, error) {
+    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any() /* onDialFunc */, gomock.Any()).
+        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, onDialFunc, ...grpc.DialOption) (*grpc.ClientConn, error) {
             if dialbackOK {
                 return nil, nil
             }
@@ -2088,8 +2088,8 @@ func TestVerifyDialback(t *testing.T) {
         ctx context.Context, _ string, _ roachpb.NodeID, _ rpcbase.ConnectionClass) context.Context {
         return ctx
     })
-    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any()).
-        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, ...grpc.DialOption) (*grpc.ClientConn, error) {
+    mockRPCCtx.EXPECT().grpcDialRaw(gomock.Any() /* ctx */, "1.1.1.1", rpcbase.SystemClass, gomock.Any() /* onDialFunc */, gomock.Any()).
+        DoAndReturn(func(context.Context, string, rpcbase.ConnectionClass, onDialFunc, ...grpc.DialOption) (*grpc.ClientConn, error) {
             return nil, nil
         })
     require.NoError(t, VerifyDialback(context.Background(), mockRPCCtx, req, &PingResponse{}, roachpb.Locality{}, sv))
@@ -2297,7 +2297,7 @@ func BenchmarkGRPCPing(b *testing.B) {
 
     cliRPCCtx := newTestContext(uuid.MakeV4(), clock, maxOffset, stopper)
     cliRPCCtx.NodeID.Set(ctx, 2)
-    cc, err := cliRPCCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass)
+    cc, err := cliRPCCtx.grpcDialRaw(ctx, remoteAddr, rpcbase.DefaultClass, nil /* onNetworkDial */)
     require.NoError(b, err)
 
     for _, tc := range []struct {

pkg/rpc/grpc.go

Lines changed: 22 additions & 1 deletion
@@ -7,10 +7,12 @@ package rpc
 
 import (
     "context"
+    "net"
 
     "github.com/cockroachdb/cockroach/pkg/kv/kvpb"
     "github.com/cockroachdb/cockroach/pkg/roachpb"
     "github.com/cockroachdb/cockroach/pkg/rpc/rpcbase"
+    "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/cockroach/pkg/util/stop"
     "google.golang.org/grpc"
     "google.golang.org/grpc/connectivity"
@@ -63,7 +65,26 @@ func newGRPCPeerOptions(
         dial: func(ctx context.Context, target string, class rpcbase.ConnectionClass) (*grpc.ClientConn, error) {
             additionalDialOpts := []grpc.DialOption{grpc.WithStatsHandler(&statsTracker{lm})}
             additionalDialOpts = append(additionalDialOpts, rpcCtx.testingDialOpts...)
-            return rpcCtx.grpcDialRaw(ctx, target, class, additionalDialOpts...)
+            // onNetworkDial is a callback that is called after we dial a TCP connection.
+            // It is not called if we use the loopback dialer.
+            // We define it here because we need access to the peer map.
+            onNetworkDial := func(conn net.Conn) {
+                tcpConn, ok := conn.(*net.TCPConn)
+                if !ok {
+                    return
+                }
+
+                rpcCtx.peers.mu.Lock()
+                defer rpcCtx.peers.mu.Unlock()
+                p := rpcCtx.peers.mu.m[k]
+
+                p.mu.Lock()
+                defer p.mu.Unlock()
+                p.mu.tcpConn = tcpConn
+
+                log.VEventf(ctx, 2, "gRPC network dial: laddr=%v", tcpConn.LocalAddr())
+            }
+            return rpcCtx.grpcDialRaw(ctx, target, class, onNetworkDial, additionalDialOpts...)
         },
         connEquals: func(a, b *grpc.ClientConn) bool {
             return a == b
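The consumer of p.mu.tcpConn (the heartbeat loop that actually samples the
kernel counters at the avg_round_trip_latency cadence) lives in a file not
shown on this page, so the following is only a hedged sketch of that step.
The stand-in types and names below are hypothetical; only the Update(int64)
signature matches CockroachDB's gauge types.

package rpcsketch

import (
    "net"
    "sync"
    "time"
)

// gauge mirrors the Update method of CockroachDB's aggmetric.Gauge.
type gauge interface{ Update(int64) }

// peerSketch is a stand-in for the commit's peer struct: the dial hook above
// writes mu.tcpConn, and the heartbeat loop reads it here.
type peerSketch struct {
    mu struct {
        sync.Mutex
        tcpConn *net.TCPConn
    }
    tcpRTT, tcpRTTVar gauge
}

// sampleTCPStats is a hypothetical heartbeat-cadence sampling step. kernelRTT
// stands in for a TCP_INFO reader like the one sketched under the commit
// message above.
func (p *peerSketch) sampleTCPStats(
    kernelRTT func(*net.TCPConn) (rtt, rttVar time.Duration, err error),
) {
    p.mu.Lock()
    conn := p.mu.tcpConn
    p.mu.Unlock()
    if conn == nil {
        return // nothing dialed yet, or the loopback transport is in use
    }
    rtt, rttVar, err := kernelRTT(conn)
    if err != nil {
        return // e.g. TCP_INFO is unavailable on this platform
    }
    p.tcpRTT.Update(rtt.Nanoseconds())
    p.tcpRTTVar.Update(rttVar.Nanoseconds())
}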

pkg/rpc/metrics.go

Lines changed: 50 additions & 0 deletions
@@ -128,6 +128,10 @@ Decommissioned peers are excluded.
         Unit: metric.Unit_NANOSECONDS,
         Help: `Sum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC.
 
+Since this metric is based on gRPC RPCs, it is affected by application-level
+processing delays and CPU overload effects. See rpc.connection.tcp_rtt for a
+metric that is obtained from the kernel's TCP stack.
+
 Dividing this Gauge by rpc.connection.healthy gives an approximation of average
 latency, but the top-level round-trip-latency histogram is more useful. Instead,
 users should consult the label families of this metric if they are available
@@ -142,6 +146,40 @@ is reset to zero.
         Category: metric.Metadata_NETWORKING,
         HowToUse: `This metric is helpful in understanding general network issues outside of CockroachDB that could be impacting the user’s workload.`,
     }
+
+    metaConnectionTCPRTT = metric.Metadata{
+        Name: "rpc.connection.tcp_rtt",
+        Unit: metric.Unit_NANOSECONDS,
+        Help: `Kernel-level TCP round-trip time as measured by the Linux TCP stack.
+
+This metric reports the smoothed round-trip time (SRTT) as maintained by the
+kernel's TCP implementation. Unlike application-level RPC latency measurements,
+this reflects pure network latency and is less affected by CPU overload effects.
+
+This metric is only available on Linux.
+`,
+        Measurement: "Latency",
+        Essential:   true,
+        Category:    metric.Metadata_NETWORKING,
+        HowToUse:    `High TCP RTT values indicate network issues outside of CockroachDB that could be impacting the user's workload.`,
+    }
+
+    metaConnectionTCPRTTVar = metric.Metadata{
+        Name: "rpc.connection.tcp_rtt_var",
+        Unit: metric.Unit_NANOSECONDS,
+        Help: `Kernel-level TCP round-trip time variance as measured by the Linux TCP stack.
+
+This metric reports the smoothed round-trip time variance (RTTVAR) as maintained
+by the kernel's TCP implementation. This measures the stability of the
+connection latency.
+
+This metric is only available on Linux.
+`,
+        Measurement: "Latency Variance",
+        Essential:   true,
+        Category:    metric.Metadata_NETWORKING,
+        HowToUse:    `High TCP RTT variance values indicate network stability issues outside of CockroachDB that could be impacting the user's workload.`,
+    }
     metaConnectionConnected = metric.Metadata{
         Name: "rpc.connection.connected",
         Help: `Counter of TCP level connected connections.
@@ -226,6 +264,8 @@ func newMetrics(locality roachpb.Locality) *Metrics {
         ConnectionBytesSent:           aggmetric.NewCounter(metaNetworkBytesEgress, localityLabels...),
         ConnectionBytesRecv:           aggmetric.NewCounter(metaNetworkBytesIngress, localityLabels...),
         ConnectionAvgRoundTripLatency: aggmetric.NewGauge(metaConnectionAvgRoundTripLatency, childLabels...),
+        ConnectionTCPRTT:              aggmetric.NewGauge(metaConnectionTCPRTT, childLabels...),
+        ConnectionTCPRTTVar:           aggmetric.NewGauge(metaConnectionTCPRTTVar, childLabels...),
     }
     m.mu.peerMetrics = make(map[string]peerMetrics)
     m.mu.localityMetrics = make(map[string]localityMetrics)
@@ -270,6 +310,8 @@ type Metrics struct {
     ConnectionBytesSent           *aggmetric.AggCounter
     ConnectionBytesRecv           *aggmetric.AggCounter
     ConnectionAvgRoundTripLatency *aggmetric.AggGauge
+    ConnectionTCPRTT              *aggmetric.AggGauge
+    ConnectionTCPRTTVar           *aggmetric.AggGauge
     mu struct {
         syncutil.Mutex
         // peerMetrics is a map of peerKey to peerMetrics.
@@ -318,6 +360,12 @@ type peerMetrics struct {
     // Updated on each successful heartbeat, reset (along with roundTripLatency)
     // after runHeartbeatUntilFailure returns.
    AvgRoundTripLatency *aggmetric.Gauge
+    // TCP-level round trip time as measured by the kernel's TCP stack.
+    // This provides network-level latency without application overhead.
+    TCPRTT *aggmetric.Gauge
+    // TCP-level round trip time variance as measured by the kernel's TCP stack.
+    // This indicates connection stability and jitter.
+    TCPRTTVar *aggmetric.Gauge
     // roundTripLatency is the source for the AvgRoundTripLatency gauge. We don't
     // want to maintain a full histogram per peer, so instead on each heartbeat we
     // update roundTripLatency and flush the result into AvgRoundTripLatency.
@@ -353,6 +401,8 @@ func (m *Metrics) acquire(k peerKey, l roachpb.Locality) (peerMetrics, localityM
         ConnectionHeartbeats: m.ConnectionHeartbeats.AddChild(labelVals...),
         ConnectionFailures:   m.ConnectionFailures.AddChild(labelVals...),
         AvgRoundTripLatency:  m.ConnectionAvgRoundTripLatency.AddChild(labelVals...),
+        TCPRTT:               m.ConnectionTCPRTT.AddChild(labelVals...),
+        TCPRTTVar:            m.ConnectionTCPRTTVar.AddChild(labelVals...),
         // We use a SimpleEWMA which uses the zero value to mean "uninitialized"
         // and operates on a ~60s decay rate.
         roundTripLatency: &ThreadSafeMovingAverage{ma: &ewma.SimpleEWMA{}},

pkg/rpc/metrics_test.go

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ func TestMetricsRelease(t *testing.T) {
         return metricFields
     }
 
-    const expectedCount = 11
+    const expectedCount = 13
     k1 := peerKey{NodeID: 5, TargetAddr: "192.168.0.1:1234", Class: rpcbase.DefaultClass}
     k2 := peerKey{NodeID: 6, TargetAddr: "192.168.0.1:1234", Class: rpcbase.DefaultClass}
     l1 := roachpb.Locality{Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}
