Skip to content

Commit 545822f

Browse files
authored
backend, metrics: add metrics for traffic and handshake (#477)
1 parent 3d80091 commit 545822f

File tree

9 files changed

+805
-35
lines changed

9 files changed

+805
-35
lines changed

pkg/metrics/grafana/tiproxy_summary.json

Lines changed: 472 additions & 8 deletions
Large diffs are not rendered by default.

pkg/metrics/grafana/tiproxy_summary.jsonnet

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,32 @@ local cpsByCMDP = graphPanel.new(
306306
)
307307
);
308308

309+
local hsDurP= graphPanel.new(
310+
title='Handshake Duration',
311+
datasource=myDS,
312+
legend_rightSide=true,
313+
description='TiProxy handshake durations by different percents.',
314+
format='s',
315+
)
316+
.addTarget(
317+
prometheus.target(
318+
'histogram_quantile(0.99, sum(rate(tiproxy_session_handshake_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) by (le))',
319+
legendFormat='99',
320+
)
321+
)
322+
.addTarget(
323+
prometheus.target(
324+
'histogram_quantile(0.95, sum(rate(tiproxy_session_handshake_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) by (le))',
325+
legendFormat='95',
326+
)
327+
)
328+
.addTarget(
329+
prometheus.target(
330+
'sum(rate(tiproxy_session_handshake_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_session_handshake_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
331+
legendFormat='avg',
332+
)
333+
);
334+
309335
// Balance Summary
310336
local balanceRow = row.new(collapse=true, title='Balance');
311337
local bConnP = graphPanel.new(
@@ -419,6 +445,64 @@ graphPanel.new(
419445
)
420446
);
421447

448+
// Traffic row and its panels
449+
local trafficRow = row.new(collapse=true, title='Traffic');
450+
local inBytesP = graphPanel.new(
451+
title='Bytes/Second from Backends',
452+
datasource=myDS,
453+
legend_rightSide=true,
454+
description='Bytes per second from backends to TiProxy.',
455+
format='short',
456+
)
457+
.addTarget(
458+
prometheus.target(
459+
'label_replace(sum(rate(tiproxy_traffic_inbound_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (instance, backend), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
460+
legendFormat='{{backend}} => {{instance}}',
461+
)
462+
);
463+
464+
local inPacketsP = graphPanel.new(
465+
title='Packets/Second from Backends',
466+
datasource=myDS,
467+
legend_rightSide=true,
468+
description='MySQL packets per second from backends to TiProxy.',
469+
format='short',
470+
)
471+
.addTarget(
472+
prometheus.target(
473+
'label_replace(sum(rate(tiproxy_traffic_inbound_packets{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (instance, backend), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
474+
legendFormat='{{backend}} => {{instance}}',
475+
)
476+
);
477+
478+
local outBytesP = graphPanel.new(
479+
title='Bytes/Second to Backends',
480+
datasource=myDS,
481+
legend_rightSide=true,
482+
description='Bytes per second from TiProxy to backends.',
483+
format='short',
484+
)
485+
.addTarget(
486+
prometheus.target(
487+
'label_replace(sum(rate(tiproxy_traffic_outbound_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (instance, backend), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
488+
legendFormat='{{instance}} => {{backend}}',
489+
)
490+
);
491+
492+
local outPacketsP = graphPanel.new(
493+
title='Packets/Second to Backends',
494+
datasource=myDS,
495+
legend_rightSide=true,
496+
description='Packets per second from TiProxy to backends.',
497+
format='short',
498+
)
499+
.addTarget(
500+
prometheus.target(
501+
'label_replace(sum(rate(tiproxy_traffic_outbound_packets{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (instance, backend), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
502+
legendFormat='{{instance}} => {{backend}}',
503+
)
504+
);
505+
422506
// Merge together.
423507
local panelW = 12;
424508
local panelH = 6;
@@ -450,6 +534,7 @@ newDash
450534
.addPanel(cpsByInstP, gridPos=rightPanelPos)
451535
.addPanel(cpsByBackP, gridPos=leftPanelPos)
452536
.addPanel(cpsByCMDP, gridPos=rightPanelPos)
537+
.addPanel(hsDurP, gridPos=leftPanelPos)
453538
,
454539
gridPos=rowPos
455540
)
@@ -469,3 +554,12 @@ newDash
469554
,
470555
gridPos=rowPos
471556
)
557+
.addPanel(
558+
trafficRow
559+
.addPanel(inBytesP, gridPos=leftPanelPos)
560+
.addPanel(inPacketsP, gridPos=rightPanelPos)
561+
.addPanel(outBytesP, gridPos=leftPanelPos)
562+
.addPanel(outPacketsP, gridPos=rightPanelPos)
563+
,
564+
gridPos=rowPos
565+
)

pkg/metrics/metrics.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ const (
3131
LabelSession = "session"
3232
LabelMonitor = "monitor"
3333
LabelBackend = "backend"
34+
LabelTraffic = "traffic"
3435
)
3536

3637
// MetricsManager manages metrics.
@@ -98,6 +99,7 @@ func registerProxyMetrics() {
9899
prometheus.MustRegister(KeepAliveCounter)
99100
prometheus.MustRegister(QueryTotalCounter)
100101
prometheus.MustRegister(QueryDurationHistogram)
102+
prometheus.MustRegister(HandshakeDurationHistogram)
101103
prometheus.MustRegister(BackendStatusGauge)
102104
prometheus.MustRegister(GetBackendHistogram)
103105
prometheus.MustRegister(GetBackendCounter)
@@ -106,6 +108,10 @@ func registerProxyMetrics() {
106108
prometheus.MustRegister(HealthCheckCycleGauge)
107109
prometheus.MustRegister(MigrateCounter)
108110
prometheus.MustRegister(MigrateDurationHistogram)
111+
prometheus.MustRegister(InboundBytesCounter)
112+
prometheus.MustRegister(InboundPacketsCounter)
113+
prometheus.MustRegister(OutboundBytesCounter)
114+
prometheus.MustRegister(OutboundPacketsCounter)
109115
}
110116

111117
// ReadCounter reads the value from the counter. It is only used for testing.

pkg/metrics/session.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,13 @@ var (
3030
Help: "Bucketed histogram of processing time (s) of handled queries.",
3131
Buckets: prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
3232
}, []string{LblBackend, LblCmdType})
33+
34+
HandshakeDurationHistogram = prometheus.NewHistogramVec(
35+
prometheus.HistogramOpts{
36+
Namespace: ModuleProxy,
37+
Subsystem: LabelSession,
38+
Name: "handshake_duration_seconds",
39+
Help: "Bucketed histogram of processing time (s) of handshakes.",
40+
Buckets: prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
41+
}, []string{LblBackend})
3342
)

pkg/metrics/traffic.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package metrics
5+
6+
import "github.com/prometheus/client_golang/prometheus"
7+
8+
var (
9+
InboundBytesCounter = prometheus.NewCounterVec(
10+
prometheus.CounterOpts{
11+
Namespace: ModuleProxy,
12+
Subsystem: LabelTraffic,
13+
Name: "inbound_bytes",
14+
Help: "Counter of bytes from backends.",
15+
}, []string{LblBackend})
16+
17+
InboundPacketsCounter = prometheus.NewCounterVec(
18+
prometheus.CounterOpts{
19+
Namespace: ModuleProxy,
20+
Subsystem: LabelTraffic,
21+
Name: "inbound_packets",
22+
Help: "Counter of packets from backends.",
23+
}, []string{LblBackend})
24+
25+
OutboundBytesCounter = prometheus.NewCounterVec(
26+
prometheus.CounterOpts{
27+
Namespace: ModuleProxy,
28+
Subsystem: LabelTraffic,
29+
Name: "outbound_bytes",
30+
Help: "Counter of bytes to backends.",
31+
}, []string{LblBackend})
32+
33+
OutboundPacketsCounter = prometheus.NewCounterVec(
34+
prometheus.CounterOpts{
35+
Namespace: ModuleProxy,
36+
Subsystem: LabelTraffic,
37+
Name: "outbound_packets",
38+
Help: "Counter of packets to backends.",
39+
}, []string{LblBackend})
40+
)

pkg/proxy/backend/backend_conn_mgr.go

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"encoding/binary"
1010
"encoding/json"
1111
"fmt"
12+
1213
"net"
1314
"os"
1415
"strings"
@@ -126,6 +127,8 @@ type BackendConnManager struct {
126127
closeStatus atomic.Int32
127128
// The last time when the backend is active.
128129
lastActiveTime monotime.Time
130+
// The traffic recorded last time.
131+
inBytes, inPackets, outBytes, outPackets uint64
129132
// cancelFunc is used to cancel the signal processing goroutine.
130133
cancelFunc context.CancelFunc
131134
clientIO *pnet.PacketIO
@@ -179,6 +182,7 @@ func (mgr *BackendConnManager) Connect(ctx context.Context, clientIO *pnet.Packe
179182
mgr.quitSource = SrcProxyQuit
180183
return errors.New("graceful shutdown before connecting")
181184
}
185+
startTime := monotime.Now()
182186
err := mgr.authenticator.handshakeFirstTime(ctx, mgr.logger.Named("authenticator"), mgr, clientIO, mgr.handshakeHandler, mgr.getBackendIO, frontendTLSConfig, backendTLSConfig)
183187
if err != nil {
184188
src := Error2Source(err)
@@ -191,11 +195,14 @@ func (mgr *BackendConnManager) Connect(ctx context.Context, clientIO *pnet.Packe
191195
return err
192196
}
193197
mgr.handshakeHandler.OnHandshake(mgr, mgr.ServerAddr(), nil, SrcNone)
198+
endTime := monotime.Now()
199+
addHandshakeMetrics(mgr.ServerAddr(), time.Duration(endTime-startTime))
200+
mgr.updateTraffic(mgr.backendIO.Load())
194201

195202
mgr.cmdProcessor.capability = mgr.authenticator.capability
196203
childCtx, cancelFunc := context.WithCancel(ctx)
197204
mgr.cancelFunc = cancelFunc
198-
mgr.lastActiveTime = monotime.Now()
205+
mgr.lastActiveTime = endTime
199206
mgr.wg.Run(func() {
200207
mgr.processSignals(childCtx)
201208
})
@@ -290,9 +297,11 @@ func (mgr *BackendConnManager) ExecuteCmd(ctx context.Context, request []byte) (
290297
}
291298
waitingRedirect := mgr.redirectInfo.Load() != nil
292299
var holdRequest bool
293-
holdRequest, err = mgr.cmdProcessor.executeCmd(request, mgr.clientIO, mgr.backendIO.Load(), waitingRedirect)
300+
backendIO := mgr.backendIO.Load()
301+
holdRequest, err = mgr.cmdProcessor.executeCmd(request, mgr.clientIO, backendIO, waitingRedirect)
294302
if !holdRequest {
295-
addCmdMetrics(cmd, mgr.ServerAddr(), startTime)
303+
addCmdMetrics(cmd, backendIO.RemoteAddr().String(), startTime)
304+
mgr.updateTraffic(backendIO)
296305
}
297306
if err != nil {
298307
if !pnet.IsMySQLError(err) {
@@ -326,25 +335,33 @@ func (mgr *BackendConnManager) ExecuteCmd(ctx context.Context, request []byte) (
326335
}
327336
// Even if it meets an MySQL error, it may have changed the status, such as when executing multi-statements.
328337
if mgr.cmdProcessor.finishedTxn() {
329-
if waitingRedirect && holdRequest {
330-
mgr.tryRedirect(ctx)
331-
// Execute the held request no matter redirection succeeds or not.
332-
_, err = mgr.cmdProcessor.executeCmd(request, mgr.clientIO, mgr.backendIO.Load(), false)
333-
addCmdMetrics(cmd, mgr.ServerAddr(), startTime)
334-
if err != nil && !pnet.IsMySQLError(err) {
335-
return
336-
}
337-
} else if mgr.closeStatus.Load() == statusNotifyClose {
338+
if mgr.closeStatus.Load() == statusNotifyClose {
338339
mgr.tryGracefulClose(ctx)
339340
} else if waitingRedirect {
340341
mgr.tryRedirect(ctx)
341342
}
342343
}
344+
// Execute the held request no matter redirection succeeds or not.
345+
if holdRequest && mgr.closeStatus.Load() < statusNotifyClose {
346+
backendIO = mgr.backendIO.Load()
347+
_, err = mgr.cmdProcessor.executeCmd(request, mgr.clientIO, backendIO, false)
348+
addCmdMetrics(cmd, backendIO.RemoteAddr().String(), startTime)
349+
mgr.updateTraffic(backendIO)
350+
if err != nil && !pnet.IsMySQLError(err) {
351+
return
352+
}
353+
}
343354
// Ignore MySQL errors, only return unexpected errors.
344355
err = nil
345356
return
346357
}
347358

359+
func (mgr *BackendConnManager) updateTraffic(backendIO *pnet.PacketIO) {
360+
inBytes, inPackets, outBytes, outPackets := backendIO.InBytes(), backendIO.InPackets(), backendIO.OutBytes(), backendIO.OutPackets()
361+
addTraffic(backendIO.RemoteAddr().String(), inBytes-mgr.inBytes, inPackets-mgr.inPackets, outBytes-mgr.outBytes, outPackets-mgr.outPackets)
362+
mgr.inBytes, mgr.inPackets, mgr.outBytes, mgr.outPackets = inBytes, inPackets, outBytes, outPackets
363+
}
364+
348365
// SetEventReceiver implements RedirectableConn.SetEventReceiver interface.
349366
// The receiver sends redirection signals and watches redirecting events.
350367
func (mgr *BackendConnManager) SetEventReceiver(receiver router.ConnEventReceiver) {
@@ -484,6 +501,9 @@ func (mgr *BackendConnManager) tryRedirect(ctx context.Context) {
484501
}
485502
return
486503
}
504+
mgr.updateTraffic(backendIO)
505+
mgr.inBytes, mgr.inPackets, mgr.outBytes, mgr.outPackets = 0, 0, 0, 0
506+
mgr.updateTraffic(newBackendIO)
487507
if ignoredErr := backendIO.Close(); ignoredErr != nil && !pnet.IsDisconnectError(ignoredErr) {
488508
mgr.logger.Error("close previous backend connection failed", zap.Error(ignoredErr))
489509
}

pkg/proxy/backend/backend_conn_mgr_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,6 +1226,77 @@ func TestCloseWhileGracefulClose(t *testing.T) {
12261226
ts.runTests(runners)
12271227
}
12281228

1229+
func TestTrafficMetrics(t *testing.T) {
1230+
ts := newBackendMgrTester(t)
1231+
var inBytes, inPackets, outBytes, outPackets int
1232+
runners := []runner{
1233+
// 1st handshake
1234+
{
1235+
client: ts.mc.authenticate,
1236+
proxy: ts.firstHandshake4Proxy,
1237+
backend: ts.handshake4Backend,
1238+
},
1239+
// receive at least 1000 packets
1240+
{
1241+
client: func(packetIO *pnet.PacketIO) error {
1242+
ts.mc.sql = "select * from t"
1243+
return ts.mc.request(packetIO)
1244+
},
1245+
proxy: func(clientIO, backendIO *pnet.PacketIO) error {
1246+
addr := ts.tc.backendListener.Addr().String()
1247+
var err error
1248+
inBytes, inPackets, outBytes, outPackets, err = readTraffic(addr)
1249+
require.NoError(t, err)
1250+
require.True(t, inBytes > 0 && inPackets > 0 && outBytes > 0 && outPackets > 0)
1251+
require.NoError(t, ts.forwardCmd4Proxy(clientIO, backendIO))
1252+
inBytes2, inPackets2, outBytes2, outPackets2, err := readTraffic(addr)
1253+
require.NoError(t, err)
1254+
require.True(t, inBytes2 > inBytes && inPackets2 > inPackets && outBytes2 > outBytes && outPackets2 > outPackets)
1255+
require.True(t, inBytes2 > 4096 && inPackets2 > 1000)
1256+
inBytes, inPackets, outBytes, outPackets = inBytes2, inPackets2, outBytes2, outPackets2
1257+
return nil
1258+
},
1259+
backend: func(packetIO *pnet.PacketIO) error {
1260+
ts.mb.respondType = responseTypeResultSet
1261+
ts.mb.columns = 1
1262+
ts.mb.rows = 1000
1263+
return ts.mb.respond(packetIO)
1264+
},
1265+
},
1266+
// 2nd handshake: redirect
1267+
{
1268+
client: nil,
1269+
proxy: ts.redirectSucceed4Proxy,
1270+
backend: ts.redirectSucceed4Backend,
1271+
},
1272+
// the traffic should still increase after redirection
1273+
{
1274+
client: func(packetIO *pnet.PacketIO) error {
1275+
ts.mc.sql = "select 1"
1276+
return ts.mc.request(packetIO)
1277+
},
1278+
proxy: func(clientIO, backendIO *pnet.PacketIO) error {
1279+
addr := ts.tc.backendListener.Addr().String()
1280+
inBytes1, inPackets1, outBytes1, outPackets1, err := readTraffic(addr)
1281+
require.NoError(t, err)
1282+
require.True(t, inBytes1 > inBytes && inPackets1 > inPackets && outBytes1 > outBytes && outPackets1 > outPackets)
1283+
require.NoError(t, ts.forwardCmd4Proxy(clientIO, backendIO))
1284+
inBytes2, inPackets2, outBytes2, outPackets2, err := readTraffic(addr)
1285+
require.NoError(t, err)
1286+
require.True(t, inBytes2 > inBytes1 && inPackets2 > inPackets1 && outBytes2 > outBytes1 && outPackets2 > outPackets1)
1287+
return nil
1288+
},
1289+
backend: func(packetIO *pnet.PacketIO) error {
1290+
ts.mb.respondType = responseTypeResultSet
1291+
ts.mb.columns = 1
1292+
ts.mb.rows = 1
1293+
return ts.mb.respond(packetIO)
1294+
},
1295+
},
1296+
}
1297+
ts.runTests(runners)
1298+
}
1299+
12291300
func BenchmarkSyncMap(b *testing.B) {
12301301
for i := 0; i < b.N; i++ {
12311302
var m sync.Map

0 commit comments

Comments
 (0)