Skip to content

Commit 6ef55c7

Browse files
authored
factor: define query rules for factors (#599)
1 parent aa354b7 commit 6ef55c7

File tree

9 files changed

+253
-5
lines changed

9 files changed

+253
-5
lines changed

pkg/balance/factor/factor_cpu.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/pingcap/tiproxy/lib/config"
1111
"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
1212
"github.com/pingcap/tiproxy/pkg/util/monotime"
13+
dto "github.com/prometheus/client_model/go"
1314
"github.com/prometheus/common/model"
1415
)
1516

@@ -34,6 +35,31 @@ var (
3435
HasLabel: true,
3536
Range: 1 * time.Minute,
3637
}
38+
cpuQueryRule = metricsreader.QueryRule{
39+
Names: []string{"process_cpu_seconds_total", "tidb_server_maxprocs"},
40+
Retention: 1 * time.Minute,
41+
Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
42+
cpuTotal := mfs["process_cpu_seconds_total"].Metric[0].Untyped
43+
maxProcs := mfs["tidb_server_maxprocs"].Metric[0].Untyped
44+
if cpuTotal == nil || maxProcs == nil {
45+
return model.SampleValue(math.NaN())
46+
}
47+
return model.SampleValue(*cpuTotal.Value / *maxProcs.Value)
48+
},
49+
Range2Value: func(pairs []model.SamplePair) model.SampleValue {
50+
if len(pairs) < 2 {
51+
return model.SampleValue(math.NaN())
52+
}
53+
pair1 := pairs[len(pairs)-2]
54+
pair2 := pairs[len(pairs)-1]
55+
timeDiff := float64(pair2.Timestamp-pair1.Timestamp) / 1000.0
56+
if timeDiff < 1e-4 {
57+
return model.SampleValue(math.NaN())
58+
}
59+
return (pair2.Value - pair1.Value) / model.SampleValue(timeDiff)
60+
},
61+
ResultType: model.ValMatrix,
62+
}
3763
)
3864

3965
type cpuBackendSnapshot struct {

pkg/balance/factor/factor_cpu_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@ package factor
66
import (
77
"math"
88
"strconv"
9+
"strings"
910
"testing"
1011
"time"
1112

1213
"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
1314
"github.com/pingcap/tiproxy/pkg/util/monotime"
15+
"github.com/prometheus/common/expfmt"
1416
"github.com/prometheus/common/model"
1517
"github.com/stretchr/testify/require"
1618
)
@@ -296,3 +298,53 @@ func TestCPUResultNotUpdated(t *testing.T) {
296298
require.EqualValues(t, test.expectedScore, backends[0].score(), "test index %d", i)
297299
}
298300
}
301+
302+
func TestCPUQueryRule(t *testing.T) {
303+
tests := []struct {
304+
text string
305+
timestamp model.Time
306+
curValue model.SampleValue
307+
finalValue model.SampleValue
308+
}{
309+
{
310+
text: `process_cpu_seconds_total 10
311+
tidb_server_maxprocs 2
312+
`,
313+
timestamp: model.Time(0),
314+
curValue: 5,
315+
finalValue: model.SampleValue(math.NaN()),
316+
},
317+
{
318+
text: `process_cpu_seconds_total 10
319+
tidb_server_maxprocs 2
320+
`,
321+
timestamp: model.Time(1000),
322+
curValue: 5,
323+
finalValue: 0,
324+
},
325+
{
326+
text: `process_cpu_seconds_total 12
327+
tidb_server_maxprocs 2
328+
`,
329+
timestamp: model.Time(2000),
330+
curValue: 6,
331+
finalValue: 1,
332+
},
333+
}
334+
335+
historyPair := make([]model.SamplePair, 0)
336+
for i, test := range tests {
337+
var parser expfmt.TextParser
338+
mfs, err := parser.TextToMetricFamilies(strings.NewReader(test.text))
339+
require.NoError(t, err, "case %d", i)
340+
value := cpuQueryRule.Metric2Value(mfs)
341+
require.Equal(t, test.curValue, value, "case %d", i)
342+
historyPair = append(historyPair, model.SamplePair{Timestamp: test.timestamp, Value: value})
343+
value = cpuQueryRule.Range2Value(historyPair)
344+
if math.IsNaN(float64(test.finalValue)) {
345+
require.True(t, math.IsNaN(float64(value)), "case %d", i)
346+
} else {
347+
require.Equal(t, test.finalValue, value, "case %d", i)
348+
}
349+
}
350+
}

pkg/balance/factor/factor_health.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/pingcap/tiproxy/lib/config"
1111
"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
1212
"github.com/pingcap/tiproxy/pkg/util/monotime"
13+
dto "github.com/prometheus/client_model/go"
1314
"github.com/prometheus/common/model"
1415
)
1516

@@ -31,6 +32,7 @@ const (
3132
)
3233

3334
type errDefinition struct {
35+
queryRule metricsreader.QueryRule
3436
promQL string
3537
failThreshold int
3638
recoverThreshold int
@@ -63,6 +65,32 @@ var (
6365
promQL: `sum(increase(tidb_tikvclient_backoff_seconds_count{type="pdRPC"}[2m])) by (instance)`,
6466
failThreshold: 50,
6567
recoverThreshold: 10,
68+
queryRule: metricsreader.QueryRule{
69+
Names: []string{"tidb_tikvclient_backoff_seconds_count"},
70+
Retention: 2 * time.Minute,
71+
Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
72+
mt := mfs["tidb_tikvclient_backoff_seconds_count"].Metric
73+
total := 0
74+
for _, m := range mt {
75+
for _, label := range m.Label {
76+
if *label.Name == "type" {
77+
if *label.Value == "pdRPC" && m.Untyped != nil {
78+
total += int(*m.Untyped.Value)
79+
}
80+
break
81+
}
82+
}
83+
}
84+
return model.SampleValue(total)
85+
},
86+
Range2Value: func(pairs []model.SamplePair) model.SampleValue {
87+
if len(pairs) < 2 {
88+
return model.SampleValue(math.NaN())
89+
}
90+
return pairs[len(pairs)-1].Value - pairs[0].Value
91+
},
92+
ResultType: model.ValVector,
93+
},
6694
},
6795
{
6896
// may be caused by disconnection to TiKV
@@ -71,6 +99,32 @@ var (
7199
promQL: `sum(increase(tidb_tikvclient_backoff_seconds_count{type=~"regionMiss|tikvRPC"}[2m])) by (instance)`,
72100
failThreshold: 1000,
73101
recoverThreshold: 100,
102+
queryRule: metricsreader.QueryRule{
103+
Names: []string{"tidb_tikvclient_backoff_seconds_count"},
104+
Retention: 2 * time.Minute,
105+
Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
106+
mt := mfs["tidb_tikvclient_backoff_seconds_count"].Metric
107+
total := 0
108+
for _, m := range mt {
109+
for _, label := range m.Label {
110+
if *label.Name == "type" {
111+
if (*label.Value == "regionMiss" || *label.Value == "tikvRPC") && m.Untyped != nil {
112+
total += int(*m.Untyped.Value)
113+
}
114+
break
115+
}
116+
}
117+
}
118+
return model.SampleValue(total)
119+
},
120+
Range2Value: func(pairs []model.SamplePair) model.SampleValue {
121+
if len(pairs) < 2 {
122+
return model.SampleValue(math.NaN())
123+
}
124+
return pairs[len(pairs)-1].Value - pairs[0].Value
125+
},
126+
ResultType: model.ValVector,
127+
},
74128
},
75129
}
76130
)

pkg/balance/factor/factor_health_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@ package factor
66
import (
77
"math"
88
"sort"
9+
"strings"
910
"testing"
1011

1112
"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
1213
"github.com/pingcap/tiproxy/pkg/util/monotime"
14+
"github.com/prometheus/common/expfmt"
1315
"github.com/prometheus/common/model"
1416
"github.com/stretchr/testify/require"
1517
)
@@ -282,3 +284,60 @@ func TestHealthBalanceCount(t *testing.T) {
282284
require.Equal(t, test.count, count, "test idx: %d", i)
283285
}
284286
}
287+
288+
func TestHealthQueryRule(t *testing.T) {
289+
tests := []struct {
290+
text string
291+
curValue []model.SampleValue
292+
finalValue []model.SampleValue
293+
}{
294+
{
295+
text: `tidb_tikvclient_backoff_seconds_count{type=""} 0
296+
tidb_tikvclient_backoff_seconds_count{type="dataNotReady"} 0
297+
tidb_tikvclient_backoff_seconds_count{type="pdRPC"} 0
298+
tidb_tikvclient_backoff_seconds_count{type="regionMiss"} 10
299+
tidb_tikvclient_backoff_seconds_count{type="tikvRPC"} 0
300+
`,
301+
curValue: []model.SampleValue{0, 10},
302+
finalValue: []model.SampleValue{model.SampleValue(math.NaN()), model.SampleValue(math.NaN())},
303+
},
304+
{
305+
text: `tidb_tikvclient_backoff_seconds_count{type=""} 10
306+
tidb_tikvclient_backoff_seconds_count{type="dataNotReady"} 10
307+
tidb_tikvclient_backoff_seconds_count{type="pdRPC"} 10
308+
tidb_tikvclient_backoff_seconds_count{type="regionMiss"} 110
309+
tidb_tikvclient_backoff_seconds_count{type="tikvRPC"} 100
310+
`,
311+
curValue: []model.SampleValue{10, 210},
312+
finalValue: []model.SampleValue{10, 200},
313+
},
314+
{
315+
text: `tidb_tikvclient_backoff_seconds_count{type=""} 10
316+
tidb_tikvclient_backoff_seconds_count{type="dataNotReady"} 10
317+
tidb_tikvclient_backoff_seconds_count{type="pdRPC"} 10
318+
tidb_tikvclient_backoff_seconds_count{type="regionMiss"} 110
319+
tidb_tikvclient_backoff_seconds_count{type="tikvRPC"} 100
320+
`,
321+
curValue: []model.SampleValue{10, 210},
322+
finalValue: []model.SampleValue{10, 200},
323+
},
324+
}
325+
326+
historyPair := make([][]model.SamplePair, len(errDefinitions))
327+
for i, test := range tests {
328+
var parser expfmt.TextParser
329+
mfs, err := parser.TextToMetricFamilies(strings.NewReader(test.text))
330+
require.NoError(t, err, "case %d", i)
331+
for j, ed := range errDefinitions {
332+
value := ed.queryRule.Metric2Value(mfs)
333+
require.Equal(t, test.curValue[j], value, "case %d %d", i, j)
334+
historyPair[j] = append(historyPair[j], model.SamplePair{Value: value})
335+
value = ed.queryRule.Range2Value(historyPair[j])
336+
if math.IsNaN(float64(test.finalValue[j])) {
337+
require.True(t, math.IsNaN(float64(value)), "case %d %d", i, j)
338+
} else {
339+
require.Equal(t, test.finalValue[j], value, "case %d %d", i, j)
340+
}
341+
}
342+
}
343+
}

pkg/balance/factor/factor_memory.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/pingcap/tiproxy/lib/config"
1111
"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
1212
"github.com/pingcap/tiproxy/pkg/util/monotime"
13+
dto "github.com/prometheus/client_model/go"
1314
"github.com/prometheus/common/model"
1415
)
1516

@@ -32,6 +33,25 @@ var (
3233
HasLabel: true,
3334
Range: 1 * time.Minute,
3435
}
36+
memoryQueryRule = metricsreader.QueryRule{
37+
Names: []string{"process_resident_memory_bytes", "tidb_server_memory_quota_bytes"},
38+
Retention: 1 * time.Minute,
39+
Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
40+
memoryUsage := mfs["process_resident_memory_bytes"].Metric[0].Untyped
41+
memoryQuota := mfs["tidb_server_memory_quota_bytes"].Metric[0].Untyped
42+
if memoryUsage == nil || memoryQuota == nil {
43+
return model.SampleValue(math.NaN())
44+
}
45+
return model.SampleValue(*memoryUsage.Value / *memoryQuota.Value)
46+
},
47+
Range2Value: func(pairs []model.SamplePair) model.SampleValue {
48+
if len(pairs) < 1 {
49+
return model.SampleValue(math.NaN())
50+
}
51+
return pairs[len(pairs)-1].Value
52+
},
53+
ResultType: model.ValMatrix,
54+
}
3555
)
3656

3757
type oomRiskLevel struct {

pkg/balance/factor/factor_memory_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@ package factor
66
import (
77
"math"
88
"sort"
9+
"strings"
910
"testing"
1011
"time"
1112

1213
"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
1314
"github.com/pingcap/tiproxy/pkg/util/monotime"
15+
"github.com/prometheus/common/expfmt"
1416
"github.com/prometheus/common/model"
1517
"github.com/stretchr/testify/require"
1618
)
@@ -299,3 +301,38 @@ func TestMemoryBalanceCount(t *testing.T) {
299301
require.Equal(t, test.count, count, "test idx: %d", i)
300302
}
301303
}
304+
305+
func TestMemoryQueryRule(t *testing.T) {
306+
tests := []struct {
307+
text string
308+
curValue model.SampleValue
309+
finalValue model.SampleValue
310+
}{
311+
{
312+
text: `process_resident_memory_bytes 4e+08
313+
tidb_server_memory_quota_bytes 8e+08
314+
`,
315+
curValue: 0.5,
316+
finalValue: 0.5,
317+
},
318+
{
319+
text: `process_resident_memory_bytes 6e+08
320+
tidb_server_memory_quota_bytes 8e+08
321+
`,
322+
curValue: 0.75,
323+
finalValue: 0.75,
324+
},
325+
}
326+
327+
historyPair := make([]model.SamplePair, 0)
328+
for i, test := range tests {
329+
var parser expfmt.TextParser
330+
mfs, err := parser.TextToMetricFamilies(strings.NewReader(test.text))
331+
require.NoError(t, err, "case %d", i)
332+
value := memoryQueryRule.Metric2Value(mfs)
333+
require.Equal(t, test.curValue, value, "case %d", i)
334+
historyPair = append(historyPair, model.SamplePair{Value: value})
335+
value = memoryQueryRule.Range2Value(historyPair)
336+
require.Equal(t, test.finalValue, value, "case %d", i)
337+
}
338+
}

pkg/balance/metricsreader/backend_reader.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,8 @@ func filterMetrics(all string, names []string) string {
424424
all = all[idx+1:]
425425
}
426426
for i := range names {
427-
// strings.Contains() includes the metric description in the result but it's slower.
427+
// strings.Contains() includes the metric type in the result but it's slower.
428+
// Note that the result is always in `Metric.Untyped` because the metric type is ignored.
428429
if strings.HasPrefix(line, names[i]) {
429430
buffer.WriteString(line)
430431
break

pkg/balance/metricsreader/backend_reader_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"math/rand"
1111
"net"
1212
"net/http"
13+
"slices"
1314
"strconv"
1415
"strings"
1516
"testing"
@@ -77,6 +78,7 @@ func TestGetBackendAddrs(t *testing.T) {
7778
require.Error(t, err, "case %d", i)
7879
} else {
7980
require.NoError(t, err, "case %d", i)
81+
slices.Sort(addrs)
8082
require.Equal(t, test.expected, addrs, "case %d", i)
8183
}
8284
}

pkg/metrics/metrics_test.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ package metrics
66
import (
77
"context"
88
"slices"
9-
"sort"
109
"strings"
1110
"testing"
1211

@@ -80,9 +79,7 @@ func TestDelLabelValues(t *testing.T) {
8079
}
8180
}
8281
}
83-
sort.Slice(addrs, func(i, j int) bool {
84-
return addrs[i] < addrs[j]
85-
})
82+
slices.Sort(addrs)
8683
return addrs
8784
}
8885
for i, test := range tests {

0 commit comments

Comments
 (0)