factor: define query rules for factors (#599)

djshow832 · web-flow · commit 6ef55c7e454b · 2024-07-17T13:43:30.000Z
diff --git a/pkg/balance/factor/factor_cpu.go b/pkg/balance/factor/factor_cpu.go
@@ -10,6 +10,7 @@ import (
 	"github.com/pingcap/tiproxy/lib/config"
 	"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
 	"github.com/pingcap/tiproxy/pkg/util/monotime"
+	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/model"
 )
 
@@ -34,6 +35,31 @@ var (
 		HasLabel: true,
 		Range:    1 * time.Minute,
 	}
+	// cpuQueryRule describes how to derive a backend's CPU usage from the
+	// metrics scraped directly from that backend.
+	//
+	// Metric2Value converts one scrape into the accumulated CPU seconds divided
+	// by the processor count. Range2Value converts the two most recent samples
+	// into a per-second rate. Both return NaN when the value cannot be computed.
+	cpuQueryRule = metricsreader.QueryRule{
+		Names:     []string{"process_cpu_seconds_total", "tidb_server_maxprocs"},
+		Retention: 1 * time.Minute,
+		Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
+			// The generated getters are nil-safe, so a metric family that is
+			// missing from the scrape yields an empty slice instead of a panic.
+			cpuMetrics := mfs["process_cpu_seconds_total"].GetMetric()
+			procMetrics := mfs["tidb_server_maxprocs"].GetMetric()
+			if len(cpuMetrics) == 0 || len(procMetrics) == 0 {
+				return model.SampleValue(math.NaN())
+			}
+			// The scrape is parsed without type information, so the values
+			// are expected in Untyped.
+			cpuTotal := cpuMetrics[0].GetUntyped()
+			maxProcs := procMetrics[0].GetUntyped()
+			if cpuTotal == nil || maxProcs == nil {
+				return model.SampleValue(math.NaN())
+			}
+			// A non-positive processor count would produce Inf or a negative
+			// usage; report the sample as unknown instead.
+			if maxProcs.GetValue() <= 0 {
+				return model.SampleValue(math.NaN())
+			}
+			return model.SampleValue(cpuTotal.GetValue() / maxProcs.GetValue())
+		},
+		Range2Value: func(pairs []model.SamplePair) model.SampleValue {
+			// A rate needs at least two samples.
+			if len(pairs) < 2 {
+				return model.SampleValue(math.NaN())
+			}
+			pair1 := pairs[len(pairs)-2]
+			pair2 := pairs[len(pairs)-1]
+			// model.Time is in milliseconds; convert the interval to seconds.
+			timeDiff := float64(pair2.Timestamp-pair1.Timestamp) / 1000.0
+			if timeDiff < 1e-4 {
+				return model.SampleValue(math.NaN())
+			}
+			// The counter went backwards, e.g. the backend restarted between
+			// the two scrapes. The difference is meaningless in that case.
+			if pair2.Value < pair1.Value {
+				return model.SampleValue(math.NaN())
+			}
+			return (pair2.Value - pair1.Value) / model.SampleValue(timeDiff)
+		},
+		ResultType: model.ValMatrix,
+	}
 )
 
 type cpuBackendSnapshot struct {
diff --git a/pkg/balance/factor/factor_cpu_test.go b/pkg/balance/factor/factor_cpu_test.go
@@ -6,11 +6,13 @@ package factor
 import (
 	"math"
 	"strconv"
+	"strings"
 	"testing"
 	"time"
 
 	"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
 	"github.com/pingcap/tiproxy/pkg/util/monotime"
+	"github.com/prometheus/common/expfmt"
 	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
 )
@@ -296,3 +298,53 @@ func TestCPUResultNotUpdated(t *testing.T) {
 		require.EqualValues(t, test.expectedScore, backends[0].score(), "test index %d", i)
 	}
 }
+
+// TestCPUQueryRule feeds successive scrapes through cpuQueryRule. For each
+// scrape it checks the value produced by Metric2Value and then the value that
+// Range2Value derives from all samples collected so far.
+func TestCPUQueryRule(t *testing.T) {
+	type scrapeCase struct {
+		text       string
+		timestamp  model.Time
+		curValue   model.SampleValue
+		finalValue model.SampleValue
+	}
+	nan := model.SampleValue(math.NaN())
+	cases := []scrapeCase{
+		// A single sample is not enough to compute a rate.
+		{
+			text: `process_cpu_seconds_total 10
+tidb_server_maxprocs 2
+`,
+			timestamp:  model.Time(0),
+			curValue:   5,
+			finalValue: nan,
+		},
+		// The counter did not move within one second.
+		{
+			text: `process_cpu_seconds_total 10
+tidb_server_maxprocs 2
+`,
+			timestamp:  model.Time(1000),
+			curValue:   5,
+			finalValue: 0,
+		},
+		// The normalized counter grew by 1 within one second.
+		{
+			text: `process_cpu_seconds_total 12
+tidb_server_maxprocs 2
+`,
+			timestamp:  model.Time(2000),
+			curValue:   6,
+			finalValue: 1,
+		},
+	}
+
+	var history []model.SamplePair
+	for idx, tc := range cases {
+		parser := expfmt.TextParser{}
+		families, err := parser.TextToMetricFamilies(strings.NewReader(tc.text))
+		require.NoError(t, err, "case %d", idx)
+
+		scraped := cpuQueryRule.Metric2Value(families)
+		require.Equal(t, tc.curValue, scraped, "case %d", idx)
+
+		history = append(history, model.SamplePair{Timestamp: tc.timestamp, Value: scraped})
+		rate := cpuQueryRule.Range2Value(history)
+		// NaN never equals itself, so it has to be checked separately.
+		if math.IsNaN(float64(tc.finalValue)) {
+			require.True(t, math.IsNaN(float64(rate)), "case %d", idx)
+			continue
+		}
+		require.Equal(t, tc.finalValue, rate, "case %d", idx)
+	}
+}
diff --git a/pkg/balance/factor/factor_health.go b/pkg/balance/factor/factor_health.go
@@ -10,6 +10,7 @@ import (
 	"github.com/pingcap/tiproxy/lib/config"
 	"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
 	"github.com/pingcap/tiproxy/pkg/util/monotime"
+	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/model"
 )
 
@@ -31,6 +32,7 @@ const (
 )
 
 type errDefinition struct {
+	queryRule        metricsreader.QueryRule
 	promQL           string
 	failThreshold    int
 	recoverThreshold int
@@ -63,6 +65,32 @@ var (
 			promQL:           `sum(increase(tidb_tikvclient_backoff_seconds_count{type="pdRPC"}[2m])) by (instance)`,
 			failThreshold:    50,
 			recoverThreshold: 10,
+			// queryRule sums the backoff counters labelled type="pdRPC" in one
+			// scrape and reports how much that sum grew between the oldest and
+			// the newest retained sample.
+			queryRule: metricsreader.QueryRule{
+				Names:     []string{"tidb_tikvclient_backoff_seconds_count"},
+				Retention: 2 * time.Minute,
+				Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
+					total := 0
+					// The generated getters are nil-safe: if the metric family is
+					// absent from the scrape, the loop does not run and the sum
+					// stays 0 instead of panicking.
+					for _, m := range mfs["tidb_tikvclient_backoff_seconds_count"].GetMetric() {
+						for _, label := range m.GetLabel() {
+							if label.GetName() != "type" {
+								continue
+							}
+							if untyped := m.GetUntyped(); label.GetValue() == "pdRPC" && untyped != nil {
+								total += int(untyped.GetValue())
+							}
+							// Only the "type" label matters; stop scanning the rest.
+							break
+						}
+					}
+					return model.SampleValue(total)
+				},
+				Range2Value: func(pairs []model.SamplePair) model.SampleValue {
+					// An increase needs at least two samples.
+					if len(pairs) < 2 {
+						return model.SampleValue(math.NaN())
+					}
+					return pairs[len(pairs)-1].Value - pairs[0].Value
+				},
+				ResultType: model.ValVector,
+			},
 		},
 		{
 			// may be caused by disconnection to TiKV
@@ -71,6 +99,32 @@ var (
 			promQL:           `sum(increase(tidb_tikvclient_backoff_seconds_count{type=~"regionMiss|tikvRPC"}[2m])) by (instance)`,
 			failThreshold:    1000,
 			recoverThreshold: 100,
+			// queryRule sums the backoff counters labelled type="regionMiss" or
+			// type="tikvRPC" in one scrape and reports how much that sum grew
+			// between the oldest and the newest retained sample.
+			queryRule: metricsreader.QueryRule{
+				Names:     []string{"tidb_tikvclient_backoff_seconds_count"},
+				Retention: 2 * time.Minute,
+				Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
+					total := 0
+					// The generated getters are nil-safe: if the metric family is
+					// absent from the scrape, the loop does not run and the sum
+					// stays 0 instead of panicking.
+					for _, m := range mfs["tidb_tikvclient_backoff_seconds_count"].GetMetric() {
+						for _, label := range m.GetLabel() {
+							if label.GetName() != "type" {
+								continue
+							}
+							backoffType := label.GetValue()
+							if untyped := m.GetUntyped(); (backoffType == "regionMiss" || backoffType == "tikvRPC") && untyped != nil {
+								total += int(untyped.GetValue())
+							}
+							// Only the "type" label matters; stop scanning the rest.
+							break
+						}
+					}
+					return model.SampleValue(total)
+				},
+				Range2Value: func(pairs []model.SamplePair) model.SampleValue {
+					// An increase needs at least two samples.
+					if len(pairs) < 2 {
+						return model.SampleValue(math.NaN())
+					}
+					return pairs[len(pairs)-1].Value - pairs[0].Value
+				},
+				ResultType: model.ValVector,
+			},
 		},
 	}
 )
diff --git a/pkg/balance/factor/factor_health_test.go b/pkg/balance/factor/factor_health_test.go
@@ -6,10 +6,12 @@ package factor
 import (
 	"math"
 	"sort"
+	"strings"
 	"testing"
 
 	"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
 	"github.com/pingcap/tiproxy/pkg/util/monotime"
+	"github.com/prometheus/common/expfmt"
 	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
 )
@@ -282,3 +284,60 @@ func TestHealthBalanceCount(t *testing.T) {
 		require.Equal(t, test.count, count, "test idx: %d", i)
 	}
 }
+
+func TestHealthQueryRule(t *testing.T) {
+	tests := []struct {
+		text       string
+		curValue   []model.SampleValue
+		finalValue []model.SampleValue
+	}{
+		{
+			text: `tidb_tikvclient_backoff_seconds_count{type=""} 0
+tidb_tikvclient_backoff_seconds_count{type="dataNotReady"} 0
+tidb_tikvclient_backoff_seconds_count{type="pdRPC"} 0
+tidb_tikvclient_backoff_seconds_count{type="regionMiss"} 10
+tidb_tikvclient_backoff_seconds_count{type="tikvRPC"} 0
+`,
+			curValue:   []model.SampleValue{0, 10},
+			finalValue: []model.SampleValue{model.SampleValue(math.NaN()), model.SampleValue(math.NaN())},
+		},
+		{
+			text: `tidb_tikvclient_backoff_seconds_count{type=""} 10
+tidb_tikvclient_backoff_seconds_count{type="dataNotReady"} 10
+tidb_tikvclient_backoff_seconds_count{type="pdRPC"} 10
+tidb_tikvclient_backoff_seconds_count{type="regionMiss"} 110
+tidb_tikvclient_backoff_seconds_count{type="tikvRPC"} 100
+`,
+			curValue:   []model.SampleValue{10, 210},
+			finalValue: []model.SampleValue{10, 200},
+		},
+		{
+			text: `tidb_tikvclient_backoff_seconds_count{type=""} 10
+tidb_tikvclient_backoff_seconds_count{type="dataNotReady"} 10
+tidb_tikvclient_backoff_seconds_count{type="pdRPC"} 10
+tidb_tikvclient_backoff_seconds_count{type="regionMiss"} 110
+tidb_tikvclient_backoff_seconds_count{type="tikvRPC"} 100
+`,
+			curValue:   []model.SampleValue{10, 210},
+			finalValue: []model.SampleValue{10, 200},
+		},
+	}
+
+	historyPair := make([][]model.SamplePair, len(errDefinitions))
+	for i, test := range tests {
+		var parser expfmt.TextParser
+		mfs, err := parser.TextToMetricFamilies(strings.NewReader(test.text))
+		require.NoError(t, err, "case %d", i)
+		for j, ed := range errDefinitions {
+			value := ed.queryRule.Metric2Value(mfs)
+			require.Equal(t, test.curValue[j], value, "case %d %d", i, j)
+			historyPair[j] = append(historyPair[j], model.SamplePair{Value: value})
+			value = ed.queryRule.Range2Value(historyPair[j])
+			if math.IsNaN(float64(test.finalValue[j])) {
+				require.True(t, math.IsNaN(float64(value)), "case %d %d", i, j)
+			} else {
+				require.Equal(t, test.finalValue[j], value, "case %d %d", i, j)
+			}
+		}
+	}
+}
diff --git a/pkg/balance/factor/factor_memory.go b/pkg/balance/factor/factor_memory.go
@@ -10,6 +10,7 @@ import (
 	"github.com/pingcap/tiproxy/lib/config"
 	"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
 	"github.com/pingcap/tiproxy/pkg/util/monotime"
+	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/model"
 )
 
@@ -32,6 +33,25 @@ var (
 		HasLabel: true,
 		Range:    1 * time.Minute,
 	}
+	// memoryQueryRule describes how to derive a backend's memory usage from
+	// the metrics scraped directly from that backend.
+	//
+	// Metric2Value converts one scrape into resident memory divided by the
+	// memory quota. Range2Value reports the most recent sample. Both return
+	// NaN when the value cannot be computed.
+	memoryQueryRule = metricsreader.QueryRule{
+		Names:     []string{"process_resident_memory_bytes", "tidb_server_memory_quota_bytes"},
+		Retention: 1 * time.Minute,
+		Metric2Value: func(mfs map[string]*dto.MetricFamily) model.SampleValue {
+			// The generated getters are nil-safe, so a metric family that is
+			// missing from the scrape yields an empty slice instead of a panic.
+			usageMetrics := mfs["process_resident_memory_bytes"].GetMetric()
+			quotaMetrics := mfs["tidb_server_memory_quota_bytes"].GetMetric()
+			if len(usageMetrics) == 0 || len(quotaMetrics) == 0 {
+				return model.SampleValue(math.NaN())
+			}
+			// The scrape is parsed without type information, so the values
+			// are expected in Untyped.
+			memoryUsage := usageMetrics[0].GetUntyped()
+			memoryQuota := quotaMetrics[0].GetUntyped()
+			if memoryUsage == nil || memoryQuota == nil {
+				return model.SampleValue(math.NaN())
+			}
+			// A non-positive quota would turn the ratio into Inf or a negative
+			// number; report the sample as unknown instead.
+			if memoryQuota.GetValue() <= 0 {
+				return model.SampleValue(math.NaN())
+			}
+			return model.SampleValue(memoryUsage.GetValue() / memoryQuota.GetValue())
+		},
+		Range2Value: func(pairs []model.SamplePair) model.SampleValue {
+			// Memory usage is a gauge, so the latest sample is the answer.
+			if len(pairs) < 1 {
+				return model.SampleValue(math.NaN())
+			}
+			return pairs[len(pairs)-1].Value
+		},
+		ResultType: model.ValMatrix,
+	}
 )
 
 type oomRiskLevel struct {
diff --git a/pkg/balance/factor/factor_memory_test.go b/pkg/balance/factor/factor_memory_test.go
@@ -6,11 +6,13 @@ package factor
 import (
 	"math"
 	"sort"
+	"strings"
 	"testing"
 	"time"
 
 	"github.com/pingcap/tiproxy/pkg/balance/metricsreader"
 	"github.com/pingcap/tiproxy/pkg/util/monotime"
+	"github.com/prometheus/common/expfmt"
 	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
 )
@@ -299,3 +301,38 @@ func TestMemoryBalanceCount(t *testing.T) {
 		require.Equal(t, test.count, count, "test idx: %d", i)
 	}
 }
+
+// TestMemoryQueryRule feeds successive scrapes through memoryQueryRule. For
+// each scrape it checks the value produced by Metric2Value and then the value
+// that Range2Value reports for all samples collected so far.
+func TestMemoryQueryRule(t *testing.T) {
+	type scrapeCase struct {
+		text       string
+		curValue   model.SampleValue
+		finalValue model.SampleValue
+	}
+	cases := []scrapeCase{
+		{
+			text: `process_resident_memory_bytes 4e+08
+tidb_server_memory_quota_bytes 8e+08
+`,
+			curValue:   0.5,
+			finalValue: 0.5,
+		},
+		{
+			text: `process_resident_memory_bytes 6e+08
+tidb_server_memory_quota_bytes 8e+08
+`,
+			curValue:   0.75,
+			finalValue: 0.75,
+		},
+	}
+
+	var history []model.SamplePair
+	for idx, tc := range cases {
+		parser := expfmt.TextParser{}
+		families, err := parser.TextToMetricFamilies(strings.NewReader(tc.text))
+		require.NoError(t, err, "case %d", idx)
+
+		scraped := memoryQueryRule.Metric2Value(families)
+		require.Equal(t, tc.curValue, scraped, "case %d", idx)
+
+		history = append(history, model.SamplePair{Value: scraped})
+		latest := memoryQueryRule.Range2Value(history)
+		require.Equal(t, tc.finalValue, latest, "case %d", idx)
+	}
+}
diff --git a/pkg/balance/metricsreader/backend_reader.go b/pkg/balance/metricsreader/backend_reader.go
@@ -424,7 +424,8 @@ func filterMetrics(all string, names []string) string {
 			all = all[idx+1:]
 		}
 		for i := range names {
-			// strings.Contains() includes the metric description in the result but it's slower.
+			// strings.Contains() includes the metric type in the result but it's slower.
+			// Note that the result is always in `Metric.Untyped` because the metric type is ignored.
 			if strings.HasPrefix(line, names[i]) {
 				buffer.WriteString(line)
 				break
diff --git a/pkg/balance/metricsreader/backend_reader_test.go b/pkg/balance/metricsreader/backend_reader_test.go
@@ -10,6 +10,7 @@ import (
 	"math/rand"
 	"net"
 	"net/http"
+	"slices"
 	"strconv"
 	"strings"
 	"testing"
@@ -77,6 +78,7 @@ func TestGetBackendAddrs(t *testing.T) {
 			require.Error(t, err, "case %d", i)
 		} else {
 			require.NoError(t, err, "case %d", i)
+			slices.Sort(addrs)
 			require.Equal(t, test.expected, addrs, "case %d", i)
 		}
 	}
diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go
@@ -6,7 +6,6 @@ package metrics
 import (
 	"context"
 	"slices"
-	"sort"
 	"strings"
 	"testing"
 
@@ -80,9 +79,7 @@ func TestDelLabelValues(t *testing.T) {
 				}
 			}
 		}
-		sort.Slice(addrs, func(i, j int) bool {
-			return addrs[i] < addrs[j]
-		})
+		slices.Sort(addrs)
 		return addrs
 	}
 	for i, test := range tests {

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,6 @@ package metrics`
`6`	`6`	`import (`
`7`	`7`	`"context"`
`8`	`8`	`"slices"`
`9`		`- "sort"`
`10`	`9`	`"strings"`
`11`	`10`	`"testing"`
`12`	`11`
`@@ -80,9 +79,7 @@ func TestDelLabelValues(t *testing.T) {`
`80`	`79`	`}`
`81`	`80`	`}`
`82`	`81`	`}`
`83`		`- sort.Slice(addrs, func(i, j int) bool {`
`84`		`- return addrs[i] < addrs[j]`
`85`		`- })`
	`82`	`+ slices.Sort(addrs)`
`86`	`83`	`return addrs`
`87`	`84`	`}`
`88`	`85`	`for i, test := range tests {`