@@ -34,6 +34,7 @@ const (
3434type errDefinition struct {
3535 queryRule metricsreader.QueryRule
3636 promQL string
37+ key string
3738 failThreshold int
3839 recoverThreshold int
3940}
6566 promQL : `sum(increase(tidb_tikvclient_backoff_seconds_count{type="pdRPC"}[2m])) by (instance)` ,
6667 failThreshold : 50 ,
6768 recoverThreshold : 10 ,
69+ key : "health_pd" ,
6870 queryRule : metricsreader.QueryRule {
6971 Names : []string {"tidb_tikvclient_backoff_seconds_count" },
7072 Retention : 2 * time .Minute ,
8789 if len (pairs ) < 2 {
8890 return model .SampleValue (math .NaN ())
8991 }
90- return pairs [len (pairs )- 1 ].Value - pairs [0 ].Value
92+ diff := pairs [len (pairs )- 1 ].Value - pairs [0 ].Value
93+ // A negative difference means the counter was reset, most likely because the backend restarted, so report NaN.
94+ if diff < 0 {
95+ return model .SampleValue (math .NaN ())
96+ }
97+ return diff
9198 },
9299 ResultType : model .ValVector ,
93100 },
99106 promQL : `sum(increase(tidb_tikvclient_backoff_seconds_count{type=~"regionMiss|tikvRPC"}[2m])) by (instance)` ,
100107 failThreshold : 1000 ,
101108 recoverThreshold : 100 ,
109+ key : "health_tikv" ,
102110 queryRule : metricsreader.QueryRule {
103111 Names : []string {"tidb_tikvclient_backoff_seconds_count" },
104112 Retention : 2 * time .Minute ,
@@ -121,7 +129,12 @@ var (
121129 if len (pairs ) < 2 {
122130 return model .SampleValue (math .NaN ())
123131 }
124- return pairs [len (pairs )- 1 ].Value - pairs [0 ].Value
132+ diff := pairs [len (pairs )- 1 ].Value - pairs [0 ].Value
133+ // A negative difference means the counter was reset, most likely because the backend restarted, so report NaN.
134+ if diff < 0 {
135+ return model .SampleValue (math .NaN ())
136+ }
137+ return diff
125138 },
126139 ResultType : model .ValVector ,
127140 },
@@ -141,8 +154,9 @@ type healthBackendSnapshot struct {
141154
142155type errIndicator struct {
143156 queryExpr metricsreader.QueryExpr
157+ queryRule metricsreader.QueryRule
144158 queryResult metricsreader.QueryResult
145- queryID uint64
159+ key string
146160 failThreshold int
147161 recoverThreshold int
148162}
@@ -170,10 +184,12 @@ func initErrIndicator(mr metricsreader.MetricsReader) []errIndicator {
170184 queryExpr : metricsreader.QueryExpr {
171185 PromQL : def .promQL ,
172186 },
187+ queryRule : def .queryRule ,
188+ key : def .key ,
173189 failThreshold : def .failThreshold ,
174190 recoverThreshold : def .recoverThreshold ,
175191 }
176- indicator . queryID = mr .AddQueryExpr (indicator .queryExpr )
192+ mr .AddQueryExpr (indicator .key , indicator . queryExpr , indicator . queryRule )
177193 indicators = append (indicators , indicator )
178194 }
179195 return indicators
@@ -189,8 +205,8 @@ func (fh *FactorHealth) UpdateScore(backends []scoredBackend) {
189205 }
190206 needUpdateSnapshot , latestTime := false , monotime .Time (0 )
191207 for i := 0 ; i < len (fh .indicators ); i ++ {
192- qr := fh .mr .GetQueryResult (fh .indicators [i ].queryID )
193- if qr .Err != nil || qr . Empty () {
208+ qr := fh .mr .GetQueryResult (fh .indicators [i ].key )
209+ if qr .Empty () {
194210 continue
195211 }
196212 if fh .indicators [i ].queryResult .UpdateTime != qr .UpdateTime {
@@ -313,6 +329,6 @@ func (fh *FactorHealth) SetConfig(cfg *config.Config) {
313329
314330func (fh * FactorHealth ) Close () {
315331 for _ , indicator := range fh .indicators {
316- fh .mr .RemoveQueryExpr (indicator .queryID )
332+ fh .mr .RemoveQueryExpr (indicator .key )
317333 }
318334}
0 commit comments