Skip to content

Commit bb27724

Browse files
F-cqylgeeker
authored and committed
feat(dbha-v2): Add metric for analysis. issue: #16382
1 parent cac76e1 commit bb27724

File tree

4 files changed

+213
-1
lines changed

4 files changed

+213
-1
lines changed

dbm-services/common/dbha-v2/internal/analysis/apm/metric.go

Lines changed: 133 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,145 @@ import (
2929
"dbm-services/common/go-pubpkg/apm/metric"
3030
)
3131

32-
var Metrics []*metric.Metric
32+
// Label names attached to the switching metrics declared in this package.
const (
33+
MetricLabelSwitchID = "switch_id"
34+
MetricLabelActionScope = "action_scope"
35+
MetricLabelDbType = "db_type"
36+
)
37+
38+
// Package-level metric handles. Metrics is the list filled by InitAPM; the
// counters and histograms below are assigned in init and converted into
// Metrics entries by InitAPM.
var (
39+
Metrics []*metric.Metric
40+
41+
ScanBusinessTotal *haapm.HaCounter
42+
ScanBusinessTimeConsumingMs *haapm.HaHistogram
43+
SwitchingTimeConsumingMs *haapm.HaHistogram
44+
MysqlClusterSwitchingTimeConsumingMs *haapm.HaHistogram
45+
MysqlHostSwitchingTimeConsumingMs *haapm.HaHistogram
46+
MysqlInstanceSwitchingTimeConsumingMs *haapm.HaHistogram
47+
SwitchingSuccessTotal *haapm.HaCounter
48+
SwitchingErrorTotal *haapm.HaCounter
49+
MysqlSwitchingSuccessTotal *haapm.HaCounter
50+
MysqlSwitchingErrorTotal *haapm.HaCounter
51+
RedisSwitchingSuccessTotal *haapm.HaCounter
52+
RedisSwitchingErrorTotal *haapm.HaCounter
53+
)
54+
55+
// defaultLatencyBuckets holds the bucket boundaries, in milliseconds (1 ms up
// to 10 s), passed to every latency histogram created in init.
56+
var defaultLatencyBuckets = []float64{1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000}
57+
58+
// init builds every package-level counter and histogram. It only constructs
// the handles; they are appended to Metrics later by InitAPM.
func init() {
59+
// Scan business total counter
60+
ScanBusinessTotal = haapm.NewHaCounter(
61+
"scan_business_total",
62+
"Total number of scan business",
63+
haapm.MetricLabelServiceID, haapm.MetricLabelServiceName,
64+
)
65+
66+
// Scan business time consuming histogram
67+
ScanBusinessTimeConsumingMs = haapm.NewHaHistogramWithBuckets(
68+
"scan_business_time_consuming_ms",
69+
"Time consuming of scan business in milliseconds",
70+
defaultLatencyBuckets,
71+
haapm.MetricLabelServiceID, haapm.MetricLabelServiceName,
72+
)
73+
74+
// Switching time consuming histogram
75+
SwitchingTimeConsumingMs = haapm.NewHaHistogramWithBuckets(
76+
"switching_time_consuming_ms",
77+
"Time consuming of switching in milliseconds",
78+
defaultLatencyBuckets,
79+
MetricLabelDbType,
80+
)
81+
82+
// NOTE(review): the three MySQL histograms below carry switch_id as a label.
// If every switch gets a unique ID, each observation would create a new
// series — confirm this cardinality is intended.
// Mysql cluster switching time consuming histogram
83+
MysqlClusterSwitchingTimeConsumingMs = haapm.NewHaHistogramWithBuckets(
84+
"mysql_cluster_switching_time_consuming_ms",
85+
"Time consuming of MySQL cluster switching in milliseconds",
86+
defaultLatencyBuckets,
87+
MetricLabelSwitchID, MetricLabelActionScope, MetricLabelDbType,
88+
)
89+
90+
// Mysql host switching time consuming histogram
91+
MysqlHostSwitchingTimeConsumingMs = haapm.NewHaHistogramWithBuckets(
92+
"mysql_host_switching_time_consuming_ms",
93+
"Time consuming of MySQL host switching in milliseconds",
94+
defaultLatencyBuckets,
95+
MetricLabelSwitchID, MetricLabelActionScope, MetricLabelDbType,
96+
)
97+
98+
// Mysql instance switching time consuming histogram
99+
MysqlInstanceSwitchingTimeConsumingMs = haapm.NewHaHistogramWithBuckets(
100+
"mysql_instance_switching_time_consuming_ms",
101+
"Time consuming of MySQL instance switching in milliseconds",
102+
defaultLatencyBuckets,
103+
MetricLabelSwitchID, MetricLabelActionScope, MetricLabelDbType,
104+
)
105+
106+
// Switching success total counter (created without any labels)
107+
SwitchingSuccessTotal = haapm.NewHaCounter("switching_success_total", "Total number of switching success")
33108

109+
// Switching error total counter (created without any labels)
110+
SwitchingErrorTotal = haapm.NewHaCounter("switching_error_total", "Total number of switching error")
111+
112+
// Mysql switching success total counter
113+
MysqlSwitchingSuccessTotal = haapm.NewHaCounter(
114+
"mysql_switching_success_total",
115+
"Total number of MySQL switching success",
116+
MetricLabelActionScope, MetricLabelDbType,
117+
)
118+
119+
// Mysql switching error total counter
120+
MysqlSwitchingErrorTotal = haapm.NewHaCounter(
121+
"mysql_switching_error_total",
122+
"Total number of MySQL switching error",
123+
MetricLabelActionScope, MetricLabelDbType,
124+
)
125+
126+
// Redis switching success total counter
127+
RedisSwitchingSuccessTotal = haapm.NewHaCounter(
128+
"redis_switching_success_total",
129+
"Total number of Redis switching success",
130+
MetricLabelActionScope, MetricLabelDbType,
131+
)
132+
133+
// Redis switching error total counter
134+
RedisSwitchingErrorTotal = haapm.NewHaCounter(
135+
"redis_switching_error_total",
136+
"Total number of Redis switching error",
137+
MetricLabelActionScope, MetricLabelDbType,
138+
)
139+
}
140+
141+
// InitAPM labels the application startup metric with serviceID and
// serviceName, then appends the startup metric and every metric built in
// init to the package-level Metrics slice.
//
// NOTE(review): every call appends unconditionally, so calling InitAPM more
// than once would leave duplicate entries in Metrics — confirm it is only
// invoked once per process.
34142
func InitAPM(serviceID, serviceName string) {
35143
haapm.AppStartupMetric.UpdateLabel(map[string]string{
36144
haapm.MetricLabelServiceID: serviceID,
37145
haapm.MetricLabelServiceName: serviceName,
38146
})
39147

40148
Metrics = append(Metrics, haapm.AppStartupMetric.ToMetric())
149+
150+
// Scan business total counter and time consuming histogram
151+
Metrics = append(Metrics, ScanBusinessTotal.ToMetric())
152+
Metrics = append(Metrics, ScanBusinessTimeConsumingMs.ToMetric())
153+
154+
// Switching success and error total counters
155+
Metrics = append(Metrics, SwitchingSuccessTotal.ToMetric())
156+
Metrics = append(Metrics, SwitchingErrorTotal.ToMetric())
157+
158+
// Switching time consuming histogram
159+
Metrics = append(Metrics, SwitchingTimeConsumingMs.ToMetric())
160+
161+
// Mysql cluster, host and instance switching time consuming histograms
162+
Metrics = append(Metrics, MysqlClusterSwitchingTimeConsumingMs.ToMetric())
163+
Metrics = append(Metrics, MysqlHostSwitchingTimeConsumingMs.ToMetric())
164+
Metrics = append(Metrics, MysqlInstanceSwitchingTimeConsumingMs.ToMetric())
165+
166+
// Mysql switching success and error total counters
167+
Metrics = append(Metrics, MysqlSwitchingSuccessTotal.ToMetric())
168+
Metrics = append(Metrics, MysqlSwitchingErrorTotal.ToMetric())
169+
170+
// Redis switching success and error total counters
171+
Metrics = append(Metrics, RedisSwitchingSuccessTotal.ToMetric())
172+
Metrics = append(Metrics, RedisSwitchingErrorTotal.ToMetric())
41173
}

dbm-services/common/dbha-v2/internal/analysis/switcher/mysql.go

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,12 +28,15 @@ import (
2828
"context"
2929
"strings"
3030
"sync"
31+
"time"
3132

33+
"dbm-services/common/dbha-v2/internal/analysis/apm"
3234
"dbm-services/common/dbha-v2/internal/analysis/dbm"
3335
"dbm-services/common/dbha-v2/internal/analysis/switcher/mysql"
3436
"dbm-services/common/dbha-v2/internal/analysis/switcher/switchcore"
3537
"dbm-services/common/dbha-v2/internal/analysis/switcher/switchlogger"
3638
"dbm-services/common/dbha-v2/pkg/gerrors"
39+
"dbm-services/common/dbha-v2/pkg/haapm"
3740
"dbm-services/common/dbha-v2/pkg/logger"
3841
"dbm-services/common/dbha-v2/pkg/storage/hamodel"
3942
"dbm-services/common/dbha-v2/pkg/storage/haprobe"
@@ -113,6 +116,8 @@ func (m *Mysql) NewSwitchLogger() ([]switchlogger.DbSwitchLogger, error) {
113116

114117
// InstanceLevelSwitch handles MySQL instance switching operations
115118
func (m *Mysql) InstanceLevelSwitch(ctx context.Context, switchLoggers []switchlogger.DbSwitchLogger, req *Request) *Response {
119+
start := time.Now()
120+
116121
rsp := &Response{
117122
MySqlFailureInsts: map[switchcore.MetadataKey]*dbm.DbInstMetadata{},
118123
}
@@ -169,6 +174,8 @@ func (m *Mysql) InstanceLevelSwitch(ctx context.Context, switchLoggers []switchl
169174

170175
wg.Wait()
171176

177+
m.reportMysqlSwitchingMetrics(apm.MysqlInstanceSwitchingTimeConsumingMs, start, req, rsp)
178+
172179
if len(rsp.MySqlFailureInsts) == 0 {
173180
return rsp
174181
}
@@ -253,6 +260,8 @@ func (m *Mysql) checkHostInstanceCompleteness(ctx context.Context, host switchco
253260

254261
// HostLevelSwitch handles MySQL host switching operations
255262
func (m *Mysql) HostLevelSwitch(ctx context.Context, switchLoggers []switchlogger.DbSwitchLogger, req *Request) *Response {
263+
start := time.Now()
264+
256265
rsp := &Response{
257266
MySqlFailureInsts: map[switchcore.MetadataKey]*dbm.DbInstMetadata{},
258267
}
@@ -318,13 +327,44 @@ func (m *Mysql) HostLevelSwitch(ctx context.Context, switchLoggers []switchlogge
318327

319328
wg.Wait()
320329

330+
m.reportMysqlSwitchingMetrics(apm.MysqlHostSwitchingTimeConsumingMs, start, req, rsp)
331+
321332
if len(rsp.MySqlFailureInsts) > 0 {
322333
rsp.Err = ErrSwitchPartialSuccess
323334
}
324335

325336
return rsp
326337
}
327338

339+
// reportMysqlSwitchingMetrics reports three metrics for one MySQL switch request:
//   - the milliseconds elapsed since start, observed on timeConsumingMetric
//     with the switch ID, action scope and db type as labels;
//   - the number of instances that did not fail (requested minus failed),
//     added to apm.MysqlSwitchingSuccessTotal;
//   - the number of failed instances, added to apm.MysqlSwitchingErrorTotal.
//
// A failure to update any metric is logged and otherwise ignored.
340+
func (m *Mysql) reportMysqlSwitchingMetrics(timeConsumingMetric *haapm.HaHistogram, start time.Time, req *Request, rsp *Response) {
341+
342+
// report the mysql switching time consuming
343+
if err := timeConsumingMetric.UpdateLabel(map[string]string{
344+
apm.MetricLabelSwitchID: req.SwitchID,
345+
apm.MetricLabelActionScope: string(req.ActionScope),
346+
apm.MetricLabelDbType: string(m.DbTypeName()),
347+
}).Observe(float64(time.Since(start).Milliseconds())); err != nil {
348+
logger.Error("failed to update mysql switching time consuming metric, errmsg: %s", err.Error())
349+
}
350+
351+
// report the mysql switching success total
// NOTE(review): the value is len(req.MySqlInstData) - len(rsp.MySqlFailureInsts);
// if the failure map could ever hold more entries than were requested this
// would be negative — confirm how HaCounter.Add handles that.
352+
if err := apm.MysqlSwitchingSuccessTotal.UpdateLabel(map[string]string{
353+
apm.MetricLabelActionScope: string(req.ActionScope),
354+
apm.MetricLabelDbType: string(m.DbTypeName()),
355+
}).Add(float64(len(req.MySqlInstData) - len(rsp.MySqlFailureInsts))); err != nil {
356+
logger.Error("failed to update mysql switching success total metric, errmsg: %s", err.Error())
357+
}
358+
359+
// report the mysql switching error total
360+
if err := apm.MysqlSwitchingErrorTotal.UpdateLabel(map[string]string{
361+
apm.MetricLabelActionScope: string(req.ActionScope),
362+
apm.MetricLabelDbType: string(m.DbTypeName()),
363+
}).Add(float64(len(rsp.MySqlFailureInsts))); err != nil {
364+
logger.Error("failed to update mysql switching error total metric, errmsg: %s", err.Error())
365+
}
366+
}
367+
328368
// Switch handles MySQL switching operations on different levels
329369
func (m *Mysql) Switch(ctx context.Context, req *Request) *Response {
330370
rsp := &Response{

dbm-services/common/dbha-v2/internal/analysis/workflow/switch_flow.go

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ import (
3030
"sort"
3131
"time"
3232

33+
"dbm-services/common/dbha-v2/internal/analysis/apm"
3334
"dbm-services/common/dbha-v2/internal/analysis/config"
3435
"dbm-services/common/dbha-v2/internal/analysis/dbm"
3536
"dbm-services/common/dbha-v2/internal/analysis/storage"
@@ -168,11 +169,30 @@ func (e *SwitchExecutor) TriggerSwitching(dbType haprobe.DbType, req *switcher.R
168169
return
169170
}
170171

172+
start := time.Now()
173+
171174
rsp := sw.Switch(context.Background(), req)
172175
if rsp.Err == nil {
173176
logger.Info("switching success for the database type: %s", dbType)
174177
}
175178

179+
// report the switching time consuming
180+
if err := apm.SwitchingTimeConsumingMs.UpdateLabel(map[string]string{
181+
apm.MetricLabelDbType: dbType.String(),
182+
}).Observe(float64(time.Since(start).Milliseconds())); err != nil {
183+
logger.Warn("failed to update switching time consuming metric, errmsg: %s", err)
184+
}
185+
186+
// report the switching success total
187+
if err := apm.SwitchingSuccessTotal.Add(float64(len(req.MySqlInstData) - len(rsp.MySqlFailureInsts))); err != nil {
188+
logger.Error("failed to update switching success total metric: %s", err.Error())
189+
}
190+
191+
// report the switching error total
192+
if err := apm.SwitchingErrorTotal.Add(float64(len(rsp.MySqlFailureInsts))); err != nil {
193+
logger.Error("failed to update switching error total metric: %s", err.Error())
194+
}
195+
176196
// post the success alarm
177197
for _, inst := range req.GetDbInstMetadata() {
178198
instKey := switchcore.GenerateMetadataKey(inst.BkCloudID, inst.IP, inst.Port)

dbm-services/common/dbha-v2/internal/analysis/workflow/workflow.go

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,13 @@ import (
3131
"sync"
3232
"time"
3333

34+
"dbm-services/common/dbha-v2/internal/analysis/apm"
3435
"dbm-services/common/dbha-v2/internal/analysis/config"
3536
"dbm-services/common/dbha-v2/internal/analysis/storage"
3637
"dbm-services/common/dbha-v2/internal/analysis/switcher"
3738
"dbm-services/common/dbha-v2/pkg/discovery"
3839
"dbm-services/common/dbha-v2/pkg/gerrors"
40+
"dbm-services/common/dbha-v2/pkg/haapm"
3941
"dbm-services/common/dbha-v2/pkg/logger"
4042
"dbm-services/common/dbha-v2/pkg/storage/hamysql"
4143
"dbm-services/common/dbha-v2/pkg/storage/haprobe"
@@ -210,6 +212,8 @@ func (w *Workflow) CheckBusinessWithBizID(ctx context.Context, bizId int) error
210212
// ScanBusinesses fetches business IDs, filters by instance sharding,
211213
// and runs CheckBusinessWithBizID for each (with concurrency limit).
212214
func (w *Workflow) ScanBusinesses(ctx context.Context) {
215+
start := time.Now()
216+
213217
bizIDs, err := w.hadata.GetBizIDs()
214218
if err != nil {
215219
logger.Warn("failed to get business IDs, errmsg: %s", err)
@@ -240,6 +244,22 @@ func (w *Workflow) ScanBusinesses(ctx context.Context) {
240244
}
241245

242246
wg.Wait()
247+
248+
// report the scan business time consuming
249+
if err := apm.ScanBusinessTimeConsumingMs.UpdateLabel(map[string]string{
250+
haapm.MetricLabelServiceID: w.myServiceID,
251+
haapm.MetricLabelServiceName: "analysis",
252+
}).Observe(float64(time.Since(start).Milliseconds())); err != nil {
253+
logger.Warn("failed to report the scan business time consuming, errmsg: %s", err)
254+
}
255+
256+
// report the scan business total
257+
if err := apm.ScanBusinessTotal.UpdateLabel(map[string]string{
258+
haapm.MetricLabelServiceID: w.myServiceID,
259+
haapm.MetricLabelServiceName: "analysis",
260+
}).Add(float64(len(assigned))); err != nil {
261+
logger.Warn("failed to report the scan business total, errmsg: %s", err)
262+
}
243263
}
244264

245265
// instanceKey builds a unique instance identifier from cloud id, IP and port.

0 commit comments

Comments
 (0)