Skip to content

Commit 42279fe

Browse files
yksitu authored and iSecloud committed
fix(sqlserver): slave重建单据增加告警屏蔽逻辑 #16278
1 parent 5b6785f commit 42279fe

File tree

5 files changed

+174
-12
lines changed

5 files changed

+174
-12
lines changed

dbm-ui/backend/db_monitor/constants.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -614,3 +614,9 @@ class TimeUnit:
614614
MINUTE = SECOND * 60
615615
HOUR = MINUTE * 60
616616
DAY = HOUR * 24
617+
618+
619+
# Alarm shield categories accepted when creating a monitoring shield rule.
class MonitorShieldType(StrStructuredEnum):
    # Shield by dimension conditions only.
    DIMENSION = EnumField("dimension", _("基于维度屏蔽"))
    # Shield specific alarm strategies (policies).
    STRATEGY = EnumField("strategy", _("基于策略屏蔽"))

dbm-ui/backend/flow/engine/bamboo/scene/sqlserver/sqlserver_slave_rebuild.py

Lines changed: 57 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,8 @@
1919
from backend.db_meta.enums import ClusterEntryType, ClusterType, InstanceRole
2020
from backend.db_meta.models import Cluster, StorageInstance
2121
from backend.db_meta.models.storage_set_dtl import SqlserverClusterSyncMode
22+
from backend.db_monitor.constants import MonitorShieldType
23+
from backend.db_monitor.models import MonitorPolicy
2224
from backend.flow.consts import SqlserverCleanMode, SqlserverLoginExecMode, SqlserverSyncMode, SqlserverSyncModeMaps
2325
from backend.flow.engine.bamboo.scene.common.builder import Builder, Conditions, SubBuilder
2426
from backend.flow.engine.bamboo.scene.common.get_file_list import GetFileList
@@ -32,7 +34,9 @@
3234
sync_dbs_for_cluster_sub_flow,
3335
)
3436
from backend.flow.engine.bamboo.scene.sqlserver.sqlserver_add_slave import SqlserverAddSlaveFlow
37+
from backend.flow.plugins.components.collections.common.add_alarm_shield import AddAlarmShieldComponent
3538
from backend.flow.plugins.components.collections.common.delete_cc_service_instance import DelCCServiceInstComponent
39+
from backend.flow.plugins.components.collections.common.disable_alarm_shield import DisableAlarmShieldComponent
3640
from backend.flow.plugins.components.collections.common.pause import PauseComponent
3741
from backend.flow.plugins.components.collections.mysql.dns_manage import MySQLDnsManageComponent
3842
from backend.flow.plugins.components.collections.sqlserver.check_slave_sync_status import CheckSlaveSyncStatusComponent
@@ -56,7 +60,6 @@
5660
DropRandomJobUserKwargs,
5761
ExecActuatorKwargs,
5862
ExecLoginKwargs,
59-
SqlserverBackupIDContext,
6063
SqlserverRebuildSlaveContext,
6164
)
6265
from backend.flow.utils.sqlserver.sqlserver_act_payload import SqlserverActPayload
@@ -130,6 +133,28 @@ def slave_rebuild_in_local_flow(self):
130133
),
131134
),
132135
)
136+
sub_pipeline.add_act(
137+
act_name=_("屏蔽镜像缺失告警策略一天"),
138+
act_component_code=AddAlarmShieldComponent.code,
139+
kwargs=asdict(
140+
AddAlarmShieldComponent.kwargs(
141+
description=("执行集群原地重建单据,单据号:{}".format(self.data.get("uid"))),
142+
duration_seconds=86400,
143+
category=MonitorShieldType.STRATEGY,
144+
strategy_id=[
145+
i.monitor_policy_id
146+
for i in MonitorPolicy.objects.filter(
147+
name__in=[_("Sqlserver-数据库镜像缺失【mirroring】"), _("Sqlserver-数据库镜像缺失【Alwayson】")]
148+
)
149+
],
150+
level=[1, 2, 3],
151+
dimensions=[
152+
{"name": "cluster_domain", "values": [cluster.immute_domain]},
153+
],
154+
)
155+
),
156+
)
157+
133158
source_act = sub_pipeline.add_act(
134159
act_name=_("检测带重建slave状态[{}]".format(rebuild_slave.ip_port)),
135160
act_component_code=CheckSlaveSyncStatusComponent.code,
@@ -193,6 +218,10 @@ def slave_rebuild_in_local_flow(self):
193218
conditions_param=SqlserverRebuildSlaveContext.conditions_var_name(),
194219
)
195220

221+
sub_pipeline.add_act(
222+
act_name=_("15 分钟后解除旧实例告警屏蔽"), act_component_code=DisableAlarmShieldComponent.code, kwargs={}
223+
)
224+
196225
# 先做克隆周边配置
197226
sub_pipeline.add_sub_pipeline(
198227
sub_flow=clone_configs_sub_flow(
@@ -406,6 +435,28 @@ def slave_rebuild_in_new_slave_flow(self):
406435
- set(get_sync_filter_dbs(cluster.id))
407436
)
408437
if len(sync_dbs) > 0:
438+
cluster_sub_pipeline.add_act(
439+
act_name=_("屏蔽镜像缺失告警策略一天"),
440+
act_component_code=AddAlarmShieldComponent.code,
441+
kwargs=asdict(
442+
AddAlarmShieldComponent.kwargs(
443+
description=_("执行集群新机重建单据,单据号:{}".format(self.data.get("uid"))),
444+
duration_seconds=86400,
445+
category=MonitorShieldType.STRATEGY,
446+
strategy_id=[
447+
i.monitor_policy_id
448+
for i in MonitorPolicy.objects.filter(
449+
name__in=[_("Sqlserver-数据库镜像缺失【mirroring】"), _("Sqlserver-数据库镜像缺失【Alwayson】")]
450+
)
451+
],
452+
level=[1, 2, 3],
453+
dimensions=[
454+
{"name": "cluster_domain", "values": [cluster.immute_domain]},
455+
],
456+
)
457+
),
458+
)
459+
409460
cluster_sub_pipeline.add_sub_pipeline(
410461
sub_flow=sync_dbs_for_cluster_sub_flow(
411462
uid=self.data["uid"],
@@ -418,6 +469,10 @@ def slave_rebuild_in_new_slave_flow(self):
418469
)
419470
)
420471

472+
cluster_sub_pipeline.add_act(
473+
act_name=_("15 分钟后解除旧实例告警屏蔽"), act_component_code=DisableAlarmShieldComponent.code, kwargs={}
474+
)
475+
421476
# 先做克隆周边配置
422477
cluster_sub_pipeline.add_sub_pipeline(
423478
sub_flow=clone_configs_sub_flow(
@@ -602,7 +657,7 @@ def slave_rebuild_in_new_slave_flow(self):
602657
# main_pipeline.run_pipeline(init_trans_data_class=SqlserverBackupIDContext())
603658
main_pipeline.run_pipeline_with_sidecar(
604659
check_ai_monitor_cluster_list=sum([info["cluster_ids"] for info in self.data["infos"]], []),
605-
init_trans_data_class=SqlserverBackupIDContext(),
660+
init_trans_data_class=SqlserverRebuildSlaveContext(),
606661
)
607662

608663
@classmethod

dbm-ui/backend/flow/plugins/components/collections/common/add_alarm_shield.py

Lines changed: 105 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -10,56 +10,145 @@
1010
"""
1111
import datetime
1212
import json
13+
from dataclasses import dataclass, field
14+
from typing import List, Optional
1315

1416
from pipeline.component_framework.component import Component
1517

1618
from backend import env
1719
from backend.components.bkmonitorv3.client import BKMonitorV3Api
20+
from backend.db_monitor.constants import MonitorShieldType
21+
from backend.db_monitor.utils import format_shield_description
1822
from backend.flow.plugins.components.collections.common.base_service import BaseService
23+
from backend.flow.utils.base.validate_handler import ValidateHandler, validate_list, validate_string
1924

2025
# logger = logging.getLogger("flow")
2126

2227

28+
@dataclass()
class AddAlarmShieldKwargs(ValidateHandler):
    """
    Private-variable structure for the add-alarm-shield activity node
    (AddAlarmShieldService).

    Time window (choose one; the service raises if neither is usable):
        - Option 1: pass ``duration_seconds``; the service derives ``begin_time`` /
          ``end_time`` starting from the current time.
        - Option 2: pass ``begin_time`` and ``end_time`` explicitly, formatted as
          "YYYY-MM-DD HH:MM:SS".

    NOTE(review): flows build the node inputs with ``asdict()`` on this dataclass,
    so the ``duration_seconds`` key is always present (``None`` when unset), while
    the service selects the time mode with ``"duration_seconds" in kwargs``.
    Passing only ``begin_time`` / ``end_time`` therefore appears to reach
    ``int(None)`` -- confirm, and consider switching that check to ``is not None``.

    Shield category (``category``):
        - "dimension" (default): shield by the appid plus the custom dimension
          conditions only.
        - "strategy": shield specific strategies; ``strategy_id`` and ``level``
          must also be provided.

    Examples:
        # Dimension-based shield
        kwargs = AddAlarmShieldKwargs(
            description="shield 1",
            dimensions=[{"name": "instance_host", "values": ["xxx"]}],
            duration_seconds=7200,
        )

        # Strategy-based shield (strategy_id is a list of strategy IDs)
        kwargs = AddAlarmShieldKwargs(
            description="shield 2",
            dimensions=[{"name": "instance", "values": ["xxx"]}],
            duration_seconds=3600,
            category="strategy",
            strategy_id=[12345],
            level=[1, 2],
        )
    """

    # Human-readable description stored on the shield rule.
    description: str = field(metadata={"validate": validate_string})
    # Extra dimension conditions; each item carries "name" and "values".
    dimensions: List[dict] = field(default_factory=list, metadata={"validate": validate_list})
    # Shield duration in seconds, counted from the moment the node executes.
    duration_seconds: Optional[int] = None
    # Explicit shield window, used when duration_seconds is not given.
    begin_time: Optional[str] = None
    end_time: Optional[str] = None
    # Shield category: "dimension" or "strategy".
    category: str = "dimension"
    # Strategy IDs to shield; only read when category is "strategy".
    strategy_id: Optional[List[int]] = None
    # Alarm levels to shield; only read when category is "strategy".
    level: Optional[List[int]] = None
68+
69+
2370
class AddAlarmShieldService(BaseService):
2471
"""
25-
输出上下文 alarm_shield_id : int
72+
添加告警屏蔽服务节点。
73+
在流程编排中作为一个原子节点,用于在执行数据库变更操作前对监控告警进行屏蔽,
74+
避免变更过程中产生的预期告警干扰运维人员。
75+
76+
输出上下文:
77+
alarm_shield_id (int): 蓝鲸监控返回的屏蔽规则 ID,后续可用于解除屏蔽(见 DisableAlarmShieldService)
78+
79+
屏蔽类型(category):
80+
- "dimension"(默认): 基于维度屏蔽,仅根据 appid + 自定义维度条件进行屏蔽
81+
- "strategy": 基于策略屏蔽,需额外指定 strategy_id(策略ID)和 level(告警等级)
82+
83+
kwargs 入参说明:
84+
- duration_seconds (int, 可选): 屏蔽持续时间(秒),传入后自动计算 begin_time / end_time
85+
- begin_time (str, 可选): 屏蔽开始时间,格式 "YYYY-MM-DD HH:MM:SS"
86+
- end_time (str, 可选): 屏蔽结束时间,格式 "YYYY-MM-DD HH:MM:SS"
87+
- description (str): 屏蔽描述信息
88+
- category (str, 可选): 屏蔽类型,默认为 "dimension"
89+
- dimensions (list[dict]): 屏蔽维度列表,每个元素包含 name 和 values
90+
- strategy_id (int, 仅 category="strategy" 时必传): 策略 ID
91+
- level (list, 仅 category="strategy" 时必传): 告警等级
92+
93+
注意: duration_seconds 与 (begin_time + end_time) 二选一,均不传则抛出异常。
2694
"""
2795

2896
def _execute(self, data, parent_data):
2997
kwargs = data.get_one_of_inputs("kwargs")
3098
trans_data = data.get_one_of_inputs("trans_data")
3199
global_data = data.get_one_of_inputs("global_data")
32100

101+
# ============ 第一步:确定屏蔽的起止时间 ============
102+
# 优先使用 duration_seconds(持续时长),自动计算起止时间
33103
if "duration_seconds" in kwargs:
34104
begin_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
35105
end_time = (
36106
datetime.datetime.now() + datetime.timedelta(seconds=int(kwargs["duration_seconds"]))
37107
).strftime("%Y-%m-%d %H:%M:%S")
38108
else:
109+
# 其次使用显式指定的 begin_time 和 end_time
39110
if "begin_time" in kwargs and "end_time" in kwargs:
40111
begin_time = kwargs["begin_time"]
41112
end_time = kwargs["end_time"]
42113
else:
114+
# 两种方式都未提供,抛出异常
43115
raise Exception("add alarm shield missing args")
44116

45117
bk_biz_id = global_data["bk_biz_id"]
118+
# 屏蔽类型,默认为基于维度屏蔽
119+
category = kwargs.get("category", "dimension")
46120

121+
# ============ 第二步:构造蓝鲸监控屏蔽 API 的请求参数 ============
47122
shield_param = {
48-
"category": "dimension",
123+
"category": category,
49124
"begin_time": begin_time,
50125
"end_time": end_time,
126+
# 注意:这里使用的是 DBA 平台的业务 ID(env.DBA_APP_BK_BIZ_ID),而非用户业务 ID
127+
# 因为告警策略是在 DBA 平台业务下统一管理的
51128
"bk_biz_id": env.DBA_APP_BK_BIZ_ID,
129+
# cycle_config.type=1 表示"一次性"屏蔽(不循环)
52130
"cycle_config": {"begin_time": "", "end_time": "", "day_list": [], "week_list": [], "type": 1},
131+
# 屏蔽期间不发送通知
53132
"shield_notice": False,
54133
"notice_config": {},
55134
"description": kwargs["description"],
135+
# 维度配置:默认以 appid(用户业务ID)作为基础维度条件
56136
"dimension_config": {
57137
"dimension_conditions": [
58138
{"condition": "and", "key": "appid", "method": "eq", "value": [f"{bk_biz_id}"], "name": "appid"},
59139
]
60140
},
61141
}
62142

143+
# ============ 第三步:根据屏蔽类型补充策略相关参数 ============
144+
# 判断这次屏蔽操作基于什么类型操作
145+
if category == MonitorShieldType.STRATEGY.value:
146+
# 如果基于策略屏蔽,则需要传入策略id, 且必须传入屏蔽等级
147+
shield_param["dimension_config"]["id"] = kwargs["strategy_id"]
148+
shield_param["dimension_config"]["level"] = kwargs["level"]
149+
150+
# ============ 第四步:追加用户自定义的维度条件 ============
151+
# dimensions 示例: [{"name": "instance", "values": ["127.0.0.1:3306"]}]
63152
dimensions = kwargs["dimensions"]
64153
for dim in dimensions:
65154
shield_param["dimension_config"]["dimension_conditions"].append(
@@ -72,25 +161,31 @@ def _execute(self, data, parent_data):
72161
}
73162
)
74163

164+
# ============ 第五步:格式化描述信息并调用蓝鲸监控 API 创建屏蔽 ============
165+
# 在描述前添加 [dbm:appid=xxx] 前缀,便于在监控平台中追踪和识别
75166
shield_param.update(
76-
{"description": self.format_shield_description(bk_biz_id, description=shield_param["description"])}
167+
{"description": format_shield_description(bk_biz_id, description=shield_param["description"])}
77168
)
78169
self.log_info("alarm shield param: {}".format(json.dumps(shield_param)))
170+
# 调用蓝鲸监控 V3 API 创建告警屏蔽规则
79171
res = BKMonitorV3Api.add_shield(shield_param)
80172
self.log_info("alarm shield {} created".format(res))
173+
174+
# ============ 第六步:将屏蔽 ID 写入上下文,供下游节点使用 ============
175+
# 典型用途:DisableAlarmShieldService 会读取此 ID 来解除屏蔽
81176
trans_data.alarm_shield_id = res["id"]
82177
data.outputs["trans_data"] = trans_data
83178
return True
84179

85-
@staticmethod
86-
def format_shield_description(bk_biz_id, description=""):
87-
prefix = f"[dbm:appid={bk_biz_id}]"
88-
# 先删后补,避免出现多个前缀
89-
description = description.replace(prefix, "").strip()
90-
return f"{prefix}{description}"
91-
92180

93181
class AddAlarmShieldComponent(Component):
    """
    Pipeline component that exposes AddAlarmShieldService as a schedulable node.

    Flows reference the node through ``AddAlarmShieldComponent.code`` and build its
    inputs with ``AddAlarmShieldComponent.kwargs`` (the AddAlarmShieldKwargs dataclass).
    """

    name = __name__
    code = "add_alarm_shield"
    bound_service = AddAlarmShieldService
    kwargs = AddAlarmShieldKwargs

dbm-ui/backend/flow/plugins/components/collections/common/disable_alarm_shield.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515

1616
from backend import env
1717
from backend.components import BKMonitorV3Api
18+
from backend.db_monitor.constants import MonitorShieldType
1819
from backend.flow.plugins.components.collections.common.base_service import BaseService
1920

2021

@@ -36,6 +37,10 @@ def _execute(self, data, parent_data):
3637
"notice_config": detail["notice_config"],
3738
"id": shield_id,
3839
}
40+
# 支持策略维度的调整
41+
if detail["category"] == MonitorShieldType.STRATEGY.value:
42+
edit_param["level"] = detail["dimension_config"]["level"]
43+
3944
BKMonitorV3Api.edit_shield(edit_param)
4045

4146
return True

dbm-ui/backend/flow/utils/sqlserver/sqlserver_act_dataclass.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -297,6 +297,7 @@ class SqlserverRebuildSlaveContext:
297297
clean_dbs: list = field(default_factory=list)
298298
full_backup_id: dict = field(default_factory=dict)
299299
log_backup_id: dict = field(default_factory=dict)
300+
alarm_shield_id: int = None
300301

301302
@staticmethod
302303
def sync_dbs_var_name() -> str:

0 commit comments

Comments
 (0)