Skip to content

Commit 271763e

Browse files
committed
feat(prometheus): 实现主动拉取告警的webhook服务
添加 prometheus_adapter.yml 配置文件支持；重构 alert_service，使用配置而非环境变量；新增 alert_webhook_service，实现告警轮询推送；更新 build.sh 和 deploy.sh，支持配置文件部署；更新 README 文档，说明新的 webhook 架构。
1 parent 40d41e8 commit 271763e

File tree

10 files changed

+665
-73
lines changed

10 files changed

+665
-73
lines changed

docs/prometheus_adapter/README.md

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -221,30 +221,51 @@ internal/prometheus_adapter/
221221
- 更新元信息时,系统根据 `alert_name` + `labels` 查找并更新对应的元信息
222222
- **缓存机制**:系统在内存中缓存当前的规则和元信息,支持快速增量更新
223223

224-
## Alertmanager 集成
224+
## 告警接收 Webhook
225225

226-
- 目标:将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块
227-
- `alertmanager.yml` 配置示例
228-
```yaml
229-
global:
230-
resolve_timeout: 5m
226+
- 目标:实现自定义 webhook 服务,主动从 Prometheus 拉取告警并转发到监控告警模块
227+
- 实现方式
228+
- 通过 Prometheus Alerts API 获取告警
229+
- 定期轮询 Prometheus 的 `/api/v1/alerts` 端点
230+
- 将获取的告警格式化后 POST 到监控告警模块
231231

232-
route:
233-
group_by: ['alertname', 'cluster', 'service']
234-
group_wait: 10s
235-
group_interval: 10s
236-
repeat_interval: 1h
237-
receiver: 'zeroops-alert-webhook'
238-
239-
receivers:
240-
- name: 'zeroops-alert-webhook'
241-
webhook_configs:
242-
- url: 'http://alert-module:8080/v1/integrations/alertmanager/webhook'
243-
send_resolved: true
232+
### Webhook 服务架构
233+
```
234+
┌─────────────────┐
235+
│ Prometheus │
236+
│ (告警规则引擎) │
237+
└────────┬────────┘
238+
│ Pull (轮询)
239+
│ GET /api/v1/alerts
240+
241+
┌─────────────────┐
242+
│ Alert Webhook │
243+
│ (自定义服务) │
244+
└────────┬────────┘
245+
│ Push
246+
│ POST /v1/integrations/prometheus/alerts
247+
248+
┌─────────────────┐
249+
│ 监控告警模块 │
250+
│ (告警处理中心) │
251+
└─────────────────┘
244252
```
245-
- 说明:
246-
- `url`:监控告警模块的 webhook 地址(按实际部署修改主机与端口)
247-
- `send_resolved`:为 `true` 时,告警恢复也会通知
253+
254+
### 实现细节
255+
- **轮询机制**
256+
- 默认每 10 秒从 Prometheus 拉取一次活跃告警（间隔可通过配置项 `polling_interval` 调整）
257+
- 通过 `GET http://prometheus:9090/api/v1/alerts` 获取告警列表
258+
- 维护告警状态缓存,避免重复推送
259+
260+
- **告警格式转换**
261+
- 将 Prometheus 告警格式转换为监控告警模块所需格式
262+
- 包含告警名称、标签、严重程度、开始时间等信息
263+
- 支持告警恢复状态通知
264+
265+
- **推送目标**
266+
- URL: `http://alert-module:8080/v1/integrations/prometheus/alerts`
267+
- Method: POST
268+
- Content-Type: application/json
248269

249270
## 支持的服务
250271

internal/prometheus_adapter/client/prometheus_client.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,24 @@ package client
22

33
import (
44
"context"
5+
"encoding/json"
56
"fmt"
7+
"io"
8+
"net/http"
69
"time"
710

811
"github.com/prometheus/client_golang/api"
912
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
1013
promModel "github.com/prometheus/common/model"
1114
"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
15+
"github.com/rs/zerolog/log"
1216
)
1317

1418
// PrometheusClient Prometheus 客户端
1519
type PrometheusClient struct {
16-
api v1.API
20+
api v1.API
21+
httpClient *http.Client
22+
baseURL string
1723
}
1824

1925
// NewPrometheusClient 创建新的 Prometheus 客户端
@@ -26,7 +32,9 @@ func NewPrometheusClient(address string) (*PrometheusClient, error) {
2632
}
2733

2834
return &PrometheusClient{
29-
api: v1.NewAPI(client),
35+
api: v1.NewAPI(client),
36+
httpClient: &http.Client{Timeout: 10 * time.Second},
37+
baseURL: address,
3038
}, nil
3139
}
3240

@@ -142,3 +150,35 @@ func BuildQuery(service, metric, version string) string {
142150
query += "}"
143151
return query
144152
}
153+
154+
// GetAlerts 获取 Prometheus 当前的告警
155+
func (c *PrometheusClient) GetAlerts(ctx context.Context) (*model.PrometheusAlertsResponse, error) {
156+
url := fmt.Sprintf("%s/api/v1/alerts", c.baseURL)
157+
158+
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
159+
if err != nil {
160+
return nil, fmt.Errorf("failed to create request: %w", err)
161+
}
162+
163+
resp, err := c.httpClient.Do(req)
164+
if err != nil {
165+
return nil, fmt.Errorf("failed to query alerts: %w", err)
166+
}
167+
defer resp.Body.Close()
168+
169+
if resp.StatusCode != http.StatusOK {
170+
body, _ := io.ReadAll(resp.Body)
171+
return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body))
172+
}
173+
174+
var alertsResp model.PrometheusAlertsResponse
175+
if err := json.NewDecoder(resp.Body).Decode(&alertsResp); err != nil {
176+
return nil, fmt.Errorf("failed to decode response: %w", err)
177+
}
178+
179+
log.Debug().
180+
Int("alert_count", len(alertsResp.Data.Alerts)).
181+
Msg("Retrieved alerts from Prometheus")
182+
183+
return &alertsResp, nil
184+
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
package config
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"time"
7+
8+
"github.com/rs/zerolog/log"
9+
"gopkg.in/yaml.v3"
10+
)
11+
12+
// PrometheusAdapterConfig is the root configuration of the Prometheus adapter.
// Each field maps to one top-level section of the YAML configuration file.
13+
type PrometheusAdapterConfig struct {
14+
Prometheus PrometheusConfig `yaml:"prometheus"`
15+
AlertWebhook AlertWebhookConfig `yaml:"alert_webhook"`
16+
AlertRules AlertRulesConfig `yaml:"alert_rules"`
17+
Server ServerConfig `yaml:"server"`
18+
}
19+
20+
// PrometheusConfig holds the settings of the Prometheus service.
21+
type PrometheusConfig struct {
22+
Address string `yaml:"address"` // Prometheus address
23+
ContainerName string `yaml:"container_name"` // Container name
24+
}
25+
26+
// AlertWebhookConfig holds the settings of the alert webhook.
27+
type AlertWebhookConfig struct {
28+
URL string `yaml:"url"` // Webhook URL
29+
PollingInterval string `yaml:"polling_interval"` // Polling interval, kept as a duration string (parsed with time.ParseDuration)
30+
}
31+
32+
// AlertRulesConfig holds the settings for alert rule files.
33+
type AlertRulesConfig struct {
34+
LocalFile string `yaml:"local_file"` // Local rules file
35+
PrometheusRulesDir string `yaml:"prometheus_rules_dir"` // Prometheus rules directory
36+
}
37+
38+
// ServerConfig holds the settings of the server.
39+
type ServerConfig struct {
40+
BindAddr string `yaml:"bind_addr"` // Listen address
41+
}
42+
43+
// LoadConfig 加载配置文件
44+
func LoadConfig(configPath string) (*PrometheusAdapterConfig, error) {
45+
// 如果没有指定配置文件,使用默认路径
46+
if configPath == "" {
47+
configPath = "internal/prometheus_adapter/config/prometheus_adapter.yml"
48+
}
49+
50+
// 读取配置文件
51+
data, err := os.ReadFile(configPath)
52+
if err != nil {
53+
// 如果文件不存在,返回默认配置
54+
if os.IsNotExist(err) {
55+
log.Warn().Msg("Config file not found, using default configuration")
56+
return getDefaultConfig(), nil
57+
}
58+
return nil, fmt.Errorf("failed to read config file: %w", err)
59+
}
60+
61+
// 解析配置
62+
var config PrometheusAdapterConfig
63+
if err := yaml.Unmarshal(data, &config); err != nil {
64+
return nil, fmt.Errorf("failed to parse config file: %w", err)
65+
}
66+
67+
// 应用环境变量覆盖
68+
applyEnvOverrides(&config)
69+
70+
// 验证配置
71+
if err := validateConfig(&config); err != nil {
72+
return nil, fmt.Errorf("invalid configuration: %w", err)
73+
}
74+
75+
log.Info().
76+
Str("config_file", configPath).
77+
Msg("Configuration loaded successfully")
78+
79+
return &config, nil
80+
}
81+
82+
// getDefaultConfig 获取默认配置
83+
func getDefaultConfig() *PrometheusAdapterConfig {
84+
return &PrometheusAdapterConfig{
85+
Prometheus: PrometheusConfig{
86+
Address: "http://10.210.10.33:9090",
87+
ContainerName: "mock-s3-prometheus",
88+
},
89+
AlertWebhook: AlertWebhookConfig{
90+
URL: "http://alert-module:8080/v1/integrations/prometheus/alerts",
91+
PollingInterval: "10s",
92+
},
93+
AlertRules: AlertRulesConfig{
94+
LocalFile: "../rules/alert_rules.yml",
95+
PrometheusRulesDir: "/etc/prometheus/rules/",
96+
},
97+
Server: ServerConfig{
98+
BindAddr: "0.0.0.0:9999",
99+
},
100+
}
101+
}
102+
103+
// applyEnvOverrides 应用环境变量覆盖
104+
func applyEnvOverrides(config *PrometheusAdapterConfig) {
105+
// Prometheus 配置
106+
if addr := os.Getenv("PROMETHEUS_ADDRESS"); addr != "" {
107+
config.Prometheus.Address = addr
108+
}
109+
if container := os.Getenv("PROMETHEUS_CONTAINER"); container != "" {
110+
config.Prometheus.ContainerName = container
111+
}
112+
113+
// Alert Webhook 配置
114+
if url := os.Getenv("ALERT_WEBHOOK_URL"); url != "" {
115+
config.AlertWebhook.URL = url
116+
}
117+
if interval := os.Getenv("ALERT_POLLING_INTERVAL"); interval != "" {
118+
config.AlertWebhook.PollingInterval = interval
119+
}
120+
121+
// Server 配置
122+
if bindAddr := os.Getenv("SERVER_BIND_ADDR"); bindAddr != "" {
123+
config.Server.BindAddr = bindAddr
124+
}
125+
}
126+
127+
// validateConfig 验证配置
128+
func validateConfig(config *PrometheusAdapterConfig) error {
129+
// 验证 Prometheus 地址
130+
if config.Prometheus.Address == "" {
131+
return fmt.Errorf("prometheus address is required")
132+
}
133+
134+
// 验证轮询间隔
135+
if config.AlertWebhook.PollingInterval != "" {
136+
if _, err := time.ParseDuration(config.AlertWebhook.PollingInterval); err != nil {
137+
return fmt.Errorf("invalid polling interval: %w", err)
138+
}
139+
}
140+
141+
// 验证服务器地址
142+
if config.Server.BindAddr == "" {
143+
return fmt.Errorf("server bind address is required")
144+
}
145+
146+
return nil
147+
}
148+
149+
// GetPollingInterval 获取轮询间隔的 Duration
150+
func (c *AlertWebhookConfig) GetPollingInterval() time.Duration {
151+
if c.PollingInterval == "" {
152+
return 10 * time.Second
153+
}
154+
155+
duration, err := time.ParseDuration(c.PollingInterval)
156+
if err != nil {
157+
log.Warn().
158+
Err(err).
159+
Str("interval", c.PollingInterval).
160+
Msg("Invalid polling interval, using default")
161+
return 10 * time.Second
162+
}
163+
164+
return duration
165+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Prometheus Adapter 配置文件
2+
3+
# Prometheus 服务配置
4+
prometheus:
5+
# Prometheus 服务地址
6+
address: "http://10.210.10.33:9090"
7+
# 容器名称(用于规则同步)
8+
container_name: "mock-s3-prometheus"
9+
10+
# 告警 Webhook 服务配置
11+
alert_webhook:
12+
# 监控告警模块地址
13+
url: "http://alert-module:8080/v1/integrations/prometheus/alerts"
14+
# 轮询间隔
15+
polling_interval: "10s"
16+
17+
# 告警规则管理配置
18+
alert_rules:
19+
# 本地规则文件路径
20+
local_file: "../rules/alert_rules.yml"
21+
# Prometheus 规则目录
22+
prometheus_rules_dir: "/etc/prometheus/rules/"
23+
24+
# 服务器配置
25+
server:
26+
# 服务监听地址
27+
bind_addr: "0.0.0.0:9999"
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package model
2+
3+
import (
4+
"time"
5+
)
6+
7+
// PrometheusAlert is a single alert entry in the Prometheus alerts API
// response.
8+
type PrometheusAlert struct {
9+
Labels map[string]string `json:"labels"`
10+
Annotations map[string]string `json:"annotations"`
11+
State string `json:"state"` // pending, firing
12+
ActiveAt time.Time `json:"activeAt"`
13+
Value string `json:"value"` // Value at the time the alert was triggered
14+
}
15+
16+
// PrometheusAlertsResponse is the response body of the Prometheus
// /api/v1/alerts endpoint: a status string plus the list of alerts under
// data.alerts.
17+
type PrometheusAlertsResponse struct {
18+
Status string `json:"status"`
19+
Data struct {
20+
Alerts []PrometheusAlert `json:"alerts"`
21+
} `json:"data"`
22+
}
23+
24+
// AlertmanagerWebhookAlert is a single alert inside a webhook request.
25+
type AlertmanagerWebhookAlert struct {
26+
Status string `json:"status"` // "firing" or "resolved"
27+
Labels map[string]string `json:"labels"` // Contains alertname, service, severity, idc, service_version, etc.
28+
Annotations map[string]string `json:"annotations"` // Contains summary, description
29+
StartsAt string `json:"startsAt"` // Time in RFC3339 format
30+
EndsAt string `json:"endsAt"` // Time in RFC3339 format
31+
GeneratorURL string `json:"generatorURL"` // Prometheus query link
32+
Fingerprint string `json:"fingerprint"` // Unique identifier of the alert
33+
}
34+
35+
// AlertmanagerWebhookRequest is the request format sent to the monitoring
// alert module.
36+
type AlertmanagerWebhookRequest struct {
37+
Receiver string `json:"receiver"` // "our-webhook"
38+
Status string `json:"status"` // "firing" or "resolved"
39+
Alerts []AlertmanagerWebhookAlert `json:"alerts"`
40+
GroupLabels map[string]string `json:"groupLabels"` // Grouping labels
41+
CommonLabels map[string]string `json:"commonLabels"` // Common labels
42+
Version string `json:"version"` // "4"
43+
}
44+
45+
// AlertWebhookResponse is the response to an alert push.
46+
type AlertWebhookResponse struct {
47+
Status string `json:"status"`
48+
Message string `json:"message"`
49+
}

0 commit comments

Comments
 (0)