Skip to content

Commit 40d41e8

Browse files
committed
feat(prometheus_adapter): 实现告警规则持久化与优雅关闭功能
添加告警规则本地文件持久化功能，支持启动时加载和关闭时保存规则；重构关闭逻辑实现优雅关闭，包括保存当前规则状态；更新构建和部署脚本以处理规则文件目录；修改测试脚本以适配新的增量更新接口。
1 parent 0b636a1 commit 40d41e8

File tree

7 files changed

+319
-56
lines changed

7 files changed

+319
-56
lines changed

cmd/prometheus_adapter/main.go

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
package main
22

33
import (
4+
"context"
45
"os"
6+
"os/signal"
7+
"syscall"
8+
"time"
59

610
"github.com/fox-gonic/fox"
711
"github.com/qiniu/zeroops/internal/config"
@@ -42,9 +46,40 @@ func main() {
4246
log.Fatal().Err(err).Msg("Failed to setup API routes")
4347
}
4448

45-
// 启动服务器
46-
log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr)
47-
if err := router.Run(cfg.Server.BindAddr); err != nil {
48-
log.Fatal().Err(err).Msg("Failed to start server")
49+
// 设置信号处理
50+
sigChan := make(chan os.Signal, 1)
51+
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
52+
53+
// 创建一个用于优雅关闭的context
54+
ctx, cancel := context.WithCancel(context.Background())
55+
defer cancel()
56+
57+
// 在goroutine中启动服务器
58+
serverErr := make(chan error, 1)
59+
go func() {
60+
log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr)
61+
if err := router.Run(cfg.Server.BindAddr); err != nil {
62+
serverErr <- err
63+
}
64+
}()
65+
66+
// 等待信号或服务器错误
67+
select {
68+
case sig := <-sigChan:
69+
log.Info().Msgf("Received signal %s, shutting down...", sig)
70+
71+
// 创建超时context
72+
shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 10*time.Second)
73+
defer shutdownCancel()
74+
75+
// 调用adapter的Shutdown方法
76+
if err := adapter.Close(shutdownCtx); err != nil {
77+
log.Error().Err(err).Msg("Error during shutdown")
78+
}
79+
80+
log.Info().Msg("Shutdown complete")
81+
82+
case err := <-serverErr:
83+
log.Fatal().Err(err).Msg("Server error")
4984
}
5085
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Prometheus Alert Rules
2+
# This file is managed by the Prometheus Adapter service
3+
# It will be loaded on startup and saved on shutdown
4+
5+
groups: []

internal/prometheus_adapter/server.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package prometheusadapter
22

33
import (
4+
"context"
45
"fmt"
56
"os"
67

@@ -63,9 +64,18 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
6364
return nil
6465
}
6566

66-
// Close 关闭服务器
67-
func (s *PrometheusAdapterServer) Close() error {
68-
// 当前没有需要关闭的资源
69-
log.Info().Msg("Prometheus Adapter server closed")
67+
// Close 优雅关闭服务器
68+
func (s *PrometheusAdapterServer) Close(ctx context.Context) error {
69+
log.Info().Msg("Starting shutdown...")
70+
71+
// 调用 alertService 的 Shutdown 方法保存规则
72+
if s.alertService != nil {
73+
if err := s.alertService.Shutdown(); err != nil {
74+
log.Error().Err(err).Msg("Failed to shutdown alert service")
75+
return err
76+
}
77+
}
78+
79+
log.Info().Msg("Prometheus Adapter server shut down")
7080
return nil
7181
}

internal/prometheus_adapter/service/alert_service.go

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
"net/http"
77
"os"
88
"os/exec"
9+
"path/filepath"
10+
"strconv"
911
"strings"
1012

1113
"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
@@ -20,15 +22,167 @@ type AlertService struct {
2022
// 内存中缓存当前规则,用于增量更新
2123
currentRules []model.AlertRule
2224
currentRuleMetas []model.AlertRuleMeta
25+
// 本地规则文件路径
26+
localRulesPath string
2327
}
2428

2529
// NewAlertService 创建告警服务
2630
func NewAlertService(promClient *client.PrometheusClient) *AlertService {
27-
return &AlertService{
31+
service := &AlertService{
2832
promClient: promClient,
2933
currentRules: []model.AlertRule{},
3034
currentRuleMetas: []model.AlertRuleMeta{},
35+
localRulesPath: "../rules/alert_rules.yml",
3136
}
37+
38+
// 启动时尝试加载本地规则
39+
if err := service.LoadRulesFromFile(); err != nil {
40+
log.Warn().Err(err).Msg("Failed to load rules from file, starting with empty rules")
41+
}
42+
43+
return service
44+
}
45+
46+
// ========== 持久化方法 ==========
47+
48+
// LoadRulesFromFile 从本地文件加载规则
49+
func (s *AlertService) LoadRulesFromFile() error {
50+
// 检查文件是否存在
51+
if _, err := os.Stat(s.localRulesPath); os.IsNotExist(err) {
52+
log.Info().Str("path", s.localRulesPath).Msg("Local rules file does not exist, skipping load")
53+
return nil
54+
}
55+
56+
// 读取文件内容
57+
data, err := os.ReadFile(s.localRulesPath)
58+
if err != nil {
59+
return fmt.Errorf("failed to read local rules file: %w", err)
60+
}
61+
62+
// 解析规则文件
63+
var rulesFile model.PrometheusRuleFile
64+
if err := yaml.Unmarshal(data, &rulesFile); err != nil {
65+
return fmt.Errorf("failed to parse rules file: %w", err)
66+
}
67+
68+
// 从Prometheus格式转换回内部格式
69+
s.currentRules = []model.AlertRule{}
70+
s.currentRuleMetas = []model.AlertRuleMeta{}
71+
72+
// 用于去重的map
73+
ruleMap := make(map[string]*model.AlertRule)
74+
75+
for _, group := range rulesFile.Groups {
76+
for _, rule := range group.Rules {
77+
// 提取基础规则信息
78+
ruleName := rule.Alert
79+
80+
// 从annotations中获取description
81+
description := ""
82+
if desc, ok := rule.Annotations["description"]; ok {
83+
description = desc
84+
}
85+
86+
// 从labels中获取severity
87+
severity := "warning"
88+
if sev, ok := rule.Labels["severity"]; ok {
89+
severity = sev
90+
delete(rule.Labels, "severity") // 移除severity,剩下的是meta的labels
91+
}
92+
93+
// 创建或更新规则模板
94+
if _, exists := ruleMap[ruleName]; !exists {
95+
alertRule := model.AlertRule{
96+
Name: ruleName,
97+
Description: description,
98+
Expr: rule.Expr,
99+
Severity: severity,
100+
}
101+
102+
// 解析For字段获取WatchTime
103+
if rule.For != "" {
104+
// 简单解析,假设格式为 "300s" 或 "5m"
105+
if strings.HasSuffix(rule.For, "s") {
106+
if seconds, err := strconv.Atoi(strings.TrimSuffix(rule.For, "s")); err == nil {
107+
alertRule.WatchTime = seconds
108+
}
109+
} else if strings.HasSuffix(rule.For, "m") {
110+
if minutes, err := strconv.Atoi(strings.TrimSuffix(rule.For, "m")); err == nil {
111+
alertRule.WatchTime = minutes * 60
112+
}
113+
}
114+
}
115+
116+
ruleMap[ruleName] = &alertRule
117+
s.currentRules = append(s.currentRules, alertRule)
118+
}
119+
120+
// 创建元信息
121+
if len(rule.Labels) > 0 {
122+
labelsJSON, _ := json.Marshal(rule.Labels)
123+
meta := model.AlertRuleMeta{
124+
AlertName: ruleName,
125+
Labels: string(labelsJSON),
126+
}
127+
128+
// 从表达式中提取threshold(简单实现)
129+
// 假设表达式类似 "metric > 80" 或 "metric{labels} > 80"
130+
parts := strings.Split(rule.Expr, " ")
131+
if len(parts) >= 3 {
132+
if threshold, err := strconv.ParseFloat(parts[len(parts)-1], 64); err == nil {
133+
meta.Threshold = threshold
134+
}
135+
}
136+
137+
s.currentRuleMetas = append(s.currentRuleMetas, meta)
138+
}
139+
}
140+
}
141+
142+
log.Info().
143+
Int("rules", len(s.currentRules)).
144+
Int("metas", len(s.currentRuleMetas)).
145+
Str("path", s.localRulesPath).
146+
Msg("Loaded rules from local file")
147+
148+
return nil
149+
}
150+
151+
// SaveRulesToFile 保存规则到本地文件
152+
func (s *AlertService) SaveRulesToFile() error {
153+
// 确保目录存在
154+
dir := filepath.Dir(s.localRulesPath)
155+
if err := os.MkdirAll(dir, 0755); err != nil {
156+
return fmt.Errorf("failed to create rules directory: %w", err)
157+
}
158+
159+
// 构建Prometheus规则文件格式
160+
prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas)
161+
162+
// 序列化为YAML
163+
data, err := yaml.Marshal(prometheusRules)
164+
if err != nil {
165+
return fmt.Errorf("failed to marshal rules: %w", err)
166+
}
167+
168+
// 写入文件
169+
if err := os.WriteFile(s.localRulesPath, data, 0644); err != nil {
170+
return fmt.Errorf("failed to write rules file: %w", err)
171+
}
172+
173+
log.Info().
174+
Int("rules", len(s.currentRules)).
175+
Int("metas", len(s.currentRuleMetas)).
176+
Str("path", s.localRulesPath).
177+
Msg("Saved rules to local file")
178+
179+
return nil
180+
}
181+
182+
// Shutdown 优雅关闭,保存当前规则
183+
func (s *AlertService) Shutdown() error {
184+
log.Info().Msg("Shutting down alert service, saving rules...")
185+
return s.SaveRulesToFile()
32186
}
33187

34188
// ========== 公开 API 方法 ==========

0 commit comments

Comments
 (0)