Skip to content

Commit 70430e8

Browse files
committed
联调模块
1 parent b40dbd3 commit 70430e8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+3094
-197
lines changed

ENV_SETUP.md

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
1-
# 环境变量设置指南
1+
# 配置文件与环境变量指南
22

33
## 概述
44

5-
ZeroOps 项目需要配置 API 密钥来使用 AI 模型。本文档说明如何正确设置环境变量,包括模型配置、搜索后端、评估器设置等
5+
ZeroOps 现已支持通过配置文件集中管理参数(推荐),同时也兼容环境变量方式。
66

7-
## 方法1:使用 .env 文件(推荐)
7+
## 使用配置文件(推荐)
8+
9+
1) 复制示例并编辑
10+
11+
```bash
12+
cp config.example.json config.json
13+
vi config.json
14+
```
15+
16+
2) 以配置文件启动服务
17+
18+
```bash
19+
go run ./cmd/zeroops -f $(pwd)/config.json
20+
```
21+
22+
配置文件中的字段包括:`server`、`logging`、`database`、`redis`、`alerting.healthcheck/prometheus/remediation/ruleset/receiver`。
23+
24+
## 方法1:使用 .env 文件(可选)
825

926
### 步骤1:创建 .env 文件
1027
在项目根目录创建 `.env` 文件:

api/openapi/alerting-ml.yaml

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
openapi: 3.0.3
2+
info:
3+
title: alerting-ml service
4+
version: 0.1.0
5+
description: HTTP API for time series anomaly detection using IsolationForest
6+
servers:
7+
- url: http://localhost:8081
8+
paths:
9+
/api/v1/anomaly/detect:
10+
post:
11+
summary: Detect anomalies in a single time series
12+
operationId: detectAnomalies
13+
requestBody:
14+
required: true
15+
content:
16+
application/json:
17+
schema:
18+
$ref: '#/components/schemas/DetectRequest'
19+
responses:
20+
'200':
21+
description: Detection result
22+
content:
23+
application/json:
24+
schema:
25+
$ref: '#/components/schemas/DetectResponse'
26+
'400':
27+
description: Invalid input
28+
'500':
29+
description: Internal error
30+
components:
31+
schemas:
32+
Metadata:
33+
type: object
34+
properties:
35+
alert_name:
36+
type: string
37+
severity:
38+
type: string
39+
labels:
40+
type: object
41+
additionalProperties:
42+
type: string
43+
DataPoint:
44+
type: object
45+
required: [timestamp, value]
46+
properties:
47+
timestamp:
48+
type: string
49+
format: date-time
50+
value:
51+
type: number
52+
format: double
53+
DetectRequest:
54+
type: object
55+
required: [data]
56+
properties:
57+
metadata:
58+
$ref: '#/components/schemas/Metadata'
59+
data:
60+
type: array
61+
items:
62+
$ref: '#/components/schemas/DataPoint'
63+
contamination:
64+
type: number
65+
default: 0.05
66+
random_state:
67+
type: integer
68+
default: 42
69+
ratio_threshold:
70+
type: number
71+
default: 0.2
72+
streak_threshold:
73+
type: integer
74+
default: 20
75+
Anomaly:
76+
type: object
77+
required: [start, end]
78+
properties:
79+
start:
80+
type: string
81+
format: date-time
82+
end:
83+
type: string
84+
format: date-time
85+
DetectResponse:
86+
type: object
87+
properties:
88+
metadata:
89+
$ref: '#/components/schemas/Metadata'
90+
anomalies:
91+
type: array
92+
items:
93+
$ref: '#/components/schemas/Anomaly'
94+

cmd/zeroops/Dockerfile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
FROM golang:1.24-alpine AS builder
2+
WORKDIR /src
3+
COPY go.mod go.sum ./
4+
RUN go mod download
5+
COPY . .
6+
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/zeroops ./cmd/zeroops
7+
8+
FROM gcr.io/distroless/base-debian12
9+
WORKDIR /app
10+
COPY --from=builder /out/zeroops /app/zeroops
11+
# 复制配置文件目录
12+
COPY configs/ /app/configs/
13+
EXPOSE 8080
14+
ENTRYPOINT ["/app/zeroops"]
15+
CMD ["-f", "/app/config.json"]
16+
17+

cmd/zeroops/main.go

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package main
33
import (
44
"context"
55
"fmt"
6-
"os"
76
"strconv"
87
"time"
98

@@ -17,16 +16,34 @@ import (
1716
servicemanager "github.com/qiniu/zeroops/internal/service_manager"
1817

1918
// releasesystem "github.com/qiniu/zeroops/internal/release_system/api"
19+
"strings"
20+
21+
"github.com/rs/zerolog"
2022
"github.com/rs/zerolog/log"
2123
)
2224

2325
func main() {
26+
// load config first
2427
log.Info().Msg("Starting zeroops api server")
2528
cfg, err := config.Load()
2629
if err != nil {
2730
log.Fatal().Err(err).Msg("failed to load config")
2831
}
2932

33+
// configure log level from config
34+
switch strings.ToLower(cfg.Logging.Level) {
35+
case "trace":
36+
zerolog.SetGlobalLevel(zerolog.TraceLevel)
37+
case "debug":
38+
zerolog.SetGlobalLevel(zerolog.DebugLevel)
39+
case "warn", "warning":
40+
zerolog.SetGlobalLevel(zerolog.WarnLevel)
41+
case "error":
42+
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
43+
default:
44+
zerolog.SetGlobalLevel(zerolog.DebugLevel)
45+
}
46+
3047
serviceManagerSrv, err := servicemanager.NewServiceManagerServer(cfg)
3148
if err != nil {
3249
log.Fatal().Err(err).Msg("failed to create release system api")
@@ -52,27 +69,48 @@ func main() {
5269
// start healthcheck scheduler and remediation consumer
5370
ctx, cancel := context.WithCancel(context.Background())
5471
defer cancel()
55-
interval := parseDuration(os.Getenv("HC_SCAN_INTERVAL"), 10*time.Second)
56-
batch := parseInt(os.Getenv("HC_SCAN_BATCH"), 200)
57-
workers := parseInt(os.Getenv("HC_WORKERS"), 1)
72+
73+
// bootstrap alert rules from config if provided
74+
if err := healthcheck.BootstrapRulesFromConfigWithApp(ctx, alertDB, &cfg.Alerting.Ruleset); err != nil {
75+
log.Error().Err(err).Msg("bootstrap rules from config failed")
76+
}
77+
interval := parseDuration(cfg.Alerting.Healthcheck.Interval, 10*time.Second)
78+
batch := cfg.Alerting.Healthcheck.Batch
79+
workers := cfg.Alerting.Healthcheck.Workers
5880
if workers < 1 {
5981
workers = 1
6082
}
61-
alertChSize := parseInt(os.Getenv("REMEDIATION_ALERT_CHAN_SIZE"), 1024)
83+
alertChSize := cfg.Alerting.Healthcheck.AlertChanSize
6284
alertCh := make(chan healthcheck.AlertMessage, alertChSize)
6385

6486
for i := 0; i < workers; i++ {
6587
go healthcheck.StartScheduler(ctx, healthcheck.Deps{
6688
DB: alertDB,
67-
Redis: healthcheck.NewRedisClientFromEnv(),
89+
Redis: healthcheck.NewRedisClientFromConfig(&cfg.Redis),
6890
AlertCh: alertCh,
6991
Batch: batch,
7092
Interval: interval,
7193
})
7294
}
73-
rem := remediation.NewConsumer(alertDB, healthcheck.NewRedisClientFromEnv())
95+
rem := remediation.NewConsumer(alertDB, healthcheck.NewRedisClientFromConfig(&cfg.Redis)).WithConfig(&cfg.Alerting.Remediation)
7496
go rem.Start(ctx, alertCh)
7597

98+
// start Prometheus anomaly detection scheduler
99+
promInterval := parseDuration(cfg.Alerting.Prometheus.SchedulerInterval, 6*time.Hour)
100+
promStep := parseDuration(cfg.Alerting.Prometheus.QueryStep, time.Minute)
101+
promRange := parseDuration(cfg.Alerting.Prometheus.QueryRange, 6*time.Hour)
102+
promCfg := healthcheck.NewPrometheusConfigFromApp(&cfg.Alerting.Prometheus)
103+
promClient := healthcheck.NewPrometheusClient(promCfg)
104+
go healthcheck.StartPrometheusScheduler(ctx, healthcheck.PrometheusDeps{
105+
DB: alertDB,
106+
PrometheusClient: promClient,
107+
Interval: promInterval,
108+
QueryStep: promStep,
109+
QueryRange: promRange,
110+
RulesetBase: cfg.Alerting.Ruleset.APIBase,
111+
RulesetTimeout: parseDuration(cfg.Alerting.Ruleset.APITimeout, 10*time.Second),
112+
})
113+
76114
router := fox.New()
77115
router.Use(middleware.Authentication)
78116
alertapi.NewApiWithConfig(router, cfg)

config.docker.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"server": { "bindAddr": ":8080" },
3+
"logging": { "level": "info" },
4+
"database": {
5+
"host": "host.docker.internal",
6+
"port": 5532,
7+
"user": "postgres",
8+
"password": "postgres123",
9+
"dbname": "zeroops",
10+
"sslmode": "disable"
11+
},
12+
"redis": { "addr": "host.docker.internal:16379", "password": "", "db": 0 },
13+
"alerting": {
14+
"healthcheck": { "interval": "10s", "batch": 200, "workers": 2, "alertChanSize": 1024 },
15+
"prometheus": { "url": "http://10.210.10.33:9090", "queryTimeout": "30s", "anomalyAPIUrl": "http://alerting-ml:8081/api/v1/anomaly/detect", "anomalyAPITimeout": "10s", "schedulerInterval": "6h", "queryStep": "1m", "queryRange": "6h" },
16+
"ruleset": { "configFile": "/app/configs/alerting/rules.json", "apiBase": "http://10.210.10.33:9999", "apiTimeout": "10s" },
17+
"receiver": { "basicUser": "alert", "basicPass": "REDACTED", "bearer": "" }
18+
}
19+
}

config.example.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"server": {
3+
"bindAddr": "0.0.0.0:8080"
4+
},
5+
"logging": {
6+
"level": "debug"
7+
},
8+
"database": {
9+
"host": "localhost",
10+
"port": 5432,
11+
"user": "postgres",
12+
"password": "postgres",
13+
"dbname": "zeroops",
14+
"sslmode": "disable"
15+
},
16+
"redis": {
17+
"addr": "localhost:6379",
18+
"password": "",
19+
"db": 0
20+
},
21+
"alerting": {
22+
"healthcheck": {
23+
"interval": "10s",
24+
"batch": 200,
25+
"workers": 1,
26+
"alertChanSize": 1024
27+
},
28+
"remediation": {
29+
"rollbackSleep": "30s",
30+
"rollbackURL": "http://deploy-system/rollback/%s"
31+
},
32+
"prometheus": {
33+
"url": "http://localhost:9090",
34+
"queryTimeout": "30s",
35+
"anomalyAPIUrl": "http://localhost:8081/api/v1/anomaly/detect",
36+
"anomalyAPITimeout": "10s",
37+
"schedulerInterval": "6h",
38+
"queryStep": "1m",
39+
"queryRange": "6h"
40+
},
41+
"ruleset": {
42+
"configFile": "",
43+
"apiBase": "",
44+
"apiTimeout": "10s"
45+
},
46+
"receiver": {
47+
"basicUser": "alert",
48+
"basicPass": "REDACTED",
49+
"bearer": ""
50+
}
51+
}
52+
}
53+
54+

config.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"server": {
3+
"bindAddr": "0.0.0.0:8080"
4+
},
5+
"logging": {
6+
"level": "debug"
7+
},
8+
"database": {
9+
"host": "localhost",
10+
"port": 5432,
11+
"user": "postgres",
12+
"password": "postgres",
13+
"dbname": "zeroops",
14+
"sslmode": "disable"
15+
},
16+
"redis": {
17+
"addr": "localhost:6379",
18+
"password": "",
19+
"db": 0
20+
},
21+
"alerting": {
22+
"healthcheck": {
23+
"interval": "10s",
24+
"batch": 200,
25+
"workers": 1,
26+
"alertChanSize": 1024
27+
},
28+
"remediation": {
29+
"rollbackSleep": "30s",
30+
"rollbackURL": "http://deploy-system/rollback/%s"
31+
},
32+
"prometheus": {
33+
"url": "http://localhost:9090",
34+
"queryTimeout": "30s",
35+
"anomalyAPIUrl": "http://localhost:8081/api/v1/anomaly/detect",
36+
"anomalyAPITimeout": "10s",
37+
"schedulerInterval": "6h",
38+
"queryStep": "1m",
39+
"queryRange": "6h"
40+
},
41+
"ruleset": {
42+
"configFile": "configs/alerting/rules.json",
43+
"apiBase": "",
44+
"apiTimeout": "10s"
45+
},
46+
"receiver": {
47+
"basicUser": "alert",
48+
"basicPass": "REDACTED",
49+
"bearer": ""
50+
}
51+
}
52+
}
53+
54+

0 commit comments

Comments
 (0)