Skip to content

Commit 0afaa01

Browse files
committed
Merge remote-tracking branch 'qiniu/develop'
2 parents 40de388 + 9a094d9 commit 0afaa01

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+4693
-276
lines changed

ENV_SETUP.md

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
1-
# 环境变量设置指南
1+
# 配置文件与环境变量指南
22

33
## 概述
44

5-
ZeroOps 项目需要配置 API 密钥来使用 AI 模型。本文档说明如何正确设置环境变量,包括模型配置、搜索后端、评估器设置等
5+
ZeroOps 现已支持通过配置文件集中管理参数(推荐)。同时也兼容环境变量方式。
66

7-
## 方法1:使用 .env 文件(推荐)
7+
## 使用配置文件(推荐)
8+
9+
1) 复制示例并编辑
10+
11+
```bash
12+
cp config.example.json config.json
13+
vi config.json
14+
```
15+
16+
2) 以配置文件启动服务
17+
18+
```bash
19+
go run ./cmd/zeroops -f $(pwd)/config.json
20+
```
21+
22+
配置文件中的字段包括:`server`、`logging`、`database`、`redis`、`alerting.healthcheck/prometheus/remediation/ruleset/receiver`。
23+
24+
## 方法1:使用 .env 文件(可选)
825

926
### 步骤1:创建 .env 文件
1027
在项目根目录创建 `.env` 文件:

api/openapi/alerting-ml.yaml

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
openapi: 3.0.3
2+
info:
3+
title: alerting-ml service
4+
version: 0.1.0
5+
description: HTTP API for time series anomaly detection using IsolationForest
6+
servers:
7+
- url: http://localhost:8081
8+
paths:
9+
/api/v1/anomaly/detect:
10+
post:
11+
summary: Detect anomalies in a single time series
12+
operationId: detectAnomalies
13+
requestBody:
14+
required: true
15+
content:
16+
application/json:
17+
schema:
18+
$ref: '#/components/schemas/DetectRequest'
19+
responses:
20+
'200':
21+
description: Detection result
22+
content:
23+
application/json:
24+
schema:
25+
$ref: '#/components/schemas/DetectResponse'
26+
'400':
27+
description: Invalid input
28+
'500':
29+
description: Internal error
30+
components:
31+
schemas:
32+
Metadata:
33+
type: object
34+
properties:
35+
alert_name:
36+
type: string
37+
severity:
38+
type: string
39+
labels:
40+
type: object
41+
additionalProperties:
42+
type: string
43+
DataPoint:
44+
type: object
45+
required: [timestamp, value]
46+
properties:
47+
timestamp:
48+
type: string
49+
format: date-time
50+
value:
51+
type: number
52+
format: double
53+
DetectRequest:
54+
type: object
55+
required: [data]
56+
properties:
57+
metadata:
58+
$ref: '#/components/schemas/Metadata'
59+
data:
60+
type: array
61+
items:
62+
$ref: '#/components/schemas/DataPoint'
63+
contamination:
64+
type: number
65+
default: 0.05
66+
random_state:
67+
type: integer
68+
default: 42
69+
ratio_threshold:
70+
type: number
71+
default: 0.2
72+
streak_threshold:
73+
type: integer
74+
default: 20
75+
Anomaly:
76+
type: object
77+
required: [start, end]
78+
properties:
79+
start:
80+
type: string
81+
format: date-time
82+
end:
83+
type: string
84+
format: date-time
85+
DetectResponse:
86+
type: object
87+
properties:
88+
metadata:
89+
$ref: '#/components/schemas/Metadata'
90+
anomalies:
91+
type: array
92+
items:
93+
$ref: '#/components/schemas/Anomaly'
94+

client/src/api/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import axios from 'axios'
22

33
// 创建 axios 实例
44
const api = axios.create({
5+
// baseURL: (import.meta as any).env?.VITE_API_BASE_URL || 'http://10.210.10.33:8080',
56
timeout: 10000,
67
headers: {
78
'Content-Type': 'application/json'

client/src/mock/services.ts

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1845,27 +1845,9 @@ loadServiceAlertStatus()
18451845
*/
18461846
export const serviceVersionAlertStatusMap: Record<string, Record<string, ServiceAlertStatus>> = {}
18471847

1848-
const saveServiceVersionAlertStatus = () => {
1849-
try {
1850-
localStorage.setItem('serviceVersionAlertStatusMap', JSON.stringify(serviceVersionAlertStatusMap))
1851-
console.log('服务版本告警状态已保存到 localStorage')
1852-
} catch (error) {
1853-
console.error('保存服务版本告警状态失败:', error)
1854-
}
1855-
}
1848+
const saveServiceVersionAlertStatus = () => {}
18561849

1857-
const loadServiceVersionAlertStatus = () => {
1858-
try {
1859-
const data = localStorage.getItem('serviceVersionAlertStatusMap')
1860-
if (data) {
1861-
const parsed = JSON.parse(data)
1862-
Object.assign(serviceVersionAlertStatusMap, parsed)
1863-
console.log('已从 localStorage 加载服务版本告警状态')
1864-
}
1865-
} catch (error) {
1866-
console.error('从 localStorage 加载服务版本告警状态失败:', error)
1867-
}
1868-
}
1850+
const loadServiceVersionAlertStatus = () => {}
18691851

18701852
/**
18711853
* 根据告警状态更新服务版本状态
@@ -1919,8 +1901,7 @@ export const clearServiceVersionAlertStatus = (serviceName: string, version?: st
19191901
console.log(`已清除服务 ${serviceName} ${version ? '版本 ' + version : '所有版本'} 的告警状态`)
19201902
}
19211903

1922-
// 页面加载时恢复服务版本告警状态
1923-
loadServiceVersionAlertStatus()
1904+
// 页面加载时不再从 localStorage 恢复服务版本告警状态(禁用持久化)
19241905

19251906
// ==================== 发布任务状态管理 ====================
19261907
// 管理服务的发布任务状态,用于显示发布指示器

client/src/views/ChangeLogView.vue

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,31 @@
8484
import { ref, computed, onMounted, watch } from 'vue'
8585
import { useAppStore, type ChangeItem, type AlarmChangeItem } from '@/stores/app'
8686
import { mockApi } from '@/mock/api'
87-
import type { DeploymentChangelogResponse, DeploymentChangelogItem, AlertRuleChangelogResponse, AlertRuleChangeItem } from '@/mock/services'
87+
import { apiService } from '@/api'
88+
import type { DeploymentChangelogResponse } from '@/mock/services'
8889
import ChangeCard from '@/components/ChangeCard.vue'
8990
import AlarmChangeCard from '@/components/AlarmChangeCard.vue'
9091
import { ArrowLeft, Loading } from '@element-plus/icons-vue'
9192
93+
interface AlertRuleChangeValue {
94+
name: string
95+
old: string
96+
new: string
97+
}
98+
99+
interface AlertRuleChangeItem {
100+
name: string
101+
editTime: string
102+
scope: string
103+
values: AlertRuleChangeValue[]
104+
reason: string
105+
}
106+
107+
interface AlertRuleChangelogResponse {
108+
items: AlertRuleChangeItem[]
109+
next?: string
110+
}
111+
92112
const appStore = useAppStore()
93113
94114
const activeTab = ref('service')
@@ -160,7 +180,7 @@ const transformAlertRuleChangelogToAlarmChangeItems = (changelogData: AlertRuleC
160180
const serviceName = item.scope?.startsWith('service:') ? item.scope.slice('service:'.length) + '服务' : '全局服务'
161181
162182
// 构建变更描述
163-
const changeDescription = item.values.map(value => {
183+
const changeDescription = item.values.map((value) => {
164184
return `${value.name}: ${value.old} -> ${value.new}`
165185
}).join(', ')
166186
@@ -200,21 +220,21 @@ const loadDeploymentChangelog = async (start?: string, limit?: number) => {
200220
}
201221
}
202222
203-
// 加载告警规则变更记录
223+
// 加载告警规则变更记录(使用真实 API)
204224
const loadAlertRuleChangelog = async (start?: string, limit?: number) => {
205225
if (alertRuleLoading.value) return // 防止重复加载
206226
207227
try {
208228
alertRuleLoading.value = true
209229
error.value = null
210230
211-
const response = await mockApi.getAlertRuleChangelog(start, limit)
212-
alertRuleChangelog.value = response
231+
const response = await apiService.getAlertRuleChangelog(start, limit ?? 10)
232+
alertRuleChangelog.value = response.data
213233
214234
// 转换数据格式
215-
alarmChangeItems.value = transformAlertRuleChangelogToAlarmChangeItems(response.items)
235+
alarmChangeItems.value = transformAlertRuleChangelogToAlarmChangeItems(response.data.items)
216236
217-
console.log('告警规则变更记录加载成功:', response)
237+
console.log('告警规则变更记录加载成功:', response.data)
218238
} catch (err) {
219239
error.value = '加载告警规则变更记录失败'
220240
console.error('加载告警规则变更记录失败:', err)

client/vite.config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export default defineConfig({
1818
server: {
1919
proxy: {
2020
'/v1': {
21-
target: 'http://127.0.0.1:8080',
21+
target: 'http://10.210.10.33:8080',
2222
changeOrigin: true,
2323
secure: false,
2424
}

cmd/zeroops/Dockerfile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
FROM docker.m.daocloud.io/library/golang:1.24-alpine AS builder
2+
WORKDIR /src
3+
COPY go.mod go.sum ./
4+
RUN go mod download
5+
COPY . .
6+
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/zeroops ./cmd/zeroops
7+
8+
FROM gcr.m.daocloud.io/distroless/base-debian12:nonroot
9+
WORKDIR /app
10+
COPY --from=builder /out/zeroops /app/zeroops
11+
# 复制配置文件目录
12+
COPY configs/ /app/configs/
13+
EXPOSE 8080
14+
ENTRYPOINT ["/app/zeroops"]
15+
CMD ["-f", "/app/config.json"]
16+
17+

cmd/zeroops/main.go

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@ package main
33
import (
44
"context"
55
"fmt"
6-
"os"
76
"strconv"
87
"time"
98

10-
"github.com/fox-gonic/fox"
9+
"github.com/gin-gonic/gin"
1110
alertapi "github.com/qiniu/zeroops/internal/alerting/api"
1211
adb "github.com/qiniu/zeroops/internal/alerting/database"
1312
"github.com/qiniu/zeroops/internal/alerting/service/healthcheck"
@@ -17,16 +16,34 @@ import (
1716
servicemanager "github.com/qiniu/zeroops/internal/service_manager"
1817

1918
// releasesystem "github.com/qiniu/zeroops/internal/release_system/api"
19+
"strings"
20+
21+
"github.com/rs/zerolog"
2022
"github.com/rs/zerolog/log"
2123
)
2224

2325
func main() {
26+
// load config first
2427
log.Info().Msg("Starting zeroops api server")
2528
cfg, err := config.Load()
2629
if err != nil {
2730
log.Fatal().Err(err).Msg("failed to load config")
2831
}
2932

33+
// configure log level from config
34+
switch strings.ToLower(cfg.Logging.Level) {
35+
case "trace":
36+
zerolog.SetGlobalLevel(zerolog.TraceLevel)
37+
case "debug":
38+
zerolog.SetGlobalLevel(zerolog.DebugLevel)
39+
case "warn", "warning":
40+
zerolog.SetGlobalLevel(zerolog.WarnLevel)
41+
case "error":
42+
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
43+
default:
44+
zerolog.SetGlobalLevel(zerolog.DebugLevel)
45+
}
46+
3047
serviceManagerSrv, err := servicemanager.NewServiceManagerServer(cfg)
3148
if err != nil {
3249
log.Fatal().Err(err).Msg("failed to create release system api")
@@ -52,28 +69,51 @@ func main() {
5269
// start healthcheck scheduler and remediation consumer
5370
ctx, cancel := context.WithCancel(context.Background())
5471
defer cancel()
55-
interval := parseDuration(os.Getenv("HC_SCAN_INTERVAL"), 10*time.Second)
56-
batch := parseInt(os.Getenv("HC_SCAN_BATCH"), 200)
57-
workers := parseInt(os.Getenv("HC_WORKERS"), 1)
72+
73+
// bootstrap alert rules from config if provided
74+
if err := healthcheck.BootstrapRulesFromConfigWithApp(ctx, alertDB, &cfg.Alerting.Ruleset); err != nil {
75+
log.Error().Err(err).Msg("bootstrap rules from config failed")
76+
}
77+
interval := parseDuration(cfg.Alerting.Healthcheck.Interval, 10*time.Second)
78+
batch := cfg.Alerting.Healthcheck.Batch
79+
workers := cfg.Alerting.Healthcheck.Workers
5880
if workers < 1 {
5981
workers = 1
6082
}
61-
alertChSize := parseInt(os.Getenv("REMEDIATION_ALERT_CHAN_SIZE"), 1024)
83+
alertChSize := cfg.Alerting.Healthcheck.AlertChanSize
6284
alertCh := make(chan healthcheck.AlertMessage, alertChSize)
6385

6486
for i := 0; i < workers; i++ {
6587
go healthcheck.StartScheduler(ctx, healthcheck.Deps{
6688
DB: alertDB,
67-
Redis: healthcheck.NewRedisClientFromEnv(),
89+
Redis: healthcheck.NewRedisClientFromConfig(&cfg.Redis),
6890
AlertCh: alertCh,
6991
Batch: batch,
7092
Interval: interval,
7193
})
7294
}
73-
rem := remediation.NewConsumer(alertDB, healthcheck.NewRedisClientFromEnv())
95+
rem := remediation.NewConsumer(alertDB, healthcheck.NewRedisClientFromConfig(&cfg.Redis)).WithConfig(&cfg.Alerting.Remediation)
7496
go rem.Start(ctx, alertCh)
7597

76-
router := fox.New()
98+
// start Prometheus anomaly detection scheduler
99+
promInterval := parseDuration(cfg.Alerting.Prometheus.SchedulerInterval, 5*time.Minute)
100+
promStep := parseDuration(cfg.Alerting.Prometheus.QueryStep, time.Minute)
101+
promRange := parseDuration(cfg.Alerting.Prometheus.QueryRange, 6*time.Hour)
102+
promCfg := healthcheck.NewPrometheusConfigFromApp(&cfg.Alerting.Prometheus)
103+
anomalyDetectClient := healthcheck.NewAnomalyDetectClient(promCfg)
104+
go healthcheck.StartPrometheusScheduler(ctx, healthcheck.PrometheusDeps{
105+
DB: alertDB,
106+
AnomalyDetectClient: anomalyDetectClient,
107+
Interval: promInterval,
108+
QueryStep: promStep,
109+
QueryRange: promRange,
110+
RulesetBase: cfg.Alerting.Ruleset.APIBase,
111+
RulesetTimeout: parseDuration(cfg.Alerting.Ruleset.APITimeout, 10*time.Second),
112+
})
113+
114+
router := gin.New()
115+
router.Use(gin.Logger())
116+
router.Use(gin.Recovery())
77117
router.Use(middleware.Authentication)
78118
alertapi.NewApiWithConfig(router, cfg)
79119
if err := serviceManagerSrv.UseApi(router); err != nil {

config.docker.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"server": { "bindAddr": ":8080" },
3+
"logging": { "level": "info" },
4+
"database": {
5+
"host": "host.docker.internal",
6+
"port": 5532,
7+
"user": "postgres",
8+
"password": "postgres123",
9+
"dbname": "zeroops",
10+
"sslmode": "disable"
11+
},
12+
"redis": { "addr": "host.docker.internal:16379", "password": "", "db": 0 },
13+
"alerting": {
14+
"healthcheck": { "interval": "10s", "batch": 200, "workers": 2, "alertChanSize": 1024 },
15+
"prometheus": { "url": "http://10.210.10.33:9090", "queryTimeout": "30s", "anomalyAPIUrl": "http://alerting-ml:8081/api/v1/anomaly/detect", "anomalyAPITimeout": "10s", "schedulerInterval": "6h", "queryStep": "1m", "queryRange": "6h" },
16+
"ruleset": { "configFile": "/app/configs/alerting/rules.json", "apiBase": "http://10.210.10.33:9999", "apiTimeout": "10s" },
17+
"receiver": { "basicUser": "alert", "basicPass": "REDACTED", "bearer": "" }
18+
}
19+
}

0 commit comments

Comments
 (0)