@@ -3,7 +3,6 @@ package main
33import (
44 "context"
55 "fmt"
6- "os"
76 "strconv"
87 "time"
98
@@ -17,16 +16,34 @@ import (
1716 servicemanager "github.com/qiniu/zeroops/internal/service_manager"
1817
1918 // releasesystem "github.com/qiniu/zeroops/internal/release_system/api"
19+ "strings"
20+
21+ "github.com/rs/zerolog"
2022 "github.com/rs/zerolog/log"
2123)
2224
2325func main () {
26+ // load config first
2427 log .Info ().Msg ("Starting zeroops api server" )
2528 cfg , err := config .Load ()
2629 if err != nil {
2730 log .Fatal ().Err (err ).Msg ("failed to load config" )
2831 }
2932
33+ // configure log level from config
34+ switch strings .ToLower (cfg .Logging .Level ) {
35+ case "trace" :
36+ zerolog .SetGlobalLevel (zerolog .TraceLevel )
37+ case "debug" :
38+ zerolog .SetGlobalLevel (zerolog .DebugLevel )
39+ case "warn" , "warning" :
40+ zerolog .SetGlobalLevel (zerolog .WarnLevel )
41+ case "error" :
42+ zerolog .SetGlobalLevel (zerolog .ErrorLevel )
43+ default :
44+ zerolog .SetGlobalLevel (zerolog .DebugLevel )
45+ }
46+
3047 serviceManagerSrv , err := servicemanager .NewServiceManagerServer (cfg )
3148 if err != nil {
3249 log .Fatal ().Err (err ).Msg ("failed to create release system api" )
@@ -52,27 +69,48 @@ func main() {
5269 // start healthcheck scheduler and remediation consumer
5370 ctx , cancel := context .WithCancel (context .Background ())
5471 defer cancel ()
55- interval := parseDuration (os .Getenv ("HC_SCAN_INTERVAL" ), 10 * time .Second )
56- batch := parseInt (os .Getenv ("HC_SCAN_BATCH" ), 200 )
57- workers := parseInt (os .Getenv ("HC_WORKERS" ), 1 )
72+
73+ // bootstrap alert rules from config if provided
74+ if err := healthcheck .BootstrapRulesFromConfigWithApp (ctx , alertDB , & cfg .Alerting .Ruleset ); err != nil {
75+ log .Error ().Err (err ).Msg ("bootstrap rules from config failed" )
76+ }
77+ interval := parseDuration (cfg .Alerting .Healthcheck .Interval , 10 * time .Second )
78+ batch := cfg .Alerting .Healthcheck .Batch
79+ workers := cfg .Alerting .Healthcheck .Workers
5880 if workers < 1 {
5981 workers = 1
6082 }
61- alertChSize := parseInt ( os . Getenv ( "REMEDIATION_ALERT_CHAN_SIZE" ), 1024 )
83+ alertChSize := cfg . Alerting . Healthcheck . AlertChanSize
6284 alertCh := make (chan healthcheck.AlertMessage , alertChSize )
6385
6486 for i := 0 ; i < workers ; i ++ {
6587 go healthcheck .StartScheduler (ctx , healthcheck.Deps {
6688 DB : alertDB ,
67- Redis : healthcheck .NewRedisClientFromEnv ( ),
89+ Redis : healthcheck .NewRedisClientFromConfig ( & cfg . Redis ),
6890 AlertCh : alertCh ,
6991 Batch : batch ,
7092 Interval : interval ,
7193 })
7294 }
73- rem := remediation .NewConsumer (alertDB , healthcheck .NewRedisClientFromEnv () )
95+ rem := remediation .NewConsumer (alertDB , healthcheck .NewRedisClientFromConfig ( & cfg . Redis )). WithConfig ( & cfg . Alerting . Remediation )
7496 go rem .Start (ctx , alertCh )
7597
98+ // start Prometheus anomaly detection scheduler
99+ promInterval := parseDuration (cfg .Alerting .Prometheus .SchedulerInterval , 6 * time .Hour )
100+ promStep := parseDuration (cfg .Alerting .Prometheus .QueryStep , time .Minute )
101+ promRange := parseDuration (cfg .Alerting .Prometheus .QueryRange , 6 * time .Hour )
102+ promCfg := healthcheck .NewPrometheusConfigFromApp (& cfg .Alerting .Prometheus )
103+ promClient := healthcheck .NewPrometheusClient (promCfg )
104+ go healthcheck .StartPrometheusScheduler (ctx , healthcheck.PrometheusDeps {
105+ DB : alertDB ,
106+ PrometheusClient : promClient ,
107+ Interval : promInterval ,
108+ QueryStep : promStep ,
109+ QueryRange : promRange ,
110+ RulesetBase : cfg .Alerting .Ruleset .APIBase ,
111+ RulesetTimeout : parseDuration (cfg .Alerting .Ruleset .APITimeout , 10 * time .Second ),
112+ })
113+
76114 router := fox .New ()
77115 router .Use (middleware .Authentication )
78116 alertapi .NewApiWithConfig (router , cfg )
0 commit comments