
Commit a480a51

Merge pull request #298 from roffe/healthcheck
Healthchecks
2 parents f1cb675 + daefc81 commit a480a51

18 files changed: +361 -20 lines changed

Documentation/health.md

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# Health checking kube-router

kube-router currently has basic health checking in the form of heartbeats sent from each controller to the health controller each time its main loop completes successfully.

The health port defaults to 20244 but can be changed with the startup option:

    --health-port=<port number>

The health path is `/healthz`.

If the port is set to 0 (zero), no HTTP endpoint is made available, but the health controller still runs and prints any missed heartbeats to kube-router's STDERR.

If a controller does not send a heartbeat within its sync period + 5 seconds, the component is flagged as unhealthy.

If any of the running components is failing, the whole kube-router state is marked as failed at the `/healthz` endpoint.

E.g. if kube-router is started with

    --run-router=true
    --run-firewall=true
    --run-service-proxy=true

and the routing controller, policy controller, or service controller exits its main loop and does not publish a heartbeat, the `/healthz` endpoint returns an error 500, signaling that kube-router is not healthy.
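
Any plain HTTP client can consume the endpoint. Below is a minimal Go sketch (illustrative only, not part of this change) that probes a locally running kube-router on the default port and treats any non-200 response, including the 500 returned when unhealthy, as a failed check.

```go
// probe.go -- illustrative only; mirrors what an external check against
// kube-router's /healthz endpoint could look like.
package main

import (
    "fmt"
    "net/http"
    "os"
    "time"
)

func main() {
    client := &http.Client{Timeout: 2 * time.Second}

    // Assumes kube-router runs locally with the default --health-port=20244.
    resp, err := client.Get("http://127.0.0.1:20244/healthz")
    if err != nil {
        fmt.Fprintf(os.Stderr, "kube-router unreachable: %v\n", err)
        os.Exit(1)
    }
    defer resp.Body.Close()

    // The health controller answers 200 when all heartbeats are current and
    // 500 otherwise, so any non-200 status counts as a failed check here.
    if resp.StatusCode != http.StatusOK {
        fmt.Fprintf(os.Stderr, "kube-router unhealthy: HTTP %d\n", resp.StatusCode)
        os.Exit(1)
    }
    fmt.Println("kube-router healthy")
}
```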
app/controllers/health_controller.go

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
package controllers

import (
    "net/http"
    "strconv"
    "sync"
    "time"

    "github.com/cloudnativelabs/kube-router/app/options"
    "github.com/golang/glog"
    "golang.org/x/net/context"
)

// ControllerHeartbeat is the structure to hold the heartbeats sent by controllers
type ControllerHeartbeat struct {
    Component     string
    Lastheartbeat time.Time
}

// HealthController reports the health of the controller loops as an HTTP endpoint
type HealthController struct {
    HealthPort  uint16
    HTTPenabled bool
    Status      HealthStats
    Config      *options.KubeRouterConfig
}

// HealthStats holds the latest heartbeats
type HealthStats struct {
    sync.Mutex
    Healthy                        bool
    MetricsControllerAlive         time.Time
    NetworkPolicyControllerAlive   time.Time
    NetworkRoutingControllerAlive  time.Time
    NetworkServicesControllerAlive time.Time
}

// sendHeartBeat sends a heartbeat on the passed channel
func sendHeartBeat(channel chan<- *ControllerHeartbeat, controller string) {
    heartbeat := ControllerHeartbeat{
        Component:     controller,
        Lastheartbeat: time.Now(),
    }
    channel <- &heartbeat
}

// Handler writes HTTP responses to the health path
func (hc *HealthController) Handler(w http.ResponseWriter, req *http.Request) {
    if hc.Status.Healthy {
        w.WriteHeader(http.StatusOK)
        w.Write([]byte("OK\n"))
    } else {
        w.WriteHeader(http.StatusInternalServerError)
        /*
            statusText := fmt.Sprintf("Service controller last alive %s\n ago"+
                "Routing controller last alive: %s\n ago"+
                "Policy controller last alive: %s\n ago"+
                "Metrics controller last alive: %s\n ago",
                time.Since(hc.Status.NetworkServicesControllerAlive),
                time.Since(hc.Status.NetworkRoutingControllerAlive),
                time.Since(hc.Status.NetworkPolicyControllerAlive),
                time.Since(hc.Status.MetricsControllerAlive))
            w.Write([]byte(statusText))
        */
        w.Write([]byte("Unhealthy"))
    }
}

// HandleHeartbeat handles received heartbeats on the health channel
func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
    glog.V(3).Infof("Received heartbeat from %s", beat.Component)

    hc.Status.Lock()
    defer hc.Status.Unlock()

    switch component := beat.Component; component {
    case "NSC":
        hc.Status.NetworkServicesControllerAlive = time.Now()
    case "NRC":
        hc.Status.NetworkRoutingControllerAlive = time.Now()
    case "NPC":
        hc.Status.NetworkPolicyControllerAlive = time.Now()
    case "MC":
        hc.Status.MetricsControllerAlive = time.Now()
    }
}

// CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
func (hc *HealthController) CheckHealth() bool {
    health := true
    if hc.Config.RunFirewall {
        if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+5*time.Second {
            glog.Error("Network Policy Controller heartbeat missed")
            health = false
        }
    }

    if hc.Config.RunRouter {
        if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+5*time.Second {
            glog.Error("Network Routing Controller heartbeat missed")
            health = false
        }
    }

    if hc.Config.RunServiceProxy {
        if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+5*time.Second {
            glog.Error("NetworkService Controller heartbeat missed")
            health = false
        }
    }

    if hc.Config.MetricsEnabled {
        if time.Since(hc.Status.MetricsControllerAlive) > 5*time.Second {
            glog.Error("Metrics Controller heartbeat missed")
            health = false
        }
    }

    return health
}

// Run starts the HealthController
func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
    Started := time.Now()
    t := time.NewTicker(500 * time.Millisecond)
    defer wg.Done()
    glog.Info("Starting health controller")

    srv := &http.Server{Addr: ":" + strconv.Itoa(int(hc.HealthPort)), Handler: http.DefaultServeMux}

    http.HandleFunc("/healthz", hc.Handler)

    if (hc.Config.HealthPort > 0) && (hc.Config.HealthPort <= 65535) {
        hc.HTTPenabled = true
        go func() {
            if err := srv.ListenAndServe(); err != nil {
                // cannot panic, because this probably is an intentional close
                glog.Errorf("Health controller error: %s", err)
            }
        }()
    } else if hc.Config.MetricsPort > 65535 {
        glog.Errorf("Metrics port must be over 0 and under 65535, given port: %d", hc.Config.MetricsPort)
    } else {
        hc.HTTPenabled = false
    }

    for {
        // Give the controllers a few seconds to start before checking health
        if time.Since(Started) > 5*time.Second {
            hc.Status.Healthy = hc.CheckHealth()
        }
        select {
        case <-stopCh:
            glog.Infof("Shutting down health controller")
            if hc.HTTPenabled {
                if err := srv.Shutdown(context.Background()); err != nil {
                    glog.Errorf("could not shutdown: %v", err)
                }
            }
            return nil
        case heartbeat := <-healthChan:
            hc.HandleHeartbeat(heartbeat)
        case <-t.C:
            glog.V(4).Info("Health controller tick")
        }
    }
}

// NewHealthController creates a new health controller and returns a reference to it
func NewHealthController(config *options.KubeRouterConfig) (*HealthController, error) {
    hc := HealthController{
        Config:     config,
        HealthPort: config.HealthPort,
        Status: HealthStats{
            Healthy: false,
        },
    }
    return &hc, nil
}
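
For readers looking only at this diff, the following sketch shows roughly how the health controller is meant to be wired in alongside the other controllers. It is not code from this commit: the hard-coded config, the channel buffer size, and the `npc` controller referenced in the closing comment are illustrative assumptions.

```go
// Illustrative wiring only -- not part of this commit. Assumes the import
// paths used elsewhere in this change; the hard-coded config and channel
// buffer size are placeholder choices.
package main

import (
    "sync"

    "github.com/cloudnativelabs/kube-router/app/controllers"
    "github.com/cloudnativelabs/kube-router/app/options"
    "github.com/golang/glog"
)

func main() {
    // In kube-router proper this comes from parsed command-line flags.
    config := &options.KubeRouterConfig{HealthPort: 20244}

    stopCh := make(chan struct{})
    var wg sync.WaitGroup

    // Buffered so controllers never block while publishing a heartbeat;
    // the size is an arbitrary choice for this sketch.
    healthChan := make(chan *controllers.ControllerHeartbeat, 10)

    hc, err := controllers.NewHealthController(config)
    if err != nil {
        glog.Fatalf("Failed to create health controller: %s", err)
    }

    wg.Add(1)
    go hc.Run(healthChan, stopCh, &wg) // serves /healthz and evaluates heartbeats

    // Each enabled controller is started with the same channel and publishes
    // a heartbeat after every successful sync, for example:
    //   wg.Add(1)
    //   go npc.Run(healthChan, stopCh, &wg) // npc: a NetworkPolicyController

    wg.Wait() // in kube-router, closing stopCh shuts everything down
}
```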

app/controllers/metrics_controller.go

Lines changed: 15 additions & 7 deletions
@@ -5,6 +5,7 @@ import (
     "net/http"
     "strconv"
     "sync"
+    "time"
 
     "github.com/cloudnativelabs/kube-router/app/options"
     "github.com/golang/glog"
@@ -118,7 +119,8 @@ type MetricsController struct {
 }
 
 // Run prometheus metrics controller
-func (mc *MetricsController) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) error {
+func (mc *MetricsController) Run(healthChan chan<- *ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
+    t := time.NewTicker(3 * time.Second)
     defer wg.Done()
     glog.Info("Starting metrics controller")
@@ -136,13 +138,19 @@ func (mc *MetricsController) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) err
             glog.Errorf("Metrics controller error: %s", err)
         }
     }()
-
-    <-stopCh
-    glog.Infof("Shutting down metrics controller")
-    if err := srv.Shutdown(context.Background()); err != nil {
-        glog.Errorf("could not shutdown: %v", err)
+    for {
+        sendHeartBeat(healthChan, "MC")
+        select {
+        case <-stopCh:
+            glog.Infof("Shutting down metrics controller")
+            if err := srv.Shutdown(context.Background()); err != nil {
+                glog.Errorf("could not shutdown: %v", err)
+            }
+            return nil
+        case <-t.C:
+            glog.V(4).Info("Metrics controller tick")
+        }
     }
-    return nil
 }
 
 // NewMetricsController returns new MetricController object

app/controllers/network_policy_controller.go

Lines changed: 3 additions & 1 deletion
@@ -100,7 +100,7 @@ type protocolAndPort struct {
 }
 
 // Run runs forver till we receive notification on stopCh
-func (npc *NetworkPolicyController) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) {
+func (npc *NetworkPolicyController) Run(healthChan chan<- *ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) {
     t := time.NewTicker(npc.syncPeriod)
     defer t.Stop()
     defer wg.Done()
@@ -121,6 +121,8 @@ func (npc *NetworkPolicyController) Run(stopCh <-chan struct{}, wg *sync.WaitGro
             err := npc.Sync()
             if err != nil {
                 glog.Errorf("Error during periodic sync: " + err.Error())
+            } else {
+                sendHeartBeat(healthChan, "NPC")
             }
         } else {
             continue

app/controllers/network_routes_controller.go

Lines changed: 3 additions & 1 deletion
@@ -82,7 +82,7 @@ const (
 )
 
 // Run runs forever until we are notified on stop channel
-func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) {
+func (nrc *NetworkRoutingController) Run(healthChan chan<- *ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) {
     cidr, err := utils.GetPodCidrFromCniSpec("/etc/cni/net.d/10-kuberouter.conf")
     if err != nil {
         glog.Errorf("Failed to get pod CIDR from CNI conf file: %s", err.Error())
@@ -254,6 +254,8 @@ func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGr
             nrc.syncInternalPeers()
         }
 
+        sendHeartBeat(healthChan, "NRC")
+
         select {
         case <-stopCh:
             glog.Infof("Shutting down network routes controller")

app/controllers/network_services_controller.go

Lines changed: 18 additions & 5 deletions
@@ -99,7 +99,7 @@ type endpointsInfo struct {
 type endpointsInfoMap map[string][]endpointsInfo
 
 // Run periodically sync ipvs configuration to reflect desired state of services and endpoints
-func (nsc *NetworkServicesController) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) error {
+func (nsc *NetworkServicesController) Run(healthChan chan<- *ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
 
     t := time.NewTicker(nsc.syncPeriod)
     defer t.Stop()
@@ -130,7 +130,12 @@ func (nsc *NetworkServicesController) Run(stopCh <-chan struct{}, wg *sync.WaitG
 
         if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() {
             glog.V(1).Info("Performing periodic sync of ipvs services")
-            nsc.sync()
+            err := nsc.sync()
+            if err != nil {
+                glog.Errorf("Error during periodic ipvs sync: " + err.Error())
+            } else {
+                sendHeartBeat(healthChan, "NSC")
+            }
         } else {
             continue
         }
@@ -144,20 +149,28 @@ func (nsc *NetworkServicesController) Run(stopCh <-chan struct{}, wg *sync.WaitG
     }
 }
 
-func (nsc *NetworkServicesController) sync() {
+func (nsc *NetworkServicesController) sync() error {
+    var err error
     nsc.mu.Lock()
     defer nsc.mu.Unlock()
 
     nsc.serviceMap = buildServicesInfo()
     nsc.endpointsMap = buildEndpointsInfo()
-    err := nsc.syncHairpinIptablesRules()
+    err = nsc.syncHairpinIptablesRules()
     if err != nil {
         glog.Errorf("Error syncing hairpin iptable rules: %s", err.Error())
     }
-    nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap)
+
+    err = nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap)
+    if err != nil {
+        glog.Errorf("Error syncing IPVS services: %s", err.Error())
+        return err
+    }
+
     if nsc.MetricsEnabled {
         nsc.publishMetrics(nsc.serviceMap)
     }
+    return nil
 }
 
 func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoMap) error {

app/options/options.go

Lines changed: 3 additions & 1 deletion
@@ -41,6 +41,7 @@ type KubeRouterConfig struct {
     MetricsPort uint16
     MetricsPath string
     VLevel      string
+    HealthPort  uint16
     // FullMeshPassword string
 }
 
@@ -113,10 +114,11 @@ func (s *KubeRouterConfig) AddFlags(fs *pflag.FlagSet) {
         "Password for authenticating against the BGP peer defined with \"--peer-router-ips\".")
     fs.BoolVar(&s.EnablePprof, "enable-pprof", false,
         "Enables pprof for debugging performance and memory leak issues.")
-    fs.Uint16Var(&s.MetricsPort, "metrics-port", 0, "Prometheus metrics port, 0 = Disabled")
+    fs.Uint16Var(&s.MetricsPort, "metrics-port", 0, "Prometheus metrics port, (Default 0, Disabled)")
     fs.StringVar(&s.MetricsPath, "metrics-path", "/metrics", "Prometheus metrics path")
     // fs.StringVar(&s.FullMeshPassword, "nodes-full-mesh-password", s.FullMeshPassword,
     //     "Password that cluster-node BGP servers will use to authenticate one another when \"--nodes-full-mesh\" is set.")
     fs.StringVarP(&s.VLevel, "v", "v", "0", "log level for V logs")
+    fs.Uint16Var(&s.HealthPort, "health-port", 20244, "Health check port, 0 = Disabled")
 
 }
