Skip to content

Commit 17f92de

Browse files
roffe authored and murali-reddy committed
Health 1 (#463)
* rework healthcontroller timeouts
* change static duration
* varname
1 parent 58da2d4 commit 17f92de

File tree

2 files changed

+44
-28
lines changed

2 files changed

+44
-28
lines changed

pkg/cmd/kube-router.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ func (kr *KubeRouter) Run() error {
8282

8383
stopCh := make(chan struct{})
8484

85+
hc, err := healthcheck.NewHealthController(kr.Config)
86+
if err != nil {
87+
return errors.New("Failed to create health controller: " + err.Error())
88+
}
89+
wg.Add(1)
90+
go hc.Run(healthChan, stopCh, &wg)
91+
8592
if !(kr.Config.RunFirewall || kr.Config.RunServiceProxy || kr.Config.RunRouter) {
8693
glog.Info("Router, Firewall or Service proxy functionality must be specified. Exiting!")
8794
os.Exit(0)
@@ -158,13 +165,6 @@ func (kr *KubeRouter) Run() error {
158165
go nsc.Run(healthChan, stopCh, &wg)
159166
}
160167

161-
hc, err := healthcheck.NewHealthController(kr.Config)
162-
if err != nil {
163-
return errors.New("Failed to create health controller: " + err.Error())
164-
}
165-
wg.Add(1)
166-
go hc.Run(healthChan, stopCh, &wg)
167-
168168
// Handle SIGINT and SIGTERM
169169
ch := make(chan os.Signal)
170170
signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)

pkg/healthcheck/health_controller.go

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,14 @@ type HealthController struct {
2828
//HealthStats holds the latest heartbeats
2929
type HealthStats struct {
3030
sync.Mutex
31-
Healthy bool
32-
MetricsControllerAlive time.Time
33-
NetworkPolicyControllerAlive time.Time
34-
NetworkRoutingControllerAlive time.Time
35-
NetworkServicesControllerAlive time.Time
31+
Healthy bool
32+
MetricsControllerAlive time.Time
33+
NetworkPolicyControllerAlive time.Time
34+
NetworkPolicyControllerAliveTTL time.Duration
35+
NetworkRoutingControllerAlive time.Time
36+
NetworkRoutingControllerAliveTTL time.Duration
37+
NetworkServicesControllerAlive time.Time
38+
NetworkServicesControllerAliveTTL time.Duration
3639
}
3740

3841
//SendHeartBeat sends a heartbeat on the passed channel
@@ -73,37 +76,53 @@ func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
7376
hc.Status.Lock()
7477
defer hc.Status.Unlock()
7578

76-
switch component := beat.Component; component {
77-
case "NSC":
78-
hc.Status.NetworkServicesControllerAlive = time.Now()
79-
case "NRC":
80-
hc.Status.NetworkRoutingControllerAlive = time.Now()
81-
case "NPC":
82-
hc.Status.NetworkPolicyControllerAlive = time.Now()
83-
case "MC":
84-
hc.Status.MetricsControllerAlive = time.Now()
79+
switch {
80+
// The first heartbeat sets the initial grace time the controller has to report in. A static time is also added when checking, to allow for load variation in sync time.
81+
case beat.Component == "NSC":
82+
if hc.Status.NetworkServicesControllerAliveTTL == 0 {
83+
hc.Status.NetworkServicesControllerAliveTTL = time.Since(hc.Status.NetworkServicesControllerAlive)
84+
}
85+
hc.Status.NetworkServicesControllerAlive = beat.LastHeartBeat
86+
87+
case beat.Component == "NRC":
88+
if hc.Status.NetworkRoutingControllerAliveTTL == 0 {
89+
hc.Status.NetworkRoutingControllerAliveTTL = time.Since(hc.Status.NetworkRoutingControllerAlive)
90+
}
91+
hc.Status.NetworkRoutingControllerAlive = beat.LastHeartBeat
92+
93+
case beat.Component == "NPC":
94+
if hc.Status.NetworkPolicyControllerAliveTTL == 0 {
95+
hc.Status.NetworkPolicyControllerAliveTTL = time.Since(hc.Status.NetworkPolicyControllerAlive)
96+
}
97+
hc.Status.NetworkPolicyControllerAlive = beat.LastHeartBeat
98+
99+
case beat.Component == "MC":
100+
hc.Status.MetricsControllerAlive = beat.LastHeartBeat
85101
}
86102
}
87103

88-
//CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
104+
// CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
89105
func (hc *HealthController) CheckHealth() bool {
90106
health := true
107+
graceTime := time.Duration(1500 * time.Millisecond)
108+
91109
if hc.Config.RunFirewall {
92-
if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+5*time.Second {
110+
111+
if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+hc.Status.NetworkPolicyControllerAliveTTL+graceTime {
93112
glog.Error("Network Policy Controller heartbeat missed")
94113
health = false
95114
}
96115
}
97116

98117
if hc.Config.RunRouter {
99-
if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+5*time.Second {
118+
if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+hc.Status.NetworkRoutingControllerAliveTTL+graceTime {
100119
glog.Error("Network Routing Controller heartbeat missed")
101120
health = false
102121
}
103122
}
104123

105124
if hc.Config.RunServiceProxy {
106-
if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+5*time.Second {
125+
if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+hc.Status.NetworkServicesControllerAliveTTL+graceTime {
107126
glog.Error("NetworkService Controller heartbeat missed")
108127
health = false
109128
}
@@ -143,9 +162,6 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <
143162
hc.HTTPEnabled = false
144163
}
145164

146-
//Give the controllers a few seconds to start before checking health
147-
time.Sleep(60 * time.Second)
148-
149165
for {
150166
select {
151167
case <-stopCh:

0 commit comments

Comments
 (0)