@@ -28,11 +28,14 @@ type HealthController struct {
2828//HealthStats holds the latest heartbeats
2929type HealthStats struct {
3030 sync.Mutex
31- Healthy bool
32- MetricsControllerAlive time.Time
33- NetworkPolicyControllerAlive time.Time
34- NetworkRoutingControllerAlive time.Time
35- NetworkServicesControllerAlive time.Time
31+ Healthy bool
32+ MetricsControllerAlive time.Time
33+ NetworkPolicyControllerAlive time.Time
34+ NetworkPolicyControllerAliveTTL time.Duration
35+ NetworkRoutingControllerAlive time.Time
36+ NetworkRoutingControllerAliveTTL time.Duration
37+ NetworkServicesControllerAlive time.Time
38+ NetworkServicesControllerAliveTTL time.Duration
3639}
3740
3841//SendHeartBeat sends a heartbeat on the passed channel
@@ -73,37 +76,53 @@ func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
7376 hc .Status .Lock ()
7477 defer hc .Status .Unlock ()
7578
76- switch component := beat .Component ; component {
77- case "NSC" :
78- hc .Status .NetworkServicesControllerAlive = time .Now ()
79- case "NRC" :
80- hc .Status .NetworkRoutingControllerAlive = time .Now ()
81- case "NPC" :
82- hc .Status .NetworkPolicyControllerAlive = time .Now ()
83- case "MC" :
84- hc .Status .MetricsControllerAlive = time .Now ()
79+ switch {
80+ // The first heartbeat sets the initial grace time the controller has to report in. A static time is also added when checking, to allow for load variation in sync time.
81+ case beat .Component == "NSC" :
82+ if hc .Status .NetworkServicesControllerAliveTTL == 0 {
83+ hc .Status .NetworkServicesControllerAliveTTL = time .Since (hc .Status .NetworkServicesControllerAlive )
84+ }
85+ hc .Status .NetworkServicesControllerAlive = beat .LastHeartBeat
86+
87+ case beat .Component == "NRC" :
88+ if hc .Status .NetworkRoutingControllerAliveTTL == 0 {
89+ hc .Status .NetworkRoutingControllerAliveTTL = time .Since (hc .Status .NetworkRoutingControllerAlive )
90+ }
91+ hc .Status .NetworkRoutingControllerAlive = beat .LastHeartBeat
92+
93+ case beat .Component == "NPC" :
94+ if hc .Status .NetworkPolicyControllerAliveTTL == 0 {
95+ hc .Status .NetworkPolicyControllerAliveTTL = time .Since (hc .Status .NetworkPolicyControllerAlive )
96+ }
97+ hc .Status .NetworkPolicyControllerAlive = beat .LastHeartBeat
98+
99+ case beat .Component == "MC" :
100+ hc .Status .MetricsControllerAlive = beat .LastHeartBeat
85101 }
86102}
87103
88- //CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
104+ // CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
89105func (hc * HealthController ) CheckHealth () bool {
90106 health := true
107+ graceTime := time .Duration (1500 * time .Millisecond )
108+
91109 if hc .Config .RunFirewall {
92- if time .Since (hc .Status .NetworkPolicyControllerAlive ) > hc .Config .IPTablesSyncPeriod + 5 * time .Second {
110+
111+ if time .Since (hc .Status .NetworkPolicyControllerAlive ) > hc .Config .IPTablesSyncPeriod + hc .Status .NetworkPolicyControllerAliveTTL + graceTime {
93112 glog .Error ("Network Policy Controller heartbeat missed" )
94113 health = false
95114 }
96115 }
97116
98117 if hc .Config .RunRouter {
99- if time .Since (hc .Status .NetworkRoutingControllerAlive ) > hc .Config .RoutesSyncPeriod + 5 * time . Second {
118+ if time .Since (hc .Status .NetworkRoutingControllerAlive ) > hc .Config .RoutesSyncPeriod + hc . Status . NetworkRoutingControllerAliveTTL + graceTime {
100119 glog .Error ("Network Routing Controller heartbeat missed" )
101120 health = false
102121 }
103122 }
104123
105124 if hc .Config .RunServiceProxy {
106- if time .Since (hc .Status .NetworkServicesControllerAlive ) > hc .Config .IpvsSyncPeriod + 5 * time . Second {
125+ if time .Since (hc .Status .NetworkServicesControllerAlive ) > hc .Config .IpvsSyncPeriod + hc . Status . NetworkServicesControllerAliveTTL + graceTime {
107126 glog .Error ("NetworkService Controller heartbeat missed" )
108127 health = false
109128 }
@@ -143,9 +162,6 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <
143162 hc .HTTPEnabled = false
144163 }
145164
146- //Give the controllers a few seconds to start before checking health
147- time .Sleep (60 * time .Second )
148-
149165 for {
150166 select {
151167 case <- stopCh :
0 commit comments