Skip to content

Commit 327a46d

Browse files
authored
fix race condition issues with health checks (#460)
* fix race condition issues with health checks * better log message when skipping heartbeat
1 parent 5c6a24d commit 327a46d

File tree

5 files changed

+29
-18
lines changed

5 files changed

+29
-18
lines changed

pkg/cmd/kube-router.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,6 @@ func (kr *KubeRouter) Run() error {
8787
os.Exit(0)
8888
}
8989

90-
hc, err := healthcheck.NewHealthController(kr.Config)
91-
if err != nil {
92-
return errors.New("Failed to create health controller: " + err.Error())
93-
}
94-
wg.Add(1)
95-
go hc.Run(healthChan, stopCh, &wg)
96-
9790
if (kr.Config.MetricsPort > 0) && (kr.Config.MetricsPort <= 65535) {
9891
kr.Config.MetricsEnabled = true
9992
mc, err := metrics.NewMetricsController(kr.Client, kr.Config)
@@ -165,6 +158,13 @@ func (kr *KubeRouter) Run() error {
165158
go nsc.Run(healthChan, stopCh, &wg)
166159
}
167160

161+
hc, err := healthcheck.NewHealthController(kr.Config)
162+
if err != nil {
163+
return errors.New("Failed to create health controller: " + err.Error())
164+
}
165+
wg.Add(1)
166+
go hc.Run(healthChan, stopCh, &wg)
167+
168168
// Handle SIGINT and SIGTERM
169169
ch := make(chan os.Signal)
170170
signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)

pkg/controllers/netpol/network_policy_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,8 @@ func (npc *NetworkPolicyController) Run(healthChan chan<- *healthcheck.Controlle
136136
glog.V(1).Info("Performing periodic sync of iptables to reflect network policies")
137137
err := npc.Sync()
138138
if err != nil {
139-
glog.Errorf("Error during periodic sync: " + err.Error())
139+
glog.Errorf("Error during periodic sync of network policies in network policy controller. Error: " + err.Error())
140+
glog.Errorf("Skipping sending heartbeat from network policy controller as periodic sync failed.")
140141
} else {
141142
healthcheck.SendHeartBeat(healthChan, "NPC")
142143
}

pkg/controllers/proxy/network_services_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ func (nsc *NetworkServicesController) Run(healthChan chan<- *healthcheck.Control
265265
glog.V(1).Info("Performing periodic sync of ipvs services")
266266
err := nsc.sync()
267267
if err != nil {
268-
glog.Errorf("Error during periodic ipvs sync: " + err.Error())
268+
glog.Errorf("Error during periodic ipvs sync in network service controller. Error: " + err.Error())
269+
glog.Errorf("Skipping sending heartbeat from network service controller as periodic sync failed.")
269270
} else {
270271
healthcheck.SendHeartBeat(healthChan, "NSC")
271272
}

pkg/controllers/routing/network_routes_controller.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ func (nrc *NetworkRoutingController) Run(healthChan chan<- *healthcheck.Controll
216216

217217
// loop forever till notified to stop on stopCh
218218
for {
219+
var err error
219220
select {
220221
case <-stopCh:
221222
glog.Infof("Shutting down network routes controller")
@@ -226,7 +227,7 @@ func (nrc *NetworkRoutingController) Run(healthChan chan<- *healthcheck.Controll
226227
// Update ipset entries
227228
if nrc.enablePodEgress || nrc.enableOverlays {
228229
glog.V(1).Info("Syncing ipsets")
229-
err := nrc.syncNodeIPSets()
230+
err = nrc.syncNodeIPSets()
230231
if err != nil {
231232
glog.Errorf("Error synchronizing ipsets: %s", err.Error())
232233
}
@@ -257,7 +258,12 @@ func (nrc *NetworkRoutingController) Run(healthChan chan<- *healthcheck.Controll
257258
nrc.syncInternalPeers()
258259
}
259260

260-
healthcheck.SendHeartBeat(healthChan, "NRC")
261+
if err == nil {
262+
healthcheck.SendHeartBeat(healthChan, "NRC")
263+
} else {
264+
glog.Errorf("Error during periodic sync in network routing controller. Error: " + err.Error())
265+
glog.Errorf("Skipping sending heartbeat from network routing controller as periodic sync failed.")
266+
}
261267

262268
select {
263269
case <-stopCh:

pkg/healthcheck/health_controller.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,7 @@ func (hc *HealthController) CheckHealth() bool {
121121

122122
//Run starts the HealthController
123123
func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
124-
Started := time.Now()
125-
t := time.NewTicker(500 * time.Millisecond)
124+
t := time.NewTicker(5000 * time.Millisecond)
126125
defer wg.Done()
127126
glog.Info("Starting health controller")
128127

@@ -144,11 +143,10 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <
144143
hc.HTTPEnabled = false
145144
}
146145

146+
//Give the controllers a few seconds to start before checking health
147+
time.Sleep(60 * time.Second)
148+
147149
for {
148-
//Give the controllers a few seconds to start before checking health
149-
if time.Since(Started) > 5*time.Second {
150-
hc.Status.Healthy = hc.CheckHealth()
151-
}
152150
select {
153151
case <-stopCh:
154152
glog.Infof("Shutting down health controller")
@@ -163,6 +161,7 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <
163161
case <-t.C:
164162
glog.V(4).Info("Health controller tick")
165163
}
164+
hc.Status.Healthy = hc.CheckHealth()
166165
}
167166

168167
}
@@ -173,7 +172,11 @@ func NewHealthController(config *options.KubeRouterConfig) (*HealthController, e
173172
Config: config,
174173
HealthPort: config.HealthPort,
175174
Status: HealthStats{
176-
Healthy: false,
175+
Healthy: true,
176+
MetricsControllerAlive: time.Now(),
177+
NetworkPolicyControllerAlive: time.Now(),
178+
NetworkRoutingControllerAlive: time.Now(),
179+
NetworkServicesControllerAlive: time.Now(),
177180
},
178181
}
179182
return &hc, nil

0 commit comments

Comments
 (0)