Skip to content

Commit c15b385

Browse files
committed
Fix peers error handling
Do not exit the Start method with an error when a peers update fails, because doing so (1) potentially breaks watchdog-related unit tests, since the k8s manager will stop all components, which disarms the watchdog, and (2) will cause a pod restart in production for no reason. Retry with a quicker interval instead. Signed-off-by: Marc Sluiter <[email protected]>
1 parent 4e07717 commit c15b385

File tree

1 file changed

+17
-12
lines changed

1 file changed

+17
-12
lines changed

pkg/peers/peers.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ type Role int8
2727
const (
2828
Worker Role = iota
2929
ControlPlane
30+
31+
// this is used instead of "peerUpdateInterval" when peer update fails
32+
quickPeerUpdateInterval = 2 * time.Minute
3033
)
3134

3235
type Peers struct {
@@ -77,22 +80,24 @@ func (p *Peers) Start(ctx context.Context) error {
7780
p.controlPlanePeerSelector = createSelector(hostname, getControlPlaneLabel(myNode))
7881
}
7982

80-
var updatePeersError error
81-
cancellableCtx, cancel := context.WithCancel(ctx)
82-
8383
p.log.Info("peer starting", "name", p.myNodeName)
84-
wait.UntilWithContext(cancellableCtx, func(ctx context.Context) {
85-
updatePeersError = p.updateWorkerPeers(ctx)
86-
if updatePeersError != nil {
87-
cancel()
88-
}
89-
updatePeersError = p.updateControlPlanePeers(ctx)
90-
if updatePeersError != nil {
91-
cancel()
84+
wait.UntilWithContext(ctx, func(ctx context.Context) {
85+
updateWorkerPeersError := p.updateWorkerPeers(ctx)
86+
updateControlPlanePeersError := p.updateControlPlanePeers(ctx)
87+
if updateWorkerPeersError != nil || updateControlPlanePeersError != nil {
88+
// the default update interval is quite long, in case of an error we want to retry quicker
89+
quickCtx, quickCancel := context.WithCancel(ctx)
90+
wait.UntilWithContext(quickCtx, func(ctx context.Context) {
91+
quickUpdateWorkerPeersError := p.updateWorkerPeers(ctx)
92+
quickUpdateControlPlanePeersError := p.updateControlPlanePeers(ctx)
93+
if quickUpdateWorkerPeersError == nil && quickUpdateControlPlanePeersError == nil {
94+
quickCancel()
95+
}
96+
}, quickPeerUpdateInterval)
9297
}
9398
}, p.peerUpdateInterval)
9499

95-
return updatePeersError
100+
return nil
96101
}
97102

98103
func (p *Peers) updateWorkerPeers(ctx context.Context) error {

0 commit comments

Comments
 (0)