@@ -25,8 +25,10 @@ import (
2525const loggerName = "salud"
2626
2727const (
28- wakeup = time .Minute * 5
2928 requestTimeout = time .Second * 10
29+ initialBackoffDelay = 10 * time .Second
30+ maxBackoffDelay = 5 * time .Minute
31+ backoffFactor = 2
3032 DefaultMinPeersPerBin = 4
3133 DefaultDurPercentile = 0.4 // consider 40% as healthy, lower percentile = stricter duration check
3234 DefaultConnsPercentile = 0.8 // consider 80% as healthy, lower percentile = stricter conns check
@@ -97,14 +99,20 @@ func (s *service) worker(startupStabilizer stabilization.Subscriber, mode string
9799 s .logger .Debug ("node warmup check completed" )
98100 }
99101
100- for {
102+ currentDelay := initialBackoffDelay
101103
104+ for {
102105 s .salud (mode , minPeersPerbin , durPercentile , connsPercentile )
103106
104107 select {
105108 case <- s .quit :
106109 return
107- case <- time .After (wakeup ):
110+ case <- time .After (currentDelay ):
111+ }
112+
113+ currentDelay *= time .Duration (backoffFactor )
114+ if currentDelay > maxBackoffDelay {
115+ currentDelay = maxBackoffDelay
108116 }
109117 }
110118}
@@ -135,7 +143,7 @@ func (s *service) salud(mode string, minPeersPerbin int, durPercentile float64,
135143 bins [swarm .MaxBins ]int
136144 )
137145
138- _ = s .topology .EachConnectedPeer (func (addr swarm.Address , bin uint8 ) (stop bool , jumpToNext bool , err error ) {
146+ err := s .topology .EachConnectedPeer (func (addr swarm.Address , bin uint8 ) (stop bool , jumpToNext bool , err error ) {
139147 wg .Add (1 )
140148 go func () {
141149 defer wg .Done ()
@@ -164,6 +172,9 @@ func (s *service) salud(mode string, minPeersPerbin int, durPercentile float64,
164172 }()
165173 return false , false , nil
166174 }, topology.Select {})
175+ if err != nil {
176+ s .logger .Error (err , "error iterating over connected peers" , "mode" , mode )
177+ }
167178
168179 wg .Wait ()
169180
0 commit comments