Skip to content

Commit 1fe77cf

Browse files
authored
fix: set context timeout for resolvePeers (#4343)
* fix: set context timeout for resolvePeers. resolvePeers is context-aware, yet it is called with context.Background(), which may lead to a blocking call. This is because `LookupIPAddr` is called under the hood to perform a DNS lookup, which may be blocked on some platforms. If DNS lookups are blocked, the goroutine hangs; because the main goroutine is used for the call, the application hangs and becomes unresponsive, and the readiness check fails. --------- Signed-off-by: emreya <[email protected]>
1 parent fe2c526 commit 1fe77cf

File tree

4 files changed

+40
-19
lines changed

4 files changed

+40
-19
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ be configured to communicate with each other. This is configured using the
358358
- `--cluster.advertise-address` string: cluster advertise address
359359
- `--cluster.peer` value: initial peers (repeat flag for each additional peer)
360360
- `--cluster.peer-timeout` value: peer timeout period (default "15s")
361+
- `--cluster.peers-resolve-timeout` value: peers resolve timeout period (default "15s")
361362
- `--cluster.gossip-interval` value: cluster message propagation speed
362363
(default "200ms")
363364
- `--cluster.pushpull-interval` value: lower values will increase

cluster/cluster.go

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ type Peer struct {
6060
mlist *memberlist.Memberlist
6161
delegate *delegate
6262

63-
resolvedPeers []string
63+
resolvedPeers []string
64+
resolvePeersTimeout time.Duration
6465

6566
mtx sync.RWMutex
6667
states map[string]State
@@ -117,15 +118,16 @@ func (s PeerStatus) String() string {
117118
}
118119

119120
const (
120-
DefaultPushPullInterval = 60 * time.Second
121-
DefaultGossipInterval = 200 * time.Millisecond
122-
DefaultTCPTimeout = 10 * time.Second
123-
DefaultProbeTimeout = 500 * time.Millisecond
124-
DefaultProbeInterval = 1 * time.Second
125-
DefaultReconnectInterval = 10 * time.Second
126-
DefaultReconnectTimeout = 6 * time.Hour
127-
DefaultRefreshInterval = 15 * time.Second
128-
MaxGossipPacketSize = 1400
121+
DefaultPushPullInterval = 60 * time.Second
122+
DefaultGossipInterval = 200 * time.Millisecond
123+
DefaultTCPTimeout = 10 * time.Second
124+
DefaultProbeTimeout = 500 * time.Millisecond
125+
DefaultProbeInterval = 1 * time.Second
126+
DefaultReconnectInterval = 10 * time.Second
127+
DefaultReconnectTimeout = 6 * time.Hour
128+
DefaultRefreshInterval = 15 * time.Second
129+
DefaultResolvePeersTimeout = 15 * time.Second
130+
MaxGossipPacketSize = 1400
129131
)
130132

131133
func Create(
@@ -138,6 +140,7 @@ func Create(
138140
pushPullInterval time.Duration,
139141
gossipInterval time.Duration,
140142
tcpTimeout time.Duration,
143+
resolveTimeout time.Duration,
141144
probeTimeout time.Duration,
142145
probeInterval time.Duration,
143146
tlsTransportConfig *TLSTransportConfig,
@@ -168,7 +171,9 @@ func Create(
168171
}
169172
}
170173

171-
resolvedPeers, err := resolvePeers(context.Background(), knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty)
174+
ctx, cancel := context.WithTimeout(context.Background(), resolveTimeout)
175+
defer cancel()
176+
resolvedPeers, err := resolvePeers(ctx, knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty)
172177
if err != nil {
173178
return nil, fmt.Errorf("resolve peers: %w", err)
174179
}
@@ -199,13 +204,14 @@ func Create(
199204
}
200205

201206
p := &Peer{
202-
states: map[string]State{},
203-
stopc: make(chan struct{}),
204-
readyc: make(chan struct{}),
205-
logger: l,
206-
peers: map[string]peer{},
207-
resolvedPeers: resolvedPeers,
208-
knownPeers: knownPeers,
207+
states: map[string]State{},
208+
stopc: make(chan struct{}),
209+
readyc: make(chan struct{}),
210+
logger: l,
211+
peers: map[string]peer{},
212+
resolvedPeers: resolvedPeers,
213+
resolvePeersTimeout: resolveTimeout,
214+
knownPeers: knownPeers,
209215
}
210216

211217
p.register(reg, name)
@@ -445,7 +451,9 @@ func (p *Peer) reconnect() {
445451
func (p *Peer) refresh() {
446452
logger := p.logger.With("msg", "refresh")
447453

448-
resolvedPeers, err := resolvePeers(context.Background(), p.knownPeers, p.advertiseAddr, &net.Resolver{}, false)
454+
ctx, cancel := context.WithTimeout(context.Background(), p.resolvePeersTimeout)
455+
defer cancel()
456+
resolvedPeers, err := resolvePeers(ctx, p.knownPeers, p.advertiseAddr, &net.Resolver{}, false)
449457
if err != nil {
450458
logger.Debug(fmt.Sprintf("%v", p.knownPeers), "err", err)
451459
return

cluster/cluster_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) {
5353
DefaultPushPullInterval,
5454
DefaultGossipInterval,
5555
DefaultTCPTimeout,
56+
DefaultResolvePeersTimeout,
5657
DefaultProbeTimeout,
5758
DefaultProbeInterval,
5859
nil,
@@ -89,6 +90,7 @@ func testJoinLeave(t *testing.T) {
8990
DefaultPushPullInterval,
9091
DefaultGossipInterval,
9192
DefaultTCPTimeout,
93+
DefaultResolvePeersTimeout,
9294
DefaultProbeTimeout,
9395
DefaultProbeInterval,
9496
nil,
@@ -126,6 +128,7 @@ func testReconnect(t *testing.T) {
126128
DefaultPushPullInterval,
127129
DefaultGossipInterval,
128130
DefaultTCPTimeout,
131+
DefaultResolvePeersTimeout,
129132
DefaultProbeTimeout,
130133
DefaultProbeInterval,
131134
nil,
@@ -153,6 +156,7 @@ func testReconnect(t *testing.T) {
153156
DefaultPushPullInterval,
154157
DefaultGossipInterval,
155158
DefaultTCPTimeout,
159+
DefaultResolvePeersTimeout,
156160
DefaultProbeTimeout,
157161
DefaultProbeInterval,
158162
nil,
@@ -195,6 +199,7 @@ func testRemoveFailedPeers(t *testing.T) {
195199
DefaultPushPullInterval,
196200
DefaultGossipInterval,
197201
DefaultTCPTimeout,
202+
DefaultResolvePeersTimeout,
198203
DefaultProbeTimeout,
199204
DefaultProbeInterval,
200205
nil,
@@ -248,6 +253,7 @@ func testInitiallyFailingPeers(t *testing.T) {
248253
DefaultPushPullInterval,
249254
DefaultGossipInterval,
250255
DefaultTCPTimeout,
256+
DefaultResolvePeersTimeout,
251257
DefaultProbeTimeout,
252258
DefaultProbeInterval,
253259
nil,
@@ -297,6 +303,7 @@ func testTLSConnection(t *testing.T) {
297303
DefaultPushPullInterval,
298304
DefaultGossipInterval,
299305
DefaultTCPTimeout,
306+
DefaultResolvePeersTimeout,
300307
DefaultProbeTimeout,
301308
DefaultProbeInterval,
302309
tlsTransportConfig1,
@@ -330,6 +337,7 @@ func testTLSConnection(t *testing.T) {
330337
DefaultPushPullInterval,
331338
DefaultGossipInterval,
332339
DefaultTCPTimeout,
340+
DefaultResolvePeersTimeout,
333341
DefaultProbeTimeout,
334342
DefaultProbeInterval,
335343
tlsTransportConfig2,
@@ -369,6 +377,7 @@ func testPeerNames(t *testing.T, name1, name2 string) {
369377
DefaultPushPullInterval,
370378
DefaultGossipInterval,
371379
DefaultTCPTimeout,
380+
DefaultResolvePeersTimeout,
372381
DefaultProbeTimeout,
373382
DefaultProbeInterval,
374383
nil,
@@ -405,6 +414,7 @@ func testPeerNames(t *testing.T, name1, name2 string) {
405414
DefaultPushPullInterval,
406415
DefaultGossipInterval,
407416
DefaultTCPTimeout,
417+
DefaultResolvePeersTimeout,
408418
DefaultProbeTimeout,
409419
DefaultProbeInterval,
410420
nil,

cmd/alertmanager/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ func run() int {
159159
clusterPeerName = kingpin.Flag("cluster.peer-name", "Explicit name of the peer, rather than generating a random one").Default("").String()
160160
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
161161
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
162+
peersResolveTimeout = kingpin.Flag("cluster.peers-resolve-timeout", "Time to resolve peers.").Default(cluster.DefaultResolvePeersTimeout.String()).Duration()
162163
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
163164
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
164165
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTCPTimeout.String()).Duration()
@@ -245,6 +246,7 @@ func run() int {
245246
*pushPullInterval,
246247
*gossipInterval,
247248
*tcpTimeout,
249+
*peersResolveTimeout,
248250
*probeTimeout,
249251
*probeInterval,
250252
tlsTransportConfig,

0 commit comments

Comments
 (0)