Skip to content

Commit 2faa714

Browse files
authored
Remove PeerFinder request timeout (#134365)
There's really no need to time out so enthusiastically here, we can wait for as long as it takes to receive a list of other discovery targets from the remote peer. This commit removes the timeout, deferring failure detection down to the TCP level (e.g. keepalives) as managed by the OS. Relates #132713, #123568
1 parent cd6e672 commit 2faa714

File tree

4 files changed

+17
-51
lines changed

4 files changed

+17
-51
lines changed

docs/changelog/134365.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
pr: 134365
2+
summary: Remove `PeerFinder` request timeout
3+
area: Cluster Coordination
4+
type: deprecation
5+
issues: []
6+
deprecation:
7+
title: Remove `PeerFinder` request timeout
8+
area: Cluster and node setting
9+
details: >-
10+
There is no need to time out requests sent by the `PeerFinder` during discovery and cluster formation, and this
11+
timeout may sometimes cause spurious failures. With this change the `PeerFinder` requests will wait indefinitely for
12+
responses. The `discovery.request_peers_timeout` setting no longer has any effect.
13+
impact: Discontinue use of the `discovery.request_peers_timeout` setting.

docs/reference/elasticsearch/configuration-reference/discovery-cluster-formation-settings.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ If you adjust these settings then your cluster may not form correctly or may bec
5050
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long to wait when attempting to identify the remote node via a handshake. Defaults to `30s`.
5151

5252
`discovery.request_peers_timeout`
53-
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long a node will wait after asking its peers again before considering the request to have failed. Defaults to `3s`.
53+
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) In 9.1.x and earlier versions, sets how long a node will wait after asking its peers again before considering the request to have failed. Has no effect from version 9.2.0 onwards. Defaults to `3s`.
5454

5555
`discovery.find_peers_warning_timeout`
5656
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long a node will attempt to discover its peers before it starts to log verbose messages describing why the connection attempts are failing. Defaults to `3m`.

server/src/main/java/org/elasticsearch/discovery/PeerFinder.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ public abstract class PeerFinder {
6666
"discovery.request_peers_timeout",
6767
TimeValue.timeValueMillis(3000),
6868
TimeValue.timeValueMillis(1),
69-
Setting.Property.NodeScope
69+
Setting.Property.NodeScope,
70+
Setting.Property.Deprecated
7071
);
7172

7273
// We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
@@ -81,7 +82,6 @@ public abstract class PeerFinder {
8182
);
8283

8384
private final TimeValue findPeersInterval;
84-
private final TimeValue requestPeersTimeout;
8585
private final TimeValue verbosityIncreaseTimeout;
8686

8787
private final Object mutex = new Object();
@@ -106,7 +106,6 @@ public PeerFinder(
106106
ConfiguredHostsResolver configuredHostsResolver
107107
) {
108108
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
109-
requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(settings);
110109
verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING.get(settings);
111110
this.transportService = transportService;
112111
this.clusterCoordinationExecutor = transportService.getThreadPool().executor(Names.CLUSTER_COORDINATION);
@@ -571,7 +570,7 @@ public Executor executor() {
571570
discoveryNode,
572571
REQUEST_PEERS_ACTION_NAME,
573572
new PeersRequest(getLocalNode(), knownNodes),
574-
TransportRequestOptions.timeout(requestPeersTimeout),
573+
TransportRequestOptions.EMPTY,
575574
peersResponseHandler
576575
);
577576
}

server/src/test/java/org/elasticsearch/discovery/PeerFinderTests.java

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -750,52 +750,6 @@ public void testDoesNotMakeMultipleConcurrentConnectionAttemptsToOneAddress() {
750750
assertFoundPeers(otherNode);
751751
}
752752

753-
public void testTimesOutAndRetriesConnectionsToBlackholedNodes() {
754-
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
755-
final DiscoveryNode nodeToFind = newDiscoveryNode("node-to-find");
756-
757-
providedAddresses.add(otherNode.getAddress());
758-
transportAddressConnector.addReachableNode(otherNode);
759-
transportAddressConnector.addReachableNode(nodeToFind);
760-
761-
peerFinder.activate(lastAcceptedNodes);
762-
763-
while (true) {
764-
deterministicTaskQueue.advanceTime();
765-
runAllRunnableTasks(); // MockTransportAddressConnector verifies no multiple connection attempts
766-
if (capturingTransport.getCapturedRequestsAndClear().length > 0) {
767-
break;
768-
}
769-
}
770-
771-
final long timeoutAtMillis = deterministicTaskQueue.getCurrentTimeMillis() + PeerFinder.DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(
772-
Settings.EMPTY
773-
).millis();
774-
while (deterministicTaskQueue.getCurrentTimeMillis() < timeoutAtMillis) {
775-
assertFoundPeers(otherNode);
776-
deterministicTaskQueue.advanceTime();
777-
runAllRunnableTasks();
778-
}
779-
780-
// need to wait for the connection to timeout, then for another wakeup, before discovering the peer
781-
final long expectedTime = timeoutAtMillis + PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(Settings.EMPTY).millis();
782-
783-
while (deterministicTaskQueue.getCurrentTimeMillis() < expectedTime) {
784-
deterministicTaskQueue.advanceTime();
785-
runAllRunnableTasks();
786-
}
787-
788-
respondToRequests(node -> {
789-
assertThat(node, is(otherNode));
790-
return new PeersResponse(Optional.empty(), singletonList(nodeToFind), randomNonNegativeLong());
791-
});
792-
793-
deterministicTaskQueue.advanceTime();
794-
runAllRunnableTasks();
795-
796-
assertFoundPeers(nodeToFind, otherNode);
797-
}
798-
799753
@TestLogging(reason = "testing logging at WARN level", value = "org.elasticsearch.discovery:WARN")
800754
public void testLogsWarningsIfActiveForLongEnough() throws IllegalAccessException {
801755
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");

0 commit comments

Comments
 (0)