Skip to content

Commit 866114d

Browse files
DaveCTurnergmjehovich
authored andcommitted
Remove PeerFinder request timeout (elastic#134365)
There's really no need to time out so enthusiastically here, we can wait for as long as it takes to receive a list of other discovery targets from the remote peer. This commit removes the timeout, deferring failure detection down to the TCP level (e.g. keepalives) as managed by the OS. Relates elastic#132713, elastic#123568
1 parent 7e18952 commit 866114d

File tree

4 files changed

+17
-51
lines changed

4 files changed

+17
-51
lines changed

docs/changelog/134365.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
pr: 134365
2+
summary: Remove `PeerFinder` request timeout
3+
area: Cluster Coordination
4+
type: deprecation
5+
issues: []
6+
deprecation:
7+
title: Remove `PeerFinder` request timeout
8+
area: Cluster and node setting
9+
details: >-
10+
There is no need to time out requests sent by the `PeerFinder` during discovery and cluster formation, and this
11+
timeout may sometimes cause spurious failures. With this change the `PeerFinder` requests will wait indefinitely for
12+
responses. The `discovery.request_peers_timeout` setting no longer has any effect.
13+
impact: Discontinue use of the `discovery.request_peers_timeout` setting.

docs/reference/elasticsearch/configuration-reference/discovery-cluster-formation-settings.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ If you adjust these settings then your cluster may not form correctly or may bec
5050
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long to wait when attempting to identify the remote node via a handshake. Defaults to `30s`.
5151

5252
`discovery.request_peers_timeout`
53-
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long a node will wait after asking its peers again before considering the request to have failed. Defaults to `3s`.
53+
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) In 9.1.x and earlier versions, sets how long a node will wait after asking its peers again before considering the request to have failed. Has no effect from version 9.2.0 onwards. Defaults to `3s`.
5454

5555
`discovery.find_peers_warning_timeout`
5656
: ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long a node will attempt to discover its peers before it starts to log verbose messages describing why the connection attempts are failing. Defaults to `3m`.

server/src/main/java/org/elasticsearch/discovery/PeerFinder.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ public abstract class PeerFinder {
6666
"discovery.request_peers_timeout",
6767
TimeValue.timeValueMillis(3000),
6868
TimeValue.timeValueMillis(1),
69-
Setting.Property.NodeScope
69+
Setting.Property.NodeScope,
70+
Setting.Property.Deprecated
7071
);
7172

7273
// We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
@@ -81,7 +82,6 @@ public abstract class PeerFinder {
8182
);
8283

8384
private final TimeValue findPeersInterval;
84-
private final TimeValue requestPeersTimeout;
8585
private final TimeValue verbosityIncreaseTimeout;
8686

8787
private final Object mutex = new Object();
@@ -106,7 +106,6 @@ public PeerFinder(
106106
ConfiguredHostsResolver configuredHostsResolver
107107
) {
108108
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
109-
requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(settings);
110109
verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING.get(settings);
111110
this.transportService = transportService;
112111
this.clusterCoordinationExecutor = transportService.getThreadPool().executor(Names.CLUSTER_COORDINATION);
@@ -571,7 +570,7 @@ public Executor executor() {
571570
discoveryNode,
572571
REQUEST_PEERS_ACTION_NAME,
573572
new PeersRequest(getLocalNode(), knownNodes),
574-
TransportRequestOptions.timeout(requestPeersTimeout),
573+
TransportRequestOptions.EMPTY,
575574
peersResponseHandler
576575
);
577576
}

server/src/test/java/org/elasticsearch/discovery/PeerFinderTests.java

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -750,52 +750,6 @@ public void testDoesNotMakeMultipleConcurrentConnectionAttemptsToOneAddress() {
750750
assertFoundPeers(otherNode);
751751
}
752752

753-
public void testTimesOutAndRetriesConnectionsToBlackholedNodes() {
754-
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
755-
final DiscoveryNode nodeToFind = newDiscoveryNode("node-to-find");
756-
757-
providedAddresses.add(otherNode.getAddress());
758-
transportAddressConnector.addReachableNode(otherNode);
759-
transportAddressConnector.addReachableNode(nodeToFind);
760-
761-
peerFinder.activate(lastAcceptedNodes);
762-
763-
while (true) {
764-
deterministicTaskQueue.advanceTime();
765-
runAllRunnableTasks(); // MockTransportAddressConnector verifies no multiple connection attempts
766-
if (capturingTransport.getCapturedRequestsAndClear().length > 0) {
767-
break;
768-
}
769-
}
770-
771-
final long timeoutAtMillis = deterministicTaskQueue.getCurrentTimeMillis() + PeerFinder.DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(
772-
Settings.EMPTY
773-
).millis();
774-
while (deterministicTaskQueue.getCurrentTimeMillis() < timeoutAtMillis) {
775-
assertFoundPeers(otherNode);
776-
deterministicTaskQueue.advanceTime();
777-
runAllRunnableTasks();
778-
}
779-
780-
// need to wait for the connection to timeout, then for another wakeup, before discovering the peer
781-
final long expectedTime = timeoutAtMillis + PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(Settings.EMPTY).millis();
782-
783-
while (deterministicTaskQueue.getCurrentTimeMillis() < expectedTime) {
784-
deterministicTaskQueue.advanceTime();
785-
runAllRunnableTasks();
786-
}
787-
788-
respondToRequests(node -> {
789-
assertThat(node, is(otherNode));
790-
return new PeersResponse(Optional.empty(), singletonList(nodeToFind), randomNonNegativeLong());
791-
});
792-
793-
deterministicTaskQueue.advanceTime();
794-
runAllRunnableTasks();
795-
796-
assertFoundPeers(nodeToFind, otherNode);
797-
}
798-
799753
@TestLogging(reason = "testing logging at WARN level", value = "org.elasticsearch.discovery:WARN")
800754
public void testLogsWarningsIfActiveForLongEnough() throws IllegalAccessException {
801755
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");

0 commit comments

Comments
 (0)