From 7f06144d90481d2918706966766f174e51ccb1dd Mon Sep 17 00:00:00 2001 From: David Turner Date: Thu, 29 Aug 2024 08:02:52 +0100 Subject: [PATCH 1/3] Add links to network disconnect troubleshooting Makes the docs added in #112271 more discoverable. --- .../discovery/fault-detection.asciidoc | 5 +++++ docs/reference/modules/transport.asciidoc | 22 ++++++++++--------- .../elasticsearch/common/ReferenceDocs.java | 1 + .../transport/ClusterConnectionManager.java | 9 +++++++- .../common/reference-docs-links.json | 5 +++-- .../ClusterConnectionManagerTests.java | 5 ++++- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/docs/reference/modules/discovery/fault-detection.asciidoc b/docs/reference/modules/discovery/fault-detection.asciidoc index d12985b70597c..87d9b6bd54d41 100644 --- a/docs/reference/modules/discovery/fault-detection.asciidoc +++ b/docs/reference/modules/discovery/fault-detection.asciidoc @@ -144,6 +144,7 @@ to <> for information about the settings which control this mechanism. [discrete] +[[cluster-fault-detection-troubleshooting-disconnected]] ===== Diagnosing `disconnected` nodes Nodes typically leave the cluster with reason `disconnected` when they shut @@ -184,6 +185,7 @@ if traffic between the nodes is being disrupted by another device on the network. [discrete] +[[cluster-fault-detection-troubleshooting-lagging]] ===== Diagnosing `lagging` nodes {es} needs every node to process cluster state updates reasonably quickly. If a @@ -229,6 +231,7 @@ cat lagdetector.log | sed -e 's/.*://' | base64 --decode | gzip --decompress ---- [discrete] +[[cluster-fault-detection-troubleshooting-follower-check]] ===== Diagnosing `follower check retry count exceeded` nodes Nodes sometimes leave the cluster with reason `follower check retry count @@ -265,6 +268,7 @@ are unpredictable then capture stack dumps every 15s to be sure that at least one stack dump was taken at the right time. 
[discrete] +[[cluster-fault-detection-troubleshooting-shardlockobtainfailedexception]] ===== Diagnosing `ShardLockObtainFailedException` failures If a node leaves and rejoins the cluster then {es} will usually shut down and @@ -302,6 +306,7 @@ cat shardlock.log | sed -e 's/.*://' | base64 --decode | gzip --decompress ---- [discrete] +[[cluster-fault-detection-troubleshooting-network]] ===== Diagnosing other network disconnections {es} is designed to run on a fairly reliable network. It opens a number of TCP diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc index d08da2cfc1d2f..363928f83d45f 100644 --- a/docs/reference/modules/transport.asciidoc +++ b/docs/reference/modules/transport.asciidoc @@ -185,16 +185,18 @@ configured, and defaults otherwise to `transport.tcp.reuse_address`. A transport connection between two nodes is made up of a number of long-lived TCP connections, some of which may be idle for an extended period of time. -Nonetheless, Elasticsearch requires these connections to remain open, and it -can disrupt the operation of your cluster if any inter-node connections are -closed by an external influence such as a firewall. It is important to -configure your network to preserve long-lived idle connections between -Elasticsearch nodes, for instance by leaving `*.tcp.keep_alive` enabled and -ensuring that the keepalive interval is shorter than any timeout that might -cause idle connections to be closed, or by setting `transport.ping_schedule` if -keepalives cannot be configured. Devices which drop connections when they reach -a certain age are a common source of problems to Elasticsearch clusters, and -must not be used. +Nonetheless, {es} requires these connections to remain open, and it can disrupt +the operation of your cluster if any inter-node connections are closed by an +external influence such as a firewall. 
It is important to configure your network +to preserve long-lived idle connections between {es} nodes, for instance by +leaving `*.tcp.keep_alive` enabled and ensuring that the keepalive interval is +shorter than any timeout that might cause idle connections to be closed, or by +setting `transport.ping_schedule` if keepalives cannot be configured. Devices +which drop connections when they reach a certain age are a common source of +problems to {es} clusters, and must not be used. + +For information about troubleshooting unexpected network disconnections, see +<<cluster-fault-detection-troubleshooting-network>>. [[request-compression]] ===== Request compression diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java index 59c55fb7b624a..f73425c42a1c2 100644 --- a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java +++ b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java @@ -43,6 +43,7 @@ public enum ReferenceDocs { UNSTABLE_CLUSTER_TROUBLESHOOTING, LAGGING_NODE_TROUBLESHOOTING, SHARD_LOCK_TROUBLESHOOTING, + NETWORK_DISCONNECT_TROUBLESHOOTING, CONCURRENT_REPOSITORY_WRITERS, ARCHIVE_INDICES, HTTP_TRACER, diff --git a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java index 4d6a66b6ec075..da8f7b25e5197 100644 --- a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java +++ b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java @@ -12,6 +12,7 @@ import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.support.ContextPreservingActionListener; import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.ReferenceDocs; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.ListenableFuture; 
@@ -237,7 +238,13 @@ private void connectToNodeOrRetry( if (connectingRefCounter.hasReferences() == false) { logger.trace("connection manager shut down, closing transport connection to [{}]", node); } else if (conn.hasReferences()) { - logger.info("transport connection to [{}] closed by remote", node.descriptionWithoutAttributes()); + logger.info( + """ + transport connection to [{}] closed by remote; \ + if unexpected, see [{}] for troubleshooting guidance""", + node.descriptionWithoutAttributes(), + ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING + ); // In production code we only close connections via ref-counting, so this message confirms that a // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a // node leaves the cluster with "reason: disconnected" but without this message being logged then diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json index 3eb8939c22a65..3c2a9cb7dec9e 100644 --- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json +++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json @@ -2,8 +2,9 @@ "INITIAL_MASTER_NODES": "important-settings.html#initial_master_nodes", "DISCOVERY_TROUBLESHOOTING": "discovery-troubleshooting.html", "UNSTABLE_CLUSTER_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html", - "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#_diagnosing_lagging_nodes_2", - "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#_diagnosing_shardlockobtainfailedexception_failures_2", + "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#cluster-fault-detection-troubleshooting-lagging", + "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#cluster-fault-detection-troubleshooting-shardlockobtainfailedexception", + "NETWORK_DISCONNECT_TROUBLESHOOTING": 
"troubleshooting-unstable-cluster.html#cluster-fault-detection-troubleshooting-network", "CONCURRENT_REPOSITORY_WRITERS": "diagnosing-corrupted-repositories.html", "ARCHIVE_INDICES": "archive-indices.html", "HTTP_TRACER": "modules-network.html#http-rest-request-tracer", diff --git a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java index 27874d4311cd2..675c3e63db7d5 100644 --- a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java +++ b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java @@ -188,7 +188,10 @@ public void testDisconnectLogging() { "remotely-triggered close message", ClusterConnectionManager.class.getCanonicalName(), Level.INFO, - "transport connection to [" + remoteClose.descriptionWithoutAttributes() + "] closed by remote" + "transport connection to [" + + remoteClose.descriptionWithoutAttributes() + + "] closed by remote; " + + "if unexpected, see [https://www.elastic.co/guide/en/elasticsearch/reference/*] for troubleshooting guidance" ) ); mockLog.addExpectation( From 954b55da5a1a68d0d2911f0e401097f4bfabc7a9 Mon Sep 17 00:00:00 2001 From: David Turner Date: Thu, 29 Aug 2024 08:04:23 +0100 Subject: [PATCH 2/3] Update docs/changelog/112330.yaml --- docs/changelog/112330.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/112330.yaml diff --git a/docs/changelog/112330.yaml b/docs/changelog/112330.yaml new file mode 100644 index 0000000000000..498698f5175ba --- /dev/null +++ b/docs/changelog/112330.yaml @@ -0,0 +1,5 @@ +pr: 112330 +summary: Add links to network disconnect troubleshooting +area: Network +type: enhancement +issues: [] From 80d1e75d3e0aaad982e9db3bc82bea81fb821107 Mon Sep 17 00:00:00 2001 From: David Turner Date: Thu, 29 Aug 2024 14:36:41 +0100 Subject: [PATCH 3/3] Update links --- 
docs/reference/modules/discovery/fault-detection.asciidoc | 5 ----- docs/reference/modules/transport.asciidoc | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/reference/modules/discovery/fault-detection.asciidoc b/docs/reference/modules/discovery/fault-detection.asciidoc index a3d86995c7180..21f4ae2317e6a 100644 --- a/docs/reference/modules/discovery/fault-detection.asciidoc +++ b/docs/reference/modules/discovery/fault-detection.asciidoc @@ -39,31 +39,26 @@ starting from the beginning of the cluster state update. Refer to See <>. [discrete] -[[cluster-fault-detection-troubleshooting-disconnected]] ===== Diagnosing `disconnected` nodes See <>. [discrete] -[[cluster-fault-detection-troubleshooting-lagging]] ===== Diagnosing `lagging` nodes See <>. [discrete] -[[cluster-fault-detection-troubleshooting-follower-check]] ===== Diagnosing `follower check retry count exceeded` nodes See <>. [discrete] -[[cluster-fault-detection-troubleshooting-shardlockobtainfailedexception]] ===== Diagnosing `ShardLockObtainFailedException` failures See <>. [discrete] -[[cluster-fault-detection-troubleshooting-network]] ===== Diagnosing other network disconnections See <>. diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc index 363928f83d45f..fc7b6831ca848 100644 --- a/docs/reference/modules/transport.asciidoc +++ b/docs/reference/modules/transport.asciidoc @@ -196,7 +196,7 @@ which drop connections when they reach a certain age are a common source of problems to {es} clusters, and must not be used. For information about troubleshooting unexpected network disconnections, see -<>. +<>. [[request-compression]] ===== Request compression