elastic · elasticsearchmachine · Sep 9, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024
diff --git a/docs/changelog/112330.yaml b/docs/changelog/112330.yaml
@@ -0,0 +1,5 @@
+pr: 112330
+summary: Add links to network disconnect troubleshooting
+area: Network
+type: enhancement
+issues: []
diff --git a/docs/reference/modules/discovery/fault-detection.asciidoc b/docs/reference/modules/discovery/fault-detection.asciidoc
@@ -144,6 +144,7 @@ to <<modules-discovery-settings>> for information about the settings which
 control this mechanism.
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-disconnected]]
 ===== Diagnosing `disconnected` nodes
 
 Nodes typically leave the cluster with reason `disconnected` when they shut
@@ -184,6 +185,7 @@ if traffic between the nodes is being disrupted by another device on the
 network.
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-lagging]]
 ===== Diagnosing `lagging` nodes
 
 {es} needs every node to process cluster state updates reasonably quickly. If a
@@ -229,6 +231,7 @@ cat lagdetector.log | sed -e 's/.*://' | base64 --decode | gzip --decompress
 ----
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-follower-check]]
 ===== Diagnosing `follower check retry count exceeded` nodes
 
 Nodes sometimes leave the cluster with reason `follower check retry count
@@ -265,6 +268,7 @@ are unpredictable then capture stack dumps every 15s to be sure that at least
 one stack dump was taken at the right time.
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-shardlockobtainfailedexception]]
 ===== Diagnosing `ShardLockObtainFailedException` failures
 
 If a node leaves and rejoins the cluster then {es} will usually shut down and
@@ -302,6 +306,7 @@ cat shardlock.log | sed -e 's/.*://' | base64 --decode | gzip --decompress
 ----
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-network]]
 ===== Diagnosing other network disconnections
 
 {es} is designed to run on a fairly reliable network. It opens a number of TCP

diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc
@@ -185,16 +185,18 @@ configured, and defaults otherwise to `transport.tcp.reuse_address`.
 
 A transport connection between two nodes is made up of a number of long-lived
 TCP connections, some of which may be idle for an extended period of time.
-Nonetheless, Elasticsearch requires these connections to remain open, and it
-can disrupt the operation of your cluster if any inter-node connections are
-closed by an external influence such as a firewall. It is important to
-configure your network to preserve long-lived idle connections between
-Elasticsearch nodes, for instance by leaving `*.tcp.keep_alive` enabled and
-ensuring that the keepalive interval is shorter than any timeout that might
-cause idle connections to be closed, or by setting `transport.ping_schedule` if
-keepalives cannot be configured. Devices which drop connections when they reach
-a certain age are a common source of problems to Elasticsearch clusters, and
-must not be used.
+Nonetheless, {es} requires these connections to remain open, and it can disrupt
+the operation of your cluster if any inter-node connections are closed by an
+external influence such as a firewall. It is important to configure your network
+to preserve long-lived idle connections between {es} nodes, for instance by
+leaving `*.tcp.keep_alive` enabled and ensuring that the keepalive interval is
+shorter than any timeout that might cause idle connections to be closed, or by
+setting `transport.ping_schedule` if keepalives cannot be configured. Devices
+which drop connections when they reach a certain age are a common source of
+problems for {es} clusters, and must not be used.
+
+For information about troubleshooting unexpected network disconnections, see
+<<cluster-fault-detection-troubleshooting-network>>.
 
 [[request-compression]]
 ===== Request compression

diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -43,6 +43,7 @@ public enum ReferenceDocs {
     UNSTABLE_CLUSTER_TROUBLESHOOTING,
     LAGGING_NODE_TROUBLESHOOTING,
     SHARD_LOCK_TROUBLESHOOTING,
+    NETWORK_DISCONNECT_TROUBLESHOOTING,
     CONCURRENT_REPOSITORY_WRITERS,
     ARCHIVE_INDICES,
     HTTP_TRACER,

diff --git a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java
@@ -12,6 +12,7 @@
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.support.ContextPreservingActionListener;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.common.util.concurrent.ListenableFuture;
@@ -237,7 +238,13 @@ private void connectToNodeOrRetry(
                                     if (connectingRefCounter.hasReferences() == false) {
                                         logger.trace("connection manager shut down, closing transport connection to [{}]", node);
                                     } else if (conn.hasReferences()) {
-                                        logger.info("transport connection to [{}] closed by remote", node.descriptionWithoutAttributes());
+                                        logger.info(
+                                            """
+                                                transport connection to [{}] closed by remote; \
+                                                if unexpected, see [{}] for troubleshooting guidance""",
+                                            node.descriptionWithoutAttributes(),
+                                            ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
+                                        );
                                         // In production code we only close connections via ref-counting, so this message confirms that a
                                         // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a
                                         // node leaves the cluster with "reason: disconnected" but without this message being logged then

diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
@@ -2,8 +2,9 @@
   "INITIAL_MASTER_NODES": "important-settings.html#initial_master_nodes",
   "DISCOVERY_TROUBLESHOOTING": "discovery-troubleshooting.html",
   "UNSTABLE_CLUSTER_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html",
-  "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#_diagnosing_lagging_nodes_2",
-  "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#_diagnosing_shardlockobtainfailedexception_failures_2",
+  "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#cluster-fault-detection-troubleshooting-lagging",
+  "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#cluster-fault-detection-troubleshooting-shardlockobtainfailedexception",
+  "NETWORK_DISCONNECT_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#cluster-fault-detection-troubleshooting-network",
   "CONCURRENT_REPOSITORY_WRITERS": "diagnosing-corrupted-repositories.html",
   "ARCHIVE_INDICES": "archive-indices.html",
   "HTTP_TRACER": "modules-network.html#http-rest-request-tracer",

diff --git a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java
@@ -188,7 +188,10 @@ public void testDisconnectLogging() {
                     "remotely-triggered close message",
                     ClusterConnectionManager.class.getCanonicalName(),
                     Level.INFO,
-                    "transport connection to [" + remoteClose.descriptionWithoutAttributes() + "] closed by remote"
+                    "transport connection to ["
+                        + remoteClose.descriptionWithoutAttributes()
+                        + "] closed by remote; "
+                        + "if unexpected, see [https://www.elastic.co/guide/en/elasticsearch/reference/*] for troubleshooting guidance"
                 )
             );
             mockLog.addExpectation(