From 7f06144d90481d2918706966766f174e51ccb1dd Mon Sep 17 00:00:00 2001
From: David Turner <david.turner@elastic.co>
Date: Thu, 29 Aug 2024 08:02:52 +0100
Subject: [PATCH 1/3] Add links to network disconnect troubleshooting

Makes the docs added in #112271 more discoverable.
---
 .../discovery/fault-detection.asciidoc        |  5 +++++
 docs/reference/modules/transport.asciidoc     | 22 ++++++++++---------
 .../elasticsearch/common/ReferenceDocs.java   |  1 +
 .../transport/ClusterConnectionManager.java   |  9 +++++++-
 .../common/reference-docs-links.json          |  5 +++--
 .../ClusterConnectionManagerTests.java        |  5 ++++-
 6 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/docs/reference/modules/discovery/fault-detection.asciidoc b/docs/reference/modules/discovery/fault-detection.asciidoc
index d12985b70597c..87d9b6bd54d41 100644
--- a/docs/reference/modules/discovery/fault-detection.asciidoc
+++ b/docs/reference/modules/discovery/fault-detection.asciidoc
@@ -144,6 +144,7 @@ to <<modules-discovery-settings>> for information about the settings which
 control this mechanism.
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-disconnected]]
 ===== Diagnosing `disconnected` nodes
 
 Nodes typically leave the cluster with reason `disconnected` when they shut
@@ -184,6 +185,7 @@ if traffic between the nodes is being disrupted by another device on the
 network.
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-lagging]]
 ===== Diagnosing `lagging` nodes
 
 {es} needs every node to process cluster state updates reasonably quickly. If a
@@ -229,6 +231,7 @@ cat lagdetector.log | sed -e 's/.*://' | base64 --decode | gzip --decompress
 ----
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-follower-check]]
 ===== Diagnosing `follower check retry count exceeded` nodes
 
 Nodes sometimes leave the cluster with reason `follower check retry count
@@ -265,6 +268,7 @@ are unpredictable then capture stack dumps every 15s to be sure that at least
 one stack dump was taken at the right time.
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-shardlockobtainfailedexception]]
 ===== Diagnosing `ShardLockObtainFailedException` failures
 
 If a node leaves and rejoins the cluster then {es} will usually shut down and
@@ -302,6 +306,7 @@ cat shardlock.log | sed -e 's/.*://' | base64 --decode | gzip --decompress
 ----
 
 [discrete]
+[[cluster-fault-detection-troubleshooting-network]]
 ===== Diagnosing other network disconnections
 
 {es} is designed to run on a fairly reliable network. It opens a number of TCP
diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc
index d08da2cfc1d2f..363928f83d45f 100644
--- a/docs/reference/modules/transport.asciidoc
+++ b/docs/reference/modules/transport.asciidoc
@@ -185,16 +185,18 @@ configured, and defaults otherwise to `transport.tcp.reuse_address`.
 
 A transport connection between two nodes is made up of a number of long-lived
 TCP connections, some of which may be idle for an extended period of time.
-Nonetheless, Elasticsearch requires these connections to remain open, and it
-can disrupt the operation of your cluster if any inter-node connections are
-closed by an external influence such as a firewall. It is important to
-configure your network to preserve long-lived idle connections between
-Elasticsearch nodes, for instance by leaving `*.tcp.keep_alive` enabled and
-ensuring that the keepalive interval is shorter than any timeout that might
-cause idle connections to be closed, or by setting `transport.ping_schedule` if
-keepalives cannot be configured. Devices which drop connections when they reach
-a certain age are a common source of problems to Elasticsearch clusters, and
-must not be used.
+Nonetheless, {es} requires these connections to remain open, and it can disrupt
+the operation of your cluster if any inter-node connections are closed by an
+external influence such as a firewall. It is important to configure your network
+to preserve long-lived idle connections between {es} nodes, for instance by
+leaving `*.tcp.keep_alive` enabled and ensuring that the keepalive interval is
+shorter than any timeout that might cause idle connections to be closed, or by
+setting `transport.ping_schedule` if keepalives cannot be configured. Devices
+which drop connections when they reach a certain age are a common source of
+problems to {es} clusters, and must not be used.
+
+For information about troubleshooting unexpected network disconnections, see
+<<cluster-fault-detection-troubleshooting-network>>.
 
 [[request-compression]]
 ===== Request compression
diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
index 59c55fb7b624a..f73425c42a1c2 100644
--- a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
+++ b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -43,6 +43,7 @@ public enum ReferenceDocs {
     UNSTABLE_CLUSTER_TROUBLESHOOTING,
     LAGGING_NODE_TROUBLESHOOTING,
     SHARD_LOCK_TROUBLESHOOTING,
+    NETWORK_DISCONNECT_TROUBLESHOOTING,
     CONCURRENT_REPOSITORY_WRITERS,
     ARCHIVE_INDICES,
     HTTP_TRACER,
diff --git a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java
index 4d6a66b6ec075..da8f7b25e5197 100644
--- a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java
+++ b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java
@@ -12,6 +12,7 @@
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.support.ContextPreservingActionListener;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.common.util.concurrent.ListenableFuture;
@@ -237,7 +238,13 @@ private void connectToNodeOrRetry(
                                     if (connectingRefCounter.hasReferences() == false) {
                                         logger.trace("connection manager shut down, closing transport connection to [{}]", node);
                                     } else if (conn.hasReferences()) {
-                                        logger.info("transport connection to [{}] closed by remote", node.descriptionWithoutAttributes());
+                                        logger.info(
+                                            """
+                                                transport connection to [{}] closed by remote; \
+                                                if unexpected, see [{}] for troubleshooting guidance""",
+                                            node.descriptionWithoutAttributes(),
+                                            ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
+                                        );
                                         // In production code we only close connections via ref-counting, so this message confirms that a
                                         // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a
                                         // node leaves the cluster with "reason: disconnected" but without this message being logged then
diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
index 3eb8939c22a65..3c2a9cb7dec9e 100644
--- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
+++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
@@ -2,8 +2,9 @@
   "INITIAL_MASTER_NODES": "important-settings.html#initial_master_nodes",
   "DISCOVERY_TROUBLESHOOTING": "discovery-troubleshooting.html",
   "UNSTABLE_CLUSTER_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html",
-  "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#_diagnosing_lagging_nodes_2",
-  "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#_diagnosing_shardlockobtainfailedexception_failures_2",
+  "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-lagging",
+  "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-shardlockobtainfailedexception",
+  "NETWORK_DISCONNECT_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-network",
   "CONCURRENT_REPOSITORY_WRITERS": "diagnosing-corrupted-repositories.html",
   "ARCHIVE_INDICES": "archive-indices.html",
   "HTTP_TRACER": "modules-network.html#http-rest-request-tracer",
diff --git a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java
index 27874d4311cd2..675c3e63db7d5 100644
--- a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java
+++ b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java
@@ -188,7 +188,10 @@ public void testDisconnectLogging() {
                     "remotely-triggered close message",
                     ClusterConnectionManager.class.getCanonicalName(),
                     Level.INFO,
-                    "transport connection to [" + remoteClose.descriptionWithoutAttributes() + "] closed by remote"
+                    "transport connection to ["
+                        + remoteClose.descriptionWithoutAttributes()
+                        + "] closed by remote; "
+                        + "if unexpected, see [https://www.elastic.co/guide/en/elasticsearch/reference/*] for troubleshooting guidance"
                 )
             );
             mockLog.addExpectation(

From 954b55da5a1a68d0d2911f0e401097f4bfabc7a9 Mon Sep 17 00:00:00 2001
From: David Turner <david.turner@elastic.co>
Date: Thu, 29 Aug 2024 08:04:23 +0100
Subject: [PATCH 2/3] Update docs/changelog/112330.yaml

---
 docs/changelog/112330.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 docs/changelog/112330.yaml

diff --git a/docs/changelog/112330.yaml b/docs/changelog/112330.yaml
new file mode 100644
index 0000000000000..498698f5175ba
--- /dev/null
+++ b/docs/changelog/112330.yaml
@@ -0,0 +1,5 @@
+pr: 112330
+summary: Add links to network disconnect troubleshooting
+area: Network
+type: enhancement
+issues: []

From 80d1e75d3e0aaad982e9db3bc82bea81fb821107 Mon Sep 17 00:00:00 2001
From: David Turner <david.turner@elastic.co>
Date: Thu, 29 Aug 2024 14:36:41 +0100
Subject: [PATCH 3/3] Update links

---
 docs/reference/modules/discovery/fault-detection.asciidoc | 5 -----
 docs/reference/modules/transport.asciidoc                 | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/docs/reference/modules/discovery/fault-detection.asciidoc b/docs/reference/modules/discovery/fault-detection.asciidoc
index a3d86995c7180..21f4ae2317e6a 100644
--- a/docs/reference/modules/discovery/fault-detection.asciidoc
+++ b/docs/reference/modules/discovery/fault-detection.asciidoc
@@ -39,31 +39,26 @@ starting from the beginning of the cluster state update. Refer to
 See <<troubleshooting-unstable-cluster>>.
 
 [discrete]
-[[cluster-fault-detection-troubleshooting-disconnected]]
 ===== Diagnosing `disconnected` nodes
 
 See <<troubleshooting-unstable-cluster-disconnected>>.
 
 [discrete]
-[[cluster-fault-detection-troubleshooting-lagging]]
 ===== Diagnosing `lagging` nodes
 
 See <<troubleshooting-unstable-cluster-lagging>>.
 
 [discrete]
-[[cluster-fault-detection-troubleshooting-follower-check]]
 ===== Diagnosing `follower check retry count exceeded` nodes
 
 See <<troubleshooting-unstable-cluster-follower-check>>.
 
 [discrete]
-[[cluster-fault-detection-troubleshooting-shardlockobtainfailedexception]]
 ===== Diagnosing `ShardLockObtainFailedException` failures
 
 See <<troubleshooting-unstable-cluster-shardlockobtainfailedexception>>.
 
 [discrete]
-[[cluster-fault-detection-troubleshooting-network]]
 ===== Diagnosing other network disconnections
 
 See <<troubleshooting-unstable-cluster-network>>.
diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc
index 363928f83d45f..fc7b6831ca848 100644
--- a/docs/reference/modules/transport.asciidoc
+++ b/docs/reference/modules/transport.asciidoc
@@ -196,7 +196,7 @@ which drop connections when they reach a certain age are a common source of
 problems to {es} clusters, and must not be used.
 
 For information about troubleshooting unexpected network disconnections, see
-<<cluster-fault-detection-troubleshooting-network>>.
+<<troubleshooting-unstable-cluster-network>>.
 
 [[request-compression]]
 ===== Request compression