Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/112330.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112330
summary: Add links to network disconnect troubleshooting
area: Network
type: enhancement
issues: []
22 changes: 12 additions & 10 deletions docs/reference/modules/transport.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,18 @@ configured, and defaults otherwise to `transport.tcp.reuse_address`.

A transport connection between two nodes is made up of a number of long-lived
TCP connections, some of which may be idle for an extended period of time.
Nonetheless, Elasticsearch requires these connections to remain open, and it
can disrupt the operation of your cluster if any inter-node connections are
closed by an external influence such as a firewall. It is important to
configure your network to preserve long-lived idle connections between
Elasticsearch nodes, for instance by leaving `*.tcp.keep_alive` enabled and
ensuring that the keepalive interval is shorter than any timeout that might
cause idle connections to be closed, or by setting `transport.ping_schedule` if
keepalives cannot be configured. Devices which drop connections when they reach
a certain age are a common source of problems to Elasticsearch clusters, and
must not be used.
Nonetheless, {es} requires these connections to remain open, and it can disrupt
the operation of your cluster if any inter-node connections are closed by an
external influence such as a firewall. It is important to configure your network
to preserve long-lived idle connections between {es} nodes, for instance by
leaving `*.tcp.keep_alive` enabled and ensuring that the keepalive interval is
shorter than any timeout that might cause idle connections to be closed, or by
setting `transport.ping_schedule` if keepalives cannot be configured. Devices
that drop connections when they reach a certain age are a common source of
problems for {es} clusters, and must not be used.

For information about troubleshooting unexpected network disconnections, see
<<troubleshooting-unstable-cluster-network>>.

[[request-compression]]
===== Request compression
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public enum ReferenceDocs {
UNSTABLE_CLUSTER_TROUBLESHOOTING,
LAGGING_NODE_TROUBLESHOOTING,
SHARD_LOCK_TROUBLESHOOTING,
NETWORK_DISCONNECT_TROUBLESHOOTING,
CONCURRENT_REPOSITORY_WRITERS,
ARCHIVE_INDICES,
HTTP_TRACER,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.ContextPreservingActionListener;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
Expand Down Expand Up @@ -237,7 +238,13 @@ private void connectToNodeOrRetry(
if (connectingRefCounter.hasReferences() == false) {
logger.trace("connection manager shut down, closing transport connection to [{}]", node);
} else if (conn.hasReferences()) {
logger.info("transport connection to [{}] closed by remote", node.descriptionWithoutAttributes());
logger.info(
"""
transport connection to [{}] closed by remote; \
if unexpected, see [{}] for troubleshooting guidance""",
node.descriptionWithoutAttributes(),
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
);
// In production code we only close connections via ref-counting, so this message confirms that a
// 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a
// node leaves the cluster with "reason: disconnected" but without this message being logged then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"UNSTABLE_CLUSTER_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html",
"LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-lagging",
"SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-shardlockobtainfailedexception",
"NETWORK_DISCONNECT_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-network",
"CONCURRENT_REPOSITORY_WRITERS": "diagnosing-corrupted-repositories.html",
"ARCHIVE_INDICES": "archive-indices.html",
"HTTP_TRACER": "modules-network.html#http-rest-request-tracer",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,10 @@ public void testDisconnectLogging() {
"remotely-triggered close message",
ClusterConnectionManager.class.getCanonicalName(),
Level.INFO,
"transport connection to [" + remoteClose.descriptionWithoutAttributes() + "] closed by remote"
"transport connection to ["
+ remoteClose.descriptionWithoutAttributes()
+ "] closed by remote; "
+ "if unexpected, see [https://www.elastic.co/guide/en/elasticsearch/reference/*] for troubleshooting guidance"
)
);
mockLog.addExpectation(
Expand Down