1717import org .elasticsearch .cluster .node .DiscoveryNode ;
1818import org .elasticsearch .cluster .node .DiscoveryNodes ;
1919import org .elasticsearch .cluster .service .ClusterApplier ;
20+ import org .elasticsearch .common .ReferenceDocs ;
2021import org .elasticsearch .common .component .AbstractLifecycleComponent ;
2122import org .elasticsearch .common .settings .Setting ;
2223import org .elasticsearch .common .settings .Settings ;
2324import org .elasticsearch .common .util .concurrent .AbstractRunnable ;
25+ import org .elasticsearch .common .util .concurrent .ConcurrentCollections ;
26+ import org .elasticsearch .core .Nullable ;
2427import org .elasticsearch .core .Releasable ;
2528import org .elasticsearch .core .Releasables ;
2629import org .elasticsearch .core .TimeValue ;
2730import org .elasticsearch .injection .guice .Inject ;
2831import org .elasticsearch .threadpool .ThreadPool ;
32+ import org .elasticsearch .transport .Transport ;
33+ import org .elasticsearch .transport .TransportConnectionListener ;
2934import org .elasticsearch .transport .TransportService ;
3035
3136import java .util .ArrayList ;
3540import java .util .List ;
3641import java .util .Map ;
3742import java .util .Set ;
43+ import java .util .concurrent .ConcurrentMap ;
3844import java .util .concurrent .atomic .AtomicInteger ;
3945import java .util .concurrent .atomic .AtomicReference ;
4046
@@ -79,12 +85,14 @@ public class NodeConnectionsService extends AbstractLifecycleComponent {
7985
8086 private final TimeValue reconnectInterval ;
8187 private volatile ConnectionChecker connectionChecker ;
88+ private final ConnectionHistory connectionHistory ;
8289
/**
 * Builds the service from the node settings and its collaborators.
 *
 * @param settings         source of the reconnect interval setting
 * @param threadPool       thread pool retained by this service
 * @param transportService transport service retained by this service; the connection history
 *                         registers its connection listener on it
 */
@Inject
public NodeConnectionsService(Settings settings, ThreadPool threadPool, TransportService transportService) {
    this.reconnectInterval = CLUSTER_NODE_RECONNECT_INTERVAL_SETTING.get(settings);
    this.threadPool = threadPool;
    this.transportService = transportService;
    // Must come after transportService is assigned: the ConnectionHistory constructor
    // registers a listener on NodeConnectionsService.this.transportService.
    this.connectionHistory = new ConnectionHistory();
}
8997
9098 /**
@@ -99,13 +107,12 @@ public void connectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletion)
99107 }
100108
101109 final List <Runnable > runnables = new ArrayList <>(discoveryNodes .getSize ());
102- final List <DiscoveryNode > nodes = new ArrayList <>(discoveryNodes .getSize ());
103110 try (var refs = new RefCountingRunnable (onCompletion )) {
104111 synchronized (mutex ) {
112+ connectionHistory .reserveConnectionHistoryForNodes (DiscoveryNodes );
105113 // Ugly hack: when https://github.com/elastic/elasticsearch/issues/94946 is fixed, just iterate over discoveryNodes here
106114 for (final Iterator <DiscoveryNode > iterator = discoveryNodes .mastersFirstStream ().iterator (); iterator .hasNext ();) {
107115 final DiscoveryNode discoveryNode = iterator .next ();
108- nodes .add (discoveryNode );
109116 ConnectionTarget connectionTarget = targetsByNode .get (discoveryNode );
110117 final boolean isNewNode = connectionTarget == null ;
111118 if (isNewNode ) {
@@ -122,7 +129,6 @@ public void connectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletion)
122129 runnables .add (connectionTarget .connect (null ));
123130 }
124131 }
125- transportService .retainConnectionHistory (nodes );
126132 }
127133 }
128134 runnables .forEach (Runnable ::run );
@@ -140,6 +146,7 @@ public void disconnectFromNodesExcept(DiscoveryNodes discoveryNodes) {
140146 nodesToDisconnect .remove (discoveryNode );
141147 }
142148
149+ connectionHistory .removeConnectionHistoryForNodes (nodesToDisconnect );
143150 for (final DiscoveryNode discoveryNode : nodesToDisconnect ) {
144151 runnables .add (targetsByNode .remove (discoveryNode )::disconnect );
145152 }
@@ -350,4 +357,113 @@ public String toString() {
350357 }
351358 }
352359 }
360+
361+ private class ConnectionHistory {
362+ record NodeConnectionHistory (String ephemeralId , long disconnectTime , Exception disconnectCause ) {}
363+
364+ /**
365+ * Holds the DiscoveryNode nodeId to connection history record.
366+ *
367+ * Entries for each node are reserved during NodeConnectionsService.connectToNodes, by placing a (nodeId, dummy) entry
368+ * for each node in the cluster. On node disconnect, this entry is updated with its NodeConnectionHistory. On node
369+ * connect, this entry is reset to the dummy value. On NodeConnectionsService.disconnectFromNodesExcept, node entries
370+ * are removed.
371+ *
372+ * Each node in the cluster always has a nodeHistory entry that is either the dummy value or a connection history record. This
373+ * allows node disconnect callbacks to discard their entry if the disconnect occurred because of a change in cluster state.
374+ */
375+ private final NodeConnectionHistory dummy = new NodeConnectionHistory ("" , 0 , null );
376+ private final ConcurrentMap <String , NodeConnectionHistory > nodeHistory = ConcurrentCollections .newConcurrentMap ();
377+
378+ ConnectionHistory () {
379+ NodeConnectionsService .this .transportService .addConnectionListener (new TransportConnectionListener () {
380+ @ Override
381+ public void onNodeConnected (DiscoveryNode node , Transport .Connection connection ) {
382+ // log case where the remote node has same ephemeralId as its previous connection
383+ // (the network was disrupted, but not the remote process)
384+ NodeConnectionHistory nodeConnectionHistory = nodeHistory .get (node .getId ());
385+ if (nodeConnectionHistory != null ) {
386+ nodeHistory .replace (node .getId (), nodeConnectionHistory , dummy );
387+ }
388+
389+ if (nodeConnectionHistory != null
390+ && nodeConnectionHistory != dummy
391+ && nodeConnectionHistory .ephemeralId .equals (node .getEphemeralId ())) {
392+ if (nodeConnectionHistory .disconnectCause != null ) {
393+ logger .warn (
394+ () -> format (
395+ "reopened transport connection to node [%s] "
396+ + "which disconnected exceptionally [%dms] ago but did not "
397+ + "restart, so the disconnection is unexpected; "
398+ + "if unexpected, see [{}] for troubleshooting guidance" ,
399+ node .descriptionWithoutAttributes (),
400+ nodeConnectionHistory .disconnectTime ,
401+ ReferenceDocs .NETWORK_DISCONNECT_TROUBLESHOOTING
402+ ),
403+ nodeConnectionHistory .disconnectCause
404+ );
405+ } else {
406+ logger .warn (
407+ """
408+ reopened transport connection to node [{}] \
409+ which disconnected gracefully [{}ms] ago but did not \
410+ restart, so the disconnection is unexpected; \
411+ if unexpected, see [{}] for troubleshooting guidance""" ,
412+ node .descriptionWithoutAttributes (),
413+ nodeConnectionHistory .disconnectTime ,
414+ ReferenceDocs .NETWORK_DISCONNECT_TROUBLESHOOTING
415+ );
416+ }
417+ }
418+ }
419+
420+ @ Override
421+ public void onNodeDisconnected (DiscoveryNode node , Transport .Connection connection ) {
422+ connection .addCloseListener (new ActionListener <Void >() {
423+ @ Override
424+ public void onResponse (Void ignored ) {
425+ insertNodeConnectionHistory (null );
426+ }
427+
428+ @ Override
429+ public void onFailure (Exception e ) {
430+ insertNodeConnectionHistory (e );
431+ }
432+
433+ private void insertNodeConnectionHistory (@ Nullable Exception e ) {
434+ final long disconnectTime = threadPool .absoluteTimeInMillis ();
435+ final NodeConnectionHistory nodeConnectionHistory = new NodeConnectionHistory (
436+ node .getEphemeralId (),
437+ disconnectTime ,
438+ e
439+ );
440+ final String nodeId = node .getId ();
441+ NodeConnectionHistory previousConnectionHistory = nodeHistory .get (nodeId );
442+ if (previousConnectionHistory != null ) {
443+ nodeHistory .replace (nodeId , previousConnectionHistory , nodeConnectionHistory );
444+ }
445+ }
446+ });
447+ }
448+ });
449+ }
450+
451+ void reserveConnectionHistoryForNodes (DiscoveryNodes nodes ) {
452+ for (DiscoveryNode node : nodes ) {
453+ nodeHistory .put (node .getId (), dummy );
454+ }
455+ }
456+
457+ void removeConnectionHistoryForNodes (Set <DiscoveryNode > nodes ) {
458+ final int startSize = nodeHistory .size ();
459+ for (DiscoveryNode node : nodes ) {
460+ nodeHistory .remove (node .getId ());
461+ }
462+ logger .trace ("Connection history garbage-collected from {} to {} entries" , startSize , nodeHistory .size ());
463+ }
464+
465+ int connectionHistorySize () {
466+ return nodeHistory .size ();
467+ }
468+ }
353469}
0 commit comments