2727
2828import java .util .Collections ;
2929import java .util .Iterator ;
30+ import java .util .List ;
3031import java .util .Map ;
3132import java .util .Set ;
3233import java .util .concurrent .ConcurrentMap ;
3334import java .util .concurrent .CountDownLatch ;
3435import java .util .concurrent .atomic .AtomicBoolean ;
36+ import java .util .stream .Collectors ;
37+
38+ import static org .elasticsearch .core .Strings .format ;
3539
3640/**
3741 * This class manages node connections within a cluster. The connection is opened by the underlying transport.
@@ -47,6 +51,9 @@ public class ClusterConnectionManager implements ConnectionManager {
4751 .newConcurrentMap ();
4852 private final AbstractRefCounted connectingRefCounter = AbstractRefCounted .of (this ::pendingConnectionsComplete );
4953
/**
 * Snapshot of how the previous connection to a node ended, kept so that a later reconnect can be compared against it.
 *
 * @param ephemeralId     ephemeral id of the node whose connection was closed
 * @param disconnectCause exception the connection was closed with, or {@code null} if it closed without one
 */
record NodeConnectionHistory(String ephemeralId, Exception disconnectCause) {
}
55+ private final ConcurrentMap <String , NodeConnectionHistory > nodeHistory = ConcurrentCollections .newConcurrentMap ();
56+
5057 private final Transport transport ;
5158 private final ThreadContext threadContext ;
5259 private final ConnectionProfile defaultProfile ;
@@ -226,6 +233,29 @@ private void connectToNodeOrRetry(
226233 } else {
227234 logger .debug ("connected to node [{}]" , node );
228235 managerRefs .mustIncRef ();
236+
237+ // log case where the remote node has same ephemeralId as its previous connection
238+ // (the network was disrupted, but not the remote process)
239+ final DiscoveryNode connNode = conn .getNode ();
240+ NodeConnectionHistory hist = nodeHistory .remove (connNode .getId ());
241+ if (hist != null && hist .ephemeralId .equals (connNode .getEphemeralId ())) {
242+ if (hist .disconnectCause != null ) {
243+ logger .warn (
244+ () -> format (
245+ "transport connection reopened to node with same ephemeralId [%s], close exception:" ,
246+ node .descriptionWithoutAttributes ()
247+ ),
248+ hist .disconnectCause
249+ );
250+ } else {
251+ logger .warn (
252+ """
253+ transport connection reopened to node with same ephemeralId [{}]""" ,
254+ node .descriptionWithoutAttributes ()
255+ );
256+ }
257+ }
258+
229259 try {
230260 connectionListener .onNodeConnected (node , conn );
231261 } finally {
@@ -235,25 +265,65 @@ private void connectToNodeOrRetry(
235265 managerRefs .decRef ();
236266 }));
237267
238- conn .addCloseListener (ActionListener .running (() -> {
239- if (connectingRefCounter .hasReferences () == false ) {
240- logger .trace ("connection manager shut down, closing transport connection to [{}]" , node );
241- } else if (conn .hasReferences ()) {
242- logger .info (
243- """
244- transport connection to [{}] closed by remote; \
245- if unexpected, see [{}] for troubleshooting guidance""" ,
246- node .descriptionWithoutAttributes (),
247- ReferenceDocs .NETWORK_DISCONNECT_TROUBLESHOOTING
248- );
249- // In production code we only close connections via ref-counting, so this message confirms that a
250- // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a
251- // node leaves the cluster with "reason: disconnected" but without this message being logged then
252- // that's a bug.
253- } else {
254- logger .debug ("closing unused transport connection to [{}]" , node );
268+ conn .addCloseListener (new ActionListener <Void >() {
269+ @ Override
270+ public void onResponse (Void ignored ) {
271+ final NodeConnectionHistory hist = new NodeConnectionHistory (node .getEphemeralId (), null );
272+ nodeHistory .put (conn .getNode ().getId (), hist );
255273 }
256- }));
274+
275+ @ Override
276+ public void onFailure (Exception e ) {
277+ final NodeConnectionHistory hist = new NodeConnectionHistory (node .getEphemeralId (), e );
278+ nodeHistory .put (conn .getNode ().getId (), hist );
279+ }
280+ });
281+
282+ conn .addCloseListener (new ActionListener <Void >() {
283+ @ Override
284+ public void onResponse (Void ignored ) {
285+ if (connectingRefCounter .hasReferences () == false ) {
286+ logger .trace ("connection manager shut down, closing transport connection to [{}]" , node );
287+ } else if (conn .hasReferences ()) {
288+ logger .info (
289+ """
290+ transport connection to [{}] closed by remote; \
291+ if unexpected, see [{}] for troubleshooting guidance""" ,
292+ node .descriptionWithoutAttributes (),
293+ ReferenceDocs .NETWORK_DISCONNECT_TROUBLESHOOTING
294+ );
295+ // In production code we only close connections via ref-counting, so this message confirms that
296+ // a 'node-left ... reason: disconnected' event was caused by external factors. Put
297+ // differently, if a node leaves the cluster with "reason: disconnected" but without this
298+ // message being logged then that's a bug.
299+ } else {
300+ logger .debug ("closing unused transport connection to [{}]" , node );
301+ }
302+ }
303+
304+ @ Override
305+ public void onFailure (Exception e ) {
306+ if (conn .hasReferences ()) {
307+ logger .warn (
308+ """
309+ transport connection to [{}] closed by remote with exception [{}]; \
310+ if unexpected, see [{}] for troubleshooting guidance""" ,
311+ node .descriptionWithoutAttributes (),
312+ e ,
313+ ReferenceDocs .NETWORK_DISCONNECT_TROUBLESHOOTING
314+ );
315+ } else {
316+ logger .warn (
317+ """
318+ transport connection to [{}] closed with exception [{}]; \
319+ if unexpected, see [{}] for troubleshooting guidance""" ,
320+ node .descriptionWithoutAttributes (),
321+ e ,
322+ ReferenceDocs .NETWORK_DISCONNECT_TROUBLESHOOTING
323+ );
324+ }
325+ }
326+ });
257327 }
258328 }
259329 } finally {
@@ -276,6 +346,21 @@ private void connectToNodeOrRetry(
276346 );
277347 }
278348
349+ @ Override
350+ public void retainConnectionHistory (List <DiscoveryNode > nodes ) {
351+ List <String > nodeIds = nodes .stream ().map (node -> node .getId ()).collect (Collectors .toList ());
352+
353+ final int startSize = nodeHistory .size ();
354+ // the keyset propagates changes to the underlying map
355+ nodeHistory .keySet ().retainAll (nodeIds );
356+ logger .trace ("Connection history garbage-collected from {} to {} entries" , startSize , nodeHistory .size ());
357+ }
358+
359+ @ Override
360+ public int connectionHistorySize () {
361+ return nodeHistory .size ();
362+ }
363+
279364 /**
280365 * Returns a connection for the given node if the node is connected.
281366 * Connections returned from this method must not be closed. The lifecycle of this connection is
0 commit comments