- 
                Notifications
    
You must be signed in to change notification settings  - Fork 25.6k
 
transport: log network reconnects with same peer process #128415
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
7d54620
              bd688bb
              11347ca
              0cc8084
              ab7f490
              81cbcdc
              27438d6
              0514724
              a56e728
              d03eb4d
              f7f8f72
              2becaf1
              8cf607e
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -99,11 +99,13 @@ public void connectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletion) | |
| } | ||
| 
     | 
||
| final List<Runnable> runnables = new ArrayList<>(discoveryNodes.getSize()); | ||
| final List<DiscoveryNode> nodes = new ArrayList<>(discoveryNodes.getSize()); | ||
| try (var refs = new RefCountingRunnable(onCompletion)) { | ||
| synchronized (mutex) { | ||
| // Ugly hack: when https://github.com/elastic/elasticsearch/issues/94946 is fixed, just iterate over discoveryNodes here | ||
| for (final Iterator<DiscoveryNode> iterator = discoveryNodes.mastersFirstStream().iterator(); iterator.hasNext();) { | ||
| final DiscoveryNode discoveryNode = iterator.next(); | ||
| nodes.add(discoveryNode); | ||
| ConnectionTarget connectionTarget = targetsByNode.get(discoveryNode); | ||
| final boolean isNewNode = connectionTarget == null; | ||
| if (isNewNode) { | ||
| 
        
          
        
         | 
    @@ -120,6 +122,7 @@ public void connectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletion) | |
| runnables.add(connectionTarget.connect(null)); | ||
| } | ||
| } | ||
| transportService.retainConnectionHistory(nodes); | ||
                
       | 
||
| } | ||
| } | ||
| runnables.forEach(Runnable::run); | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -27,11 +27,15 @@ | |
| 
     | 
||
| import java.util.Collections; | ||
| import java.util.Iterator; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.concurrent.ConcurrentMap; | ||
| import java.util.concurrent.CountDownLatch; | ||
| import java.util.concurrent.atomic.AtomicBoolean; | ||
| import java.util.stream.Collectors; | ||
| 
     | 
||
| import static org.elasticsearch.core.Strings.format; | ||
| 
     | 
||
| /** | ||
| * This class manages node connections within a cluster. The connection is opened by the underlying transport. | ||
| 
        
          
        
         | 
    @@ -47,6 +51,10 @@ public class ClusterConnectionManager implements ConnectionManager { | |
| .newConcurrentMap(); | ||
| private final AbstractRefCounted connectingRefCounter = AbstractRefCounted.of(this::pendingConnectionsComplete); | ||
| 
     | 
||
| record NodeConnectionHistory(String ephemeralId, Exception disconnectCause) {} | ||
| 
     | 
||
| private final ConcurrentMap<String, NodeConnectionHistory> nodeHistory = ConcurrentCollections.newConcurrentMap(); | ||
| 
     | 
||
| private final Transport transport; | ||
| private final ThreadContext threadContext; | ||
| private final ConnectionProfile defaultProfile; | ||
| 
          
            
          
           | 
    @@ -226,6 +234,29 @@ private void connectToNodeOrRetry( | |
| } else { | ||
| logger.debug("connected to node [{}]", node); | ||
| managerRefs.mustIncRef(); | ||
| 
     | 
||
| // log case where the remote node has same ephemeralId as its previous connection | ||
| // (the network was disrupted, but not the remote process) | ||
| final DiscoveryNode connNode = conn.getNode(); | ||
| NodeConnectionHistory hist = nodeHistory.remove(connNode.getId()); | ||
| if (hist != null && hist.ephemeralId.equals(connNode.getEphemeralId())) { | ||
                
       | 
||
| if (hist.disconnectCause != null) { | ||
| logger.warn( | ||
| () -> format( | ||
| "transport connection reopened to node with same ephemeralId [%s], close exception:", | ||
                
       | 
||
| node.descriptionWithoutAttributes() | ||
| ), | ||
| hist.disconnectCause | ||
| ); | ||
| } else { | ||
| logger.warn( | ||
| """ | ||
| transport connection reopened to node with same ephemeralId [{}]""", | ||
| node.descriptionWithoutAttributes() | ||
| ); | ||
                
      
                  nicktindall marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||
| } | ||
| } | ||
| 
     | 
||
| try { | ||
| connectionListener.onNodeConnected(node, conn); | ||
| } finally { | ||
| 
        
          
        
         | 
    @@ -235,25 +266,65 @@ private void connectToNodeOrRetry( | |
| managerRefs.decRef(); | ||
| })); | ||
| 
     | 
||
| conn.addCloseListener(ActionListener.running(() -> { | ||
| if (connectingRefCounter.hasReferences() == false) { | ||
| logger.trace("connection manager shut down, closing transport connection to [{}]", node); | ||
| } else if (conn.hasReferences()) { | ||
| logger.info( | ||
| """ | ||
| transport connection to [{}] closed by remote; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
| // In production code we only close connections via ref-counting, so this message confirms that a | ||
| // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a | ||
| // node leaves the cluster with "reason: disconnected" but without this message being logged then | ||
| // that's a bug. | ||
| } else { | ||
| logger.debug("closing unused transport connection to [{}]", node); | ||
| conn.addCloseListener(new ActionListener<Void>() { | ||
                
      
                  DaveCTurner marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||
| @Override | ||
| public void onResponse(Void ignored) { | ||
| final NodeConnectionHistory hist = new NodeConnectionHistory(node.getEphemeralId(), null); | ||
| nodeHistory.put(conn.getNode().getId(), hist); | ||
| } | ||
| })); | ||
| 
     | 
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| final NodeConnectionHistory hist = new NodeConnectionHistory(node.getEphemeralId(), e); | ||
| nodeHistory.put(conn.getNode().getId(), hist); | ||
| } | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we want to store the connection history even when [inline code lost in extraction]? I guess in that case we would eventually discard the entry via [inline code lost in extraction]. Do we need to be careful with the timing of calls to [inline code lost in extraction]? | 
||
| }); | ||
| 
     | 
||
| conn.addCloseListener(new ActionListener<Void>() { | ||
| @Override | ||
| public void onResponse(Void ignored) { | ||
| if (connectingRefCounter.hasReferences() == false) { | ||
| logger.trace("connection manager shut down, closing transport connection to [{}]", node); | ||
| } else if (conn.hasReferences()) { | ||
| logger.info( | ||
| """ | ||
| transport connection to [{}] closed by remote; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
| // In production code we only close connections via ref-counting, so this message confirms that | ||
| // a 'node-left ... reason: disconnected' event was caused by external factors. Put | ||
| // differently, if a node leaves the cluster with "reason: disconnected" but without this | ||
| // message being logged then that's a bug. | ||
| } else { | ||
| logger.debug("closing unused transport connection to [{}]", node); | ||
| } | ||
| } | ||
| 
     | 
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| if (conn.hasReferences()) { | ||
| logger.warn( | ||
| """ | ||
| transport connection to [{}] closed by remote with exception [{}]; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
                
       | 
||
| node.descriptionWithoutAttributes(), | ||
| e, | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
| } else { | ||
| logger.warn( | ||
| """ | ||
| transport connection to [{}] closed with exception [{}]; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| e, | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
                
       | 
||
| } | ||
| } | ||
| }); | ||
| } | ||
| } | ||
| } finally { | ||
| 
        
          
        
         | 
    @@ -276,6 +347,21 @@ private void connectToNodeOrRetry( | |
| ); | ||
| } | ||
| 
     | 
||
| @Override | ||
| public void retainConnectionHistory(List<DiscoveryNode> nodes) { | ||
| List<String> nodeIds = nodes.stream().map(node -> node.getId()).collect(Collectors.toList()); | ||
| 
     | 
||
| final int startSize = nodeHistory.size(); | ||
| // the keyset propagates changes to the underlying map | ||
| nodeHistory.keySet().retainAll(nodeIds); | ||
| logger.trace("Connection history garbage-collected from {} to {} entries", startSize, nodeHistory.size()); | ||
| } | ||
| 
     | 
||
| @Override | ||
| public int connectionHistorySize() { | ||
| return nodeHistory.size(); | ||
| } | ||
| 
     | 
||
| /** | ||
| * Returns a connection for the given node if the node is connected. | ||
| * Connections returned from this method must not be closed. The lifecycle of this connection is | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -15,6 +15,7 @@ | |
| import org.elasticsearch.core.Releasable; | ||
| 
     | 
||
| import java.io.Closeable; | ||
| import java.util.List; | ||
| import java.util.Set; | ||
| import java.util.concurrent.CopyOnWriteArrayList; | ||
| 
     | 
||
| 
          
            
          
           | 
    @@ -50,6 +51,16 @@ void connectToNode( | |
| 
     | 
||
| ConnectionProfile getConnectionProfile(); | ||
| 
     | 
||
| /** | ||
| * Keep the connection history for the nodes listed | ||
| */ | ||
| void retainConnectionHistory(List<DiscoveryNode> nodes); | ||
                
       | 
||
| 
     | 
||
| /** | ||
| * Exposed for tests | ||
| */ | ||
| int connectionHistorySize(); | ||
| 
     | 
||
| @FunctionalInterface | ||
| interface ConnectionValidator { | ||
| void validate(Transport.Connection connection, ConnectionProfile profile, ActionListener<Void> listener); | ||
| 
          
            
          
           | 
    ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we put this into its own test suite? This suite is supposed to be about `ESLoggingHandler`, which is unrelated to the logging in `ClusterConnectionManager`. I think this test should work fine in the `:server` test suite; there is no need to hide it in the `transport-netty4` module.

Also, could you open a separate PR to move `testConnectionLogging` and `testExceptionalDisconnectLogging` out of this test suite - they're testing the logging in `TcpTransport`, which is similarly unrelated to `ESLoggingHandler`. IIRC they were added here for historical reasons, but these days we use the Netty transport everywhere so these should work in `:server` too.