- 
                Notifications
    
You must be signed in to change notification settings  - Fork 25.6k
 
transport: log network reconnects with same peer process #128415
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
7d54620
              bd688bb
              11347ca
              0cc8084
              ab7f490
              81cbcdc
              27438d6
              0514724
              a56e728
              d03eb4d
              f7f8f72
              2becaf1
              8cf607e
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| 
          
            
          
           | 
    @@ -17,15 +17,19 @@ | |||||||
| import org.elasticsearch.cluster.node.DiscoveryNode; | ||||||||
| import org.elasticsearch.cluster.node.DiscoveryNodes; | ||||||||
| import org.elasticsearch.cluster.service.ClusterApplier; | ||||||||
| import org.elasticsearch.common.ReferenceDocs; | ||||||||
| import org.elasticsearch.common.component.AbstractLifecycleComponent; | ||||||||
| import org.elasticsearch.common.settings.Setting; | ||||||||
| import org.elasticsearch.common.settings.Settings; | ||||||||
| import org.elasticsearch.common.util.concurrent.AbstractRunnable; | ||||||||
| import org.elasticsearch.core.Nullable; | ||||||||
| import org.elasticsearch.core.Releasable; | ||||||||
| import org.elasticsearch.core.Releasables; | ||||||||
| import org.elasticsearch.core.TimeValue; | ||||||||
| import org.elasticsearch.injection.guice.Inject; | ||||||||
| import org.elasticsearch.threadpool.ThreadPool; | ||||||||
| import org.elasticsearch.transport.Transport; | ||||||||
| import org.elasticsearch.transport.TransportConnectionListener; | ||||||||
| import org.elasticsearch.transport.TransportService; | ||||||||
| 
     | 
||||||||
| import java.util.ArrayList; | ||||||||
| 
          
            
          
           | 
    @@ -188,6 +192,7 @@ public String toString() { | |||||||
| 
     | 
||||||||
| @Override | ||||||||
| protected void doStart() { | ||||||||
| transportService.addConnectionListener(new ConnectionChangeListener()); | ||||||||
| final ConnectionChecker connectionChecker = new ConnectionChecker(); | ||||||||
| this.connectionChecker = connectionChecker; | ||||||||
| connectionChecker.scheduleNextCheck(); | ||||||||
| 
        
          
        
         | 
    @@ -209,12 +214,36 @@ public void reconnectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletio | |||||||
| }); | ||||||||
| } | ||||||||
| 
     | 
||||||||
| private class ConnectionTarget { | ||||||||
| // exposed for testing | ||||||||
| protected ConnectionTarget connectionTargetForNode(DiscoveryNode node) { | ||||||||
| synchronized (mutex) { | ||||||||
| return targetsByNode.get(node); | ||||||||
| } | ||||||||
| } | ||||||||
| 
     | 
||||||||
| /** | ||||||||
| * Time of disconnect in absolute time ({@link ThreadPool#absoluteTimeInMillis()}), | ||||||||
| * and disconnect-causing exception, if any | ||||||||
| */ | ||||||||
| record DisconnectionHistory(long disconnectTimeMillis, @Nullable Exception disconnectCause) { | ||||||||
| public long getDisconnectTimeMillis() { | ||||||||
| return disconnectTimeMillis; | ||||||||
| } | ||||||||
| 
     | 
||||||||
| public Exception getDisconnectCause() { | ||||||||
| return disconnectCause; | ||||||||
| } | ||||||||
                
       | 
||||||||
| } | ||||||||
| 
     | 
||||||||
| protected class ConnectionTarget { | ||||||||
| private final DiscoveryNode discoveryNode; | ||||||||
| 
     | 
||||||||
| private final AtomicInteger consecutiveFailureCount = new AtomicInteger(); | ||||||||
| private final AtomicReference<Releasable> connectionRef = new AtomicReference<>(); | ||||||||
| 
     | 
||||||||
| // access is synchronized by the service mutex | ||||||||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍  | 
||||||||
| protected DisconnectionHistory disconnectionHistory = null; | ||||||||
                
       | 
||||||||
| protected DisconnectionHistory disconnectionHistory = null; | |
| @Nullable // if node is connected | |
| protected DisconnectionHistory disconnectionHistory = null; | 
        
          
              
                Outdated
          
        
      There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: "DisconnectionHistory records are stored their node's ConnectionTarget" — should it be "... stored in their node's ..."?
        
          
              
                Outdated
          
        
      There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have a slight preference for including both the number of milliseconds and a human-readable representation of the time, see e.g. org.elasticsearch.action.support.SubscribableListener#scheduleTimeout. Sometimes these things may be minutes/hours long and it's hard to eyeball such large timespans in terms of milliseconds.
| which disconnected exceptionally [%dms] ago but did not \ | |
| which disconnected exceptionally [%s/%dms] ago but did not \ | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -229,11 +229,23 @@ private void connectToNodeOrRetry( | |
| try { | ||
| connectionListener.onNodeConnected(node, conn); | ||
| } finally { | ||
| conn.addCloseListener(ActionListener.running(() -> { | ||
| connectedNodes.remove(node, conn); | ||
| connectionListener.onNodeDisconnected(node, conn); | ||
| managerRefs.decRef(); | ||
| })); | ||
| conn.addCloseListener(new ActionListener<Void>() { | ||
                
      
                  DaveCTurner marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||
| @Override | ||
| public void onResponse(Void ignored) { | ||
| handleClose(null); | ||
| } | ||
| 
     | 
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| handleClose(e); | ||
| } | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want to store the connection history even when  I guess in that case we would eventually discard the entry via  Do we need to be careful with the timing of calls to   | 
||
| 
     | 
||
| void handleClose(@Nullable Exception e) { | ||
| connectedNodes.remove(node, conn); | ||
| connectionListener.onNodeDisconnected(node, e); | ||
| managerRefs.decRef(); | ||
                
       | 
||
| } | ||
| }); | ||
| 
     | 
||
| conn.addCloseListener(ActionListener.running(() -> { | ||
| if (connectingRefCounter.hasReferences() == false) { | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -10,6 +10,7 @@ | |
| package org.elasticsearch.transport; | ||
| 
     | 
||
| import org.elasticsearch.cluster.node.DiscoveryNode; | ||
| import org.elasticsearch.core.Nullable; | ||
| 
     | 
||
| /** | ||
| * A listener interface that allows to react on transport events. All methods may be | ||
| 
          
            
          
           | 
    @@ -38,5 +39,5 @@ default void onNodeConnected(DiscoveryNode node, Transport.Connection connection | |
| /** | ||
| * Called once a node connection is closed and unregistered. | ||
| */ | ||
| default void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection) {} | ||
| default void onNodeDisconnected(DiscoveryNode node, @Nullable Exception closeException) {} | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mentioned earlier that we could consider pulling this API change out to a separate PR. As things stand I now think we should definitely do that - it's a simple refactoring (needs no test changes) and will make this change much more focussed.  | 
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we need to expose the whole
ConnectionTarget out to tests - we could just allow access to the DisconnectionHistory for a node and keep the ConnectionTarget class private.