-
Notifications
You must be signed in to change notification settings - Fork 25.6k
15290: Log Unexpected Disconnects #127736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
1584302
a6679d7
4cbca2c
9d05a06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,6 +47,12 @@ public class ClusterConnectionManager implements ConnectionManager { | |
| .newConcurrentMap(); | ||
| private final AbstractRefCounted connectingRefCounter = AbstractRefCounted.of(this::pendingConnectionsComplete); | ||
|
|
||
| record NodeConnectionHistory(String ephemeralId, long disconnectTime) {} | ||
|
|
||
| // map of nodeId -> NodeConnectionHistory entries updated on connection close, with any error | ||
| private final ConcurrentMap<String, NodeConnectionHistory> nodeHistory = ConcurrentCollections.newConcurrentMap(); | ||
| private long nodeHistoryLastGC = 0; | ||
|
|
||
| private final Transport transport; | ||
| private final ThreadContext threadContext; | ||
| private final ConnectionProfile defaultProfile; | ||
|
|
@@ -226,6 +232,19 @@ private void connectToNodeOrRetry( | |
| } else { | ||
| logger.debug("connected to node [{}]", node); | ||
| managerRefs.mustIncRef(); | ||
|
|
||
| // log case where remote has same ephemeralId as previous connection (the network was disrupted, but not the | ||
| // remote process), and update history with removal (we just connected successfully) | ||
| final DiscoveryNode connNode = conn.getNode(); | ||
| NodeConnectionHistory hist = nodeHistory.remove(connNode.getId()); | ||
| if (hist != null && hist.ephemeralId.equals(connNode.getEphemeralId())) { | ||
| logger.warn( | ||
| """ | ||
| transport connection to [{}] reopened, with same ephemeralId found.""", | ||
| node.descriptionWithoutAttributes() | ||
| ); | ||
| } | ||
|
|
||
| try { | ||
| connectionListener.onNodeConnected(node, conn); | ||
| } finally { | ||
|
|
@@ -235,25 +254,75 @@ private void connectToNodeOrRetry( | |
| managerRefs.decRef(); | ||
| })); | ||
|
|
||
| conn.addCloseListener(ActionListener.running(() -> { | ||
| if (connectingRefCounter.hasReferences() == false) { | ||
| logger.trace("connection manager shut down, closing transport connection to [{}]", node); | ||
| } else if (conn.hasReferences()) { | ||
| logger.info( | ||
| """ | ||
| transport connection to [{}] closed by remote; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| conn.addCloseListener(new ActionListener<Void>() { | ||
| @Override | ||
| public void onResponse(Void ignored) { | ||
| final NodeConnectionHistory hist = new NodeConnectionHistory( | ||
| node.getEphemeralId(), | ||
| System.currentTimeMillis() | ||
| ); | ||
| // In production code we only close connections via ref-counting, so this message confirms that a | ||
| // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a | ||
| // node leaves the cluster with "reason: disconnected" but without this message being logged then | ||
| // that's a bug. | ||
| } else { | ||
| logger.debug("closing unused transport connection to [{}]", node); | ||
| nodeHistory.put(conn.getNode().getId(), hist); | ||
| } | ||
| })); | ||
|
|
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| final NodeConnectionHistory hist = new NodeConnectionHistory( | ||
| node.getEphemeralId(), | ||
| System.currentTimeMillis() | ||
| ); | ||
| nodeHistory.put(conn.getNode().getId(), hist); | ||
| } | ||
| }); | ||
|
|
||
| conn.addCloseListener(ActionListener.running(ClusterConnectionManager.this::collectHistoryGarbage)); | ||
|
|
||
| conn.addCloseListener(new ActionListener<Void>() { | ||
| @Override | ||
| public void onResponse(Void ignored) { | ||
| if (connectingRefCounter.hasReferences() == false) { | ||
| logger.trace("connection manager shut down, closing transport connection to [{}]", node); | ||
| } else if (conn.hasReferences()) { | ||
| // the connection is closing down, but hasn't been released by the client/library side | ||
| // this is an event coming up from the network side | ||
| logger.info( | ||
| """ | ||
| transport connection to [{}] closed by remote; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
| // In production code we only close connections via ref-counting, so this message confirms that | ||
| // a 'node-left ... reason: disconnected' event was caused by external factors. Put | ||
| // differently, if a node leaves the cluster with "reason: disconnected" but without this | ||
| // message being logged then that's a bug. | ||
| } else { | ||
| logger.debug("closing unused transport connection to [{}]", node); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| if (conn.hasReferences()) { | ||
| logger.warn( | ||
| """ | ||
| transport connection to [{}] closed by remote with exception [{}]; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| e, | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
| } else { | ||
| logger.warn( | ||
| """ | ||
| transport connection to [{}] closed with exception [{}]; \ | ||
| if unexpected, see [{}] for troubleshooting guidance""", | ||
| node.descriptionWithoutAttributes(), | ||
| e, | ||
| ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING | ||
| ); | ||
| } | ||
| } | ||
| }); | ||
| } | ||
| } | ||
| } finally { | ||
|
|
@@ -276,6 +345,26 @@ private void connectToNodeOrRetry( | |
| ); | ||
| } | ||
|
|
||
| /** | ||
| * Removes entries in the nodeHistory table that are too old | ||
| */ | ||
| private void collectHistoryGarbage() { | ||
| final long now = System.currentTimeMillis(); | ||
| final long hour = 60 * 60 * 1000; | ||
|
Comment on lines
+351
to
+353
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this should be time-based. I'd rather we dropped the entry when the node is removed from the cluster membership (unexpected membership changes are already logged appropriately - see (If we were to make this time-based then we should make the timeout configurable via a setting rather than hard-coded at 1 hour). In principle we could also just make it size-based, expiring the oldest entries to limit the size of the map to no more than (say) double the size of the cluster. That's what There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea! |
||
|
|
||
| if (now - hour > nodeHistoryLastGC) { | ||
| final int startSize = nodeHistory.size(); | ||
| nodeHistoryLastGC = now; | ||
| final long expire = now - hour; | ||
| for (Map.Entry<String, NodeConnectionHistory> entry : nodeHistory.entrySet()) { | ||
| if (expire > entry.getValue().disconnectTime) { | ||
| nodeHistory.remove(entry.getKey(), entry.getValue()); | ||
| } | ||
| } | ||
| logger.trace("ClusterConnectionManager GCed connection history from {} to {} entries", startSize, nodeHistory.size()); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns a connection for the given node if the node is connected. | ||
| * Connections returned from this method must not be closed. The lifecycle of this connection is | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also use
org.elasticsearch.ExceptionsHelper#maybeDieOnAnotherThreadhere - if anErroris thrown then we cannot continue and must not just quietly suppress it.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm very new to how exceptions are reported here. The maybeDieOnAnotherThread helper is invoked a few lines below -- it's snuck in on line 329 in this PR.
An observation I made is that these changes shouldn't modify how the exception is responded to (I almost did this in a few cases!). In this commit, it's more about pinning it to the channel, so if it's closed there's some attribution around why.
Does this explanation address things? I really barely know what that helper does!