Skip to content

Commit 5ff538b

Browse files
authored
transport: log network reconnects with same peer process (#129439)
NodeConnectionsService now caches the ephemeralId (created on process-start) of peer nodes on network disconnect on the node's ConnectionTarget object. On reconnect, when a peer has the same ephemeralId as it did in the previous connection, this is logged to indicate a network failure instead of a restart. Closes ES-11448
1 parent a604089 commit 5ff538b

File tree

2 files changed

+206
-1
lines changed

2 files changed

+206
-1
lines changed

server/src/main/java/org/elasticsearch/cluster/NodeConnectionsService.java

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,19 @@
1717
import org.elasticsearch.cluster.node.DiscoveryNode;
1818
import org.elasticsearch.cluster.node.DiscoveryNodes;
1919
import org.elasticsearch.cluster.service.ClusterApplier;
20+
import org.elasticsearch.common.ReferenceDocs;
2021
import org.elasticsearch.common.component.AbstractLifecycleComponent;
2122
import org.elasticsearch.common.settings.Setting;
2223
import org.elasticsearch.common.settings.Settings;
2324
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
25+
import org.elasticsearch.core.Nullable;
2426
import org.elasticsearch.core.Releasable;
2527
import org.elasticsearch.core.Releasables;
2628
import org.elasticsearch.core.TimeValue;
2729
import org.elasticsearch.injection.guice.Inject;
2830
import org.elasticsearch.threadpool.ThreadPool;
31+
import org.elasticsearch.transport.Transport;
32+
import org.elasticsearch.transport.TransportConnectionListener;
2933
import org.elasticsearch.transport.TransportService;
3034

3135
import java.util.ArrayList;
@@ -188,6 +192,7 @@ public String toString() {
188192

189193
@Override
190194
protected void doStart() {
195+
transportService.addConnectionListener(new ConnectionChangeListener());
191196
final ConnectionChecker connectionChecker = new ConnectionChecker();
192197
this.connectionChecker = connectionChecker;
193198
connectionChecker.scheduleNextCheck();
@@ -209,12 +214,33 @@ public void reconnectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletio
209214
});
210215
}
211216

217+
// exposed for testing
218+
protected DisconnectionHistory disconnectionHistoryForNode(DiscoveryNode node) {
219+
synchronized (mutex) {
220+
ConnectionTarget connectionTarget = targetsByNode.get(node);
221+
if (connectionTarget != null) {
222+
return connectionTarget.disconnectionHistory;
223+
}
224+
}
225+
return null;
226+
}
227+
228+
/**
229+
* Time of disconnect in absolute time ({@link ThreadPool#absoluteTimeInMillis()}),
230+
* and disconnect-causing exception, if any
231+
*/
232+
record DisconnectionHistory(long disconnectTimeMillis, @Nullable Exception disconnectCause) {}
233+
212234
private class ConnectionTarget {
213235
private final DiscoveryNode discoveryNode;
214236

215237
private final AtomicInteger consecutiveFailureCount = new AtomicInteger();
216238
private final AtomicReference<Releasable> connectionRef = new AtomicReference<>();
217239

240+
// access is synchronized by the service mutex
241+
@Nullable // null when node is connected or initialized; non-null in between disconnects and connects
242+
private DisconnectionHistory disconnectionHistory = null;
243+
218244
// all access to these fields is synchronized
219245
private List<Releasable> pendingRefs;
220246
private boolean connectionInProgress;
@@ -345,4 +371,70 @@ public String toString() {
345371
return "ConnectionTarget{discoveryNode=" + discoveryNode + '}';
346372
}
347373
}
374+
375+
/**
376+
* Receives connection/disconnection events from the transport, and records them in per-node DisconnectionHistory
377+
* structures for logging network issues. DisconnectionHistory records are stored in their node's ConnectionTarget.
378+
*
379+
* Network issues (that this listener monitors for) occur whenever a reconnection to a node succeeds,
380+
* and it has the same ephemeral ID as it did during the last connection; this happens when a connection event
381+
* occurs, and its ConnectionTarget entry has a previous DisconnectionHistory stored.
382+
*/
383+
private class ConnectionChangeListener implements TransportConnectionListener {
384+
@Override
385+
public void onNodeConnected(DiscoveryNode node, Transport.Connection connection) {
386+
DisconnectionHistory disconnectionHistory = null;
387+
synchronized (mutex) {
388+
ConnectionTarget connectionTarget = targetsByNode.get(node);
389+
if (connectionTarget != null) {
390+
disconnectionHistory = connectionTarget.disconnectionHistory;
391+
connectionTarget.disconnectionHistory = null;
392+
}
393+
}
394+
395+
if (disconnectionHistory != null) {
396+
long millisSinceDisconnect = threadPool.absoluteTimeInMillis() - disconnectionHistory.disconnectTimeMillis;
397+
TimeValue timeValueSinceDisconnect = TimeValue.timeValueMillis(millisSinceDisconnect);
398+
if (disconnectionHistory.disconnectCause != null) {
399+
logger.warn(
400+
() -> format(
401+
"""
402+
reopened transport connection to node [%s] \
403+
which disconnected exceptionally [%s/%dms] ago but did not \
404+
restart, so the disconnection is unexpected; \
405+
see [%s] for troubleshooting guidance""",
406+
node.descriptionWithoutAttributes(),
407+
timeValueSinceDisconnect,
408+
millisSinceDisconnect,
409+
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
410+
),
411+
disconnectionHistory.disconnectCause
412+
);
413+
} else {
414+
logger.warn(
415+
"""
416+
reopened transport connection to node [{}] \
417+
which disconnected gracefully [{}/{}ms] ago but did not \
418+
restart, so the disconnection is unexpected; \
419+
see [{}] for troubleshooting guidance""",
420+
node.descriptionWithoutAttributes(),
421+
timeValueSinceDisconnect,
422+
millisSinceDisconnect,
423+
ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
424+
);
425+
}
426+
}
427+
}
428+
429+
@Override
430+
public void onNodeDisconnected(DiscoveryNode node, @Nullable Exception closeException) {
431+
DisconnectionHistory disconnectionHistory = new DisconnectionHistory(threadPool.absoluteTimeInMillis(), closeException);
432+
synchronized (mutex) {
433+
ConnectionTarget connectionTarget = targetsByNode.get(node);
434+
if (connectionTarget != null) {
435+
connectionTarget.disconnectionHistory = disconnectionHistory;
436+
}
437+
}
438+
}
439+
}
348440
}

server/src/test/java/org/elasticsearch/cluster/NodeConnectionsServiceTests.java

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.elasticsearch.action.ActionRunnable;
1818
import org.elasticsearch.action.support.PlainActionFuture;
1919
import org.elasticsearch.action.support.SubscribableListener;
20+
import org.elasticsearch.cluster.NodeConnectionsService.DisconnectionHistory;
2021
import org.elasticsearch.cluster.node.DiscoveryNode;
2122
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
2223
import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
@@ -49,6 +50,7 @@
4950
import org.elasticsearch.transport.TransportRequestOptions;
5051
import org.elasticsearch.transport.TransportService;
5152
import org.elasticsearch.transport.TransportStats;
53+
import org.hamcrest.Matchers;
5254
import org.junit.After;
5355
import org.junit.Before;
5456

@@ -75,6 +77,7 @@ public class NodeConnectionsServiceTests extends ESTestCase {
7577
private ThreadPool threadPool;
7678
private TransportService transportService;
7779
private Map<DiscoveryNode, CheckedRunnable<Exception>> nodeConnectionBlocks;
80+
private Map<DiscoveryNode, Exception> nodeCloseExceptions;
7881

7982
private List<DiscoveryNode> generateNodes() {
8083
List<DiscoveryNode> nodes = new ArrayList<>();
@@ -246,6 +249,110 @@ public String toString() {
246249
assertConnectedExactlyToNodes(transportService, targetNodes);
247250
}
248251

252+
public void testDisconnectionHistory() {
253+
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue();
254+
final ThreadPool threadPool = deterministicTaskQueue.getThreadPool();
255+
final TimeValue reconnectIntervalTimeValue = CLUSTER_NODE_RECONNECT_INTERVAL_SETTING.get(Settings.EMPTY);
256+
final long reconnectIntervalMillis = reconnectIntervalTimeValue.millis();
257+
258+
MockTransport transport = new MockTransport(threadPool);
259+
TestTransportService transportService = new TestTransportService(transport, threadPool);
260+
transportService.start();
261+
transportService.acceptIncomingRequests();
262+
263+
final NodeConnectionsService service = new NodeConnectionsService(Settings.EMPTY, threadPool, transportService);
264+
service.start();
265+
266+
final DiscoveryNode noClose = DiscoveryNodeUtils.create("noClose");
267+
final DiscoveryNode gracefulClose = DiscoveryNodeUtils.create("gracefulClose");
268+
final DiscoveryNode exceptionalClose = DiscoveryNodeUtils.create("exceptionalClose");
269+
270+
nodeCloseExceptions.put(exceptionalClose, new RuntimeException());
271+
272+
final AtomicBoolean connectionCompleted = new AtomicBoolean();
273+
DiscoveryNodes nodes = DiscoveryNodes.builder().add(noClose).add(gracefulClose).add(exceptionalClose).build();
274+
275+
service.connectToNodes(nodes, () -> connectionCompleted.set(true));
276+
deterministicTaskQueue.runAllRunnableTasks();
277+
assertTrue(connectionCompleted.get());
278+
279+
assertNullDisconnectionHistory(service, noClose);
280+
assertNullDisconnectionHistory(service, gracefulClose);
281+
assertNullDisconnectionHistory(service, exceptionalClose);
282+
283+
transportService.disconnectFromNode(gracefulClose);
284+
transportService.disconnectFromNode(exceptionalClose);
285+
286+
// check disconnection history set after close
287+
assertNullDisconnectionHistory(service, noClose);
288+
assertDisconnectionHistoryDetails(service, threadPool, gracefulClose, null);
289+
assertDisconnectionHistoryDetails(service, threadPool, exceptionalClose, RuntimeException.class);
290+
291+
try (var mockLog = MockLog.capture(NodeConnectionsService.class)) {
292+
mockLog.addExpectation(
293+
new MockLog.SeenEventExpectation(
294+
"reconnect after graceful close",
295+
NodeConnectionsService.class.getCanonicalName(),
296+
Level.WARN,
297+
"reopened transport connection to node ["
298+
+ gracefulClose.descriptionWithoutAttributes()
299+
+ "] which disconnected gracefully ["
300+
+ reconnectIntervalTimeValue
301+
+ "/"
302+
+ reconnectIntervalMillis
303+
+ "ms] ago "
304+
+ "but did not restart, so the disconnection is unexpected; "
305+
+ "see [https://www.elastic.co/guide/*] for troubleshooting guidance"
306+
)
307+
);
308+
mockLog.addExpectation(
309+
new MockLog.SeenEventExpectation(
310+
"reconnect after exceptional close",
311+
NodeConnectionsService.class.getCanonicalName(),
312+
Level.WARN,
313+
"reopened transport connection to node ["
314+
+ exceptionalClose.descriptionWithoutAttributes()
315+
+ "] which disconnected exceptionally ["
316+
+ reconnectIntervalTimeValue
317+
+ "/"
318+
+ reconnectIntervalMillis
319+
+ "ms] ago "
320+
+ "but did not restart, so the disconnection is unexpected; "
321+
+ "see [https://www.elastic.co/guide/*] for troubleshooting guidance"
322+
)
323+
);
324+
runTasksUntil(deterministicTaskQueue, deterministicTaskQueue.getCurrentTimeMillis() + reconnectIntervalMillis);
325+
mockLog.assertAllExpectationsMatched();
326+
}
327+
328+
// check on reconnect -- disconnection history is reset
329+
assertNullDisconnectionHistory(service, noClose);
330+
assertNullDisconnectionHistory(service, gracefulClose);
331+
assertNullDisconnectionHistory(service, exceptionalClose);
332+
}
333+
334+
private void assertNullDisconnectionHistory(NodeConnectionsService service, DiscoveryNode node) {
335+
DisconnectionHistory disconnectionHistory = service.disconnectionHistoryForNode(node);
336+
assertNull(disconnectionHistory);
337+
}
338+
339+
private void assertDisconnectionHistoryDetails(
340+
NodeConnectionsService service,
341+
ThreadPool threadPool,
342+
DiscoveryNode node,
343+
@Nullable Class<?> disconnectCauseClass
344+
) {
345+
DisconnectionHistory disconnectionHistory = service.disconnectionHistoryForNode(node);
346+
assertNotNull(disconnectionHistory);
347+
assertTrue(threadPool.absoluteTimeInMillis() - disconnectionHistory.disconnectTimeMillis() >= 0);
348+
assertTrue(threadPool.absoluteTimeInMillis() - disconnectionHistory.disconnectTimeMillis() <= 200);
349+
if (disconnectCauseClass != null) {
350+
assertThat(disconnectionHistory.disconnectCause(), Matchers.isA(disconnectCauseClass));
351+
} else {
352+
assertNull(disconnectionHistory.disconnectCause());
353+
}
354+
}
355+
249356
public void testOnlyBlocksOnConnectionsToNewNodes() throws Exception {
250357
final NodeConnectionsService service = new NodeConnectionsService(Settings.EMPTY, threadPool, transportService);
251358

@@ -526,6 +633,7 @@ public void setUp() throws Exception {
526633
ThreadPool threadPool = new TestThreadPool(getClass().getName());
527634
this.threadPool = threadPool;
528635
nodeConnectionBlocks = newConcurrentMap();
636+
nodeCloseExceptions = newConcurrentMap();
529637
transportService = new TestTransportService(new MockTransport(threadPool), threadPool);
530638
transportService.start();
531639
transportService.acceptIncomingRequests();
@@ -644,7 +752,12 @@ public void addCloseListener(ActionListener<Void> listener1) {
644752

645753
@Override
646754
public void close() {
647-
closeListener.onResponse(null);
755+
Exception closeException = nodeCloseExceptions.get(node);
756+
if (closeException != null) {
757+
closeListener.onFailure(closeException);
758+
} else {
759+
closeListener.onResponse(null);
760+
}
648761
}
649762

650763
@Override

0 commit comments

Comments
 (0)