Skip to content

Commit 21d7b30

Browse files
Don't fail on first node-join publication failure
Extends the `Coordinator.handleJoinRequest` `onFailure` method in the callback to only fail if the node is not in already in the `ClusterState`. This avoids a rare corner case where a master node's `ClusterState` already has a node in it that is attempting to join the cluster, and it logs an incorrect error message
1 parent 75ca874 commit 21d7b30

File tree

2 files changed

+138
-20
lines changed

2 files changed

+138
-20
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -672,28 +672,32 @@ public void onResponse(Releasable response) {
672672

673673
@Override
674674
public void onFailure(Exception e) {
675-
logger.warn(
676-
() -> format(
677-
"received join request from [%s] but could not connect back to the joining node",
678-
joinRequest.getSourceNode()
679-
),
680-
e
681-
);
682-
683-
joinListener.onFailure(
684-
// NodeDisconnectedException mainly to suppress uninteresting stack trace
685-
new NodeDisconnectedException(
686-
joinRequest.getSourceNode(),
687-
String.format(
688-
Locale.ROOT,
689-
"failure when opening connection back from [%s] to [%s]",
690-
getLocalNode().descriptionWithoutAttributes(),
691-
joinRequest.getSourceNode().descriptionWithoutAttributes()
675+
// The failure of the first node-join publication does not imply that the join failed,
676+
// because it may be retried and eventually succeed
677+
if (!applierState.nodes().nodeExists(joinRequest.getSourceNode())) {
678+
logger.warn(
679+
() -> format(
680+
"received join request from [%s] but could not connect back to the joining node",
681+
joinRequest.getSourceNode()
692682
),
693-
JoinHelper.JOIN_ACTION_NAME,
694683
e
695-
)
696-
);
684+
);
685+
686+
joinListener.onFailure(
687+
// NodeDisconnectedException mainly to suppress uninteresting stack trace
688+
new NodeDisconnectedException(
689+
joinRequest.getSourceNode(),
690+
String.format(
691+
Locale.ROOT,
692+
"failure when opening connection back from [%s] to [%s]",
693+
getLocalNode().descriptionWithoutAttributes(),
694+
joinRequest.getSourceNode().descriptionWithoutAttributes()
695+
),
696+
JoinHelper.JOIN_ACTION_NAME,
697+
e
698+
)
699+
);
700+
}
697701
}
698702
});
699703
}

server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,6 +1474,120 @@ public void assertMatched() {
14741474
}
14751475
}
14761476

1477+
// TODO:
1478+
// 1: Refactor this test and above
1479+
// 2: Add a test where it fails to begin with, then the handshake block is removed and it succeeds
1480+
@TestLogging(
1481+
reason = "test includes assertions about logging",
1482+
value = "org.elasticsearch.cluster.coordination.Coordinator:WARN,org.elasticsearch.cluster.coordination.JoinHelper:INFO"
1483+
)
1484+
public void testDoesNotReportConnectBackProblemsDuringJoiningIfNodeIsInClusterState() throws Exception {
1485+
try (var cluster = new Cluster(3)) {
1486+
cluster.runRandomly();
1487+
cluster.stabilise();
1488+
1489+
final var partitionedNode = cluster.getAnyNode();
1490+
partitionedNode.disconnect();
1491+
cluster.stabilise();
1492+
1493+
logger.info("--> removed [{}] but adding to master's cluster state", partitionedNode);
1494+
final ClusterNode leader = cluster.getAnyLeader();
1495+
leader.submitUpdateTask("updating cluster state",
1496+
cs -> {
1497+
ClusterState cs2 = ClusterState.builder(cs)
1498+
.nodes(DiscoveryNodes.builder(cs.nodes())
1499+
.add(partitionedNode.getLocalNode())
1500+
.build())
1501+
.build();
1502+
// Insert breakpoint here
1503+
return cs2;
1504+
},
1505+
(e) -> {}
1506+
);
1507+
1508+
logger.info("--> healing [{}] but blocking handshakes", partitionedNode);
1509+
partitionedNode.heal();
1510+
leader.addActionBlock(TransportService.HANDSHAKE_ACTION_NAME);
1511+
1512+
try (var mockLog = MockLog.capture(Coordinator.class, JoinHelper.class)) {
1513+
1514+
// Since the node is in the cluster state, we do not expect this log
1515+
mockLog.addExpectation(
1516+
new MockLog.UnseenEventExpectation(
1517+
"connect-back failure",
1518+
Coordinator.class.getCanonicalName(),
1519+
Level.WARN,
1520+
"*received join request from ["
1521+
+ partitionedNode.getLocalNode().descriptionWithoutAttributes()
1522+
+ "] but could not connect back to the joining node"
1523+
)
1524+
);
1525+
1526+
// We do not expect an info log from JoinHelper about the handshake failure
1527+
mockLog.addExpectation(new MockLog.LoggingExpectation() {
1528+
boolean matched = false;
1529+
1530+
@Override
1531+
public void match(LogEvent event) {
1532+
if (event.getLevel() != Level.INFO) {
1533+
return;
1534+
}
1535+
if (event.getLoggerName().equals(JoinHelper.class.getCanonicalName()) == false) {
1536+
return;
1537+
}
1538+
1539+
var cause = event.getThrown();
1540+
if (cause == null) {
1541+
return;
1542+
}
1543+
cause = cause.getCause();
1544+
if (cause == null) {
1545+
return;
1546+
}
1547+
if (Regex.simpleMatch(
1548+
"* failure when opening connection back from ["
1549+
+ leader.getLocalNode().descriptionWithoutAttributes()
1550+
+ "] to ["
1551+
+ partitionedNode.getLocalNode().descriptionWithoutAttributes()
1552+
+ "]",
1553+
cause.getMessage()
1554+
) == false) {
1555+
return;
1556+
}
1557+
if (cause.getStackTrace() != null && cause.getStackTrace().length != 0) {
1558+
return;
1559+
}
1560+
matched = true;
1561+
}
1562+
1563+
@Override
1564+
public void assertMatched() {
1565+
assertFalse(matched);
1566+
}
1567+
});
1568+
1569+
cluster.runFor(
1570+
// This expects 8 tasks to be executed after PeerFinder handling wakeup:
1571+
//
1572+
// * connectToRemoteMasterNode[0.0.0.0:11]
1573+
// * [internal:transport/handshake] from {node1} to {node2}
1574+
// * response to [internal:transport/handshake] from {node1} to {node2}
1575+
// * [internal:discovery/request_peers] from {node1} to
1576+
// * response to [internal:discovery/request_peers] from {node1} to {node2}
1577+
// * [internal:cluster/coordination/join] from {node1} to {node2}
1578+
// * [internal:transport/handshake] from {node2} to {node1} (rejected due to action block)
1579+
// * error response to [internal:cluster/coordination/join] from {node1} to {node2}
1580+
//
1581+
defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING) + 8 * DEFAULT_DELAY_VARIABILITY,
1582+
"allowing time for join attempt"
1583+
);
1584+
mockLog.assertAllExpectationsMatched();
1585+
}
1586+
1587+
leader.clearActionBlocks();
1588+
}
1589+
}
1590+
14771591
public void testFollowerRemovedIfUnableToSendRequestsToMaster() {
14781592
try (Cluster cluster = new Cluster(3)) {
14791593
cluster.runRandomly();

0 commit comments

Comments
 (0)