elastic · joshua-adams-1 · Sep 3, 2025 · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/...er/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/NodeJoiningIT.java b/...er/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/NodeJoiningIT.java
@@ -11,14 +11,12 @@
 
 import org.apache.logging.log4j.Level;
 import org.elasticsearch.ElasticsearchException;
-import org.elasticsearch.action.support.SubscribableListener;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterApplierService;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.cluster.service.MasterService;
-import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.CollectionUtils;
 import org.elasticsearch.plugins.Plugin;
@@ -85,18 +83,8 @@ public void testNodeTriesToJoinClusterAndThenDifferentMasterIsElected() {
         ensureStableCluster(3);
         String originalMasterNodeName = internalCluster().getMasterName();
         int numberOfNodesOriginallyInCluster = internalCluster().clusterService(originalMasterNodeName).state().getNodes().size();
-        int numberOfMasterNodesOriginallyInCluster = internalCluster().clusterService(originalMasterNodeName)
-            .state()
-            .nodes()
-            .getMasterNodes()
-            .size();
-        int numberOfDataNodesOriginallyInCluster = internalCluster().clusterService(originalMasterNodeName)
-            .state()
-            .nodes()
-            .getDataNodes()
-            .size();
         // Determine upfront who we want the next master to be
-        final var newMasterNodeName = nodeNames.getFirst();
+        final var newMasterNodeName = randomValueOtherThan(originalMasterNodeName, () -> randomFrom(nodeNames));
 
         // Ensure the logging is as expected
         try (var mockLog = MockLog.capture(NodeJoinExecutor.class)) {
@@ -128,14 +116,15 @@ public void testNodeTriesToJoinClusterAndThenDifferentMasterIsElected() {
             assertNotEquals(originalMasterNodeName, internalCluster().getMasterName());
             logger.info("New master is elected");
 
+            // Assert all nodes have accepted N into their cluster state
+            assertNewNodeIsInAllClusterStates(newNodeName);
+
             mockLog.assertAllExpectationsMatched();
 
             // Assert the new data node was added
             DiscoveryNodes discoveryNodes = internalCluster().clusterService().state().nodes();
             assertEquals(numberOfNodesOriginallyInCluster + 1, discoveryNodes.getSize());
-            assertEquals(numberOfDataNodesOriginallyInCluster + 1, discoveryNodes.getDataNodes().size());
             assertTrue(getListOfDataNodeNamesFromCluster(newMasterNodeName).contains(newNodeName));
-            assertEquals(numberOfMasterNodesOriginallyInCluster, discoveryNodes.getMasterNodes().size());
         }
     }
 
@@ -149,20 +138,10 @@ public void testNodeTriesToJoinClusterAndThenSameMasterIsElected() {
         ensureStableCluster(3);
         String masterNodeName = internalCluster().getMasterName();
 
-        long originalTerm = internalCluster().clusterService(masterNodeName).state().coordinationMetadata().term();
+        long originalTerm = getTerm(masterNodeName);
         int numberOfNodesOriginallyInCluster = internalCluster().clusterService(masterNodeName).state().getNodes().size();
-        int numberOfMasterNodesOriginallyInCluster = internalCluster().clusterService(masterNodeName)
-            .state()
-            .nodes()
-            .getMasterNodes()
-            .size();
-        int numberOfDataNodesOriginallyInCluster = internalCluster().clusterService(masterNodeName).state().nodes().getDataNodes().size();
-        String[] namesOfAllNodesInOriginalCluster = internalCluster().getNodeNames();
 
-        // Ensure the logging is as expected
         try (var mockLog = MockLog.capture(NodeJoinExecutor.class, MasterService.class, ClusterApplierService.class)) {
-            SubscribableListener<Void> publishingBanRemovedListener = null;
-
             for (String nodeName : internalCluster().getNodeNames()) {
                 final var mockTransportService = MockTransportService.getInstance(nodeName);
 
@@ -175,18 +154,16 @@ public void testNodeTriesToJoinClusterAndThenSameMasterIsElected() {
 
                     // Wait until the master has stepped down before removing the publishing ban
                     // This allows the master to be re-elected
-                    publishingBanRemovedListener = ClusterServiceUtils.addTemporaryStateListener(
-                        internalCluster().clusterService(masterNodeName),
-                        clusterState -> {
-                            DiscoveryNode currentMasterNode = clusterState.nodes().getMasterNode();
-                            boolean hasMasterSteppedDown = currentMasterNode == null
-                                || currentMasterNode.getName().equals(masterNodeName) == false;
-                            if (hasMasterSteppedDown) {
-                                mockTransportService.addSendBehavior(Transport.Connection::sendRequest);
-                            }
-                            return hasMasterSteppedDown;
+                    ClusterServiceUtils.addTemporaryStateListener(internalCluster().clusterService(masterNodeName), clusterState -> {
+                        DiscoveryNode currentMasterNode = clusterState.nodes().getMasterNode();
+                        boolean hasMasterSteppedDown = currentMasterNode == null
+                            || currentMasterNode.getName().equals(masterNodeName) == false;
+                        if (hasMasterSteppedDown) {
+                            logger.info("Master publishing ban removed");
+                            mockTransportService.addSendBehavior(Transport.Connection::sendRequest);
                         }
-                    );
+                        return hasMasterSteppedDown;
+                    });
 
                 } else {
                     // This disables pre-voting on all nodes except the master, forcing it to win the election
@@ -220,45 +197,29 @@ Therefore, this WARN log should not be thrown (#ES-11449)
             logger.info("Sending node join request");
             String newNodeName = internalCluster().startDataOnlyNode();
 
-            safeAwait(publishingBanRemovedListener);
-            logger.info("Master publishing ban removed");
+            // Assert the master was re-elected
+            assertTrue(masterNodeName.equals(internalCluster().getMasterName()) && originalTerm < getTerm(masterNodeName));
 
-            // Wait until the master acknowledges its re-election. The master is only re-elected once it's publishing ban is lifted
-            SubscribableListener<Void> masterKnowsItsReElectedListener = ClusterServiceUtils.addTemporaryStateListener(
-                internalCluster().getInstance(ClusterService.class, masterNodeName),
-                clusterState -> {
-                    DiscoveryNode currentMasterNode = clusterState.nodes().getMasterNode();
-                    long currentTerm = clusterState.coordinationMetadata().term();
-                    return currentMasterNode != null && currentMasterNode.getName().equals(masterNodeName) && currentTerm > originalTerm;
-                }
-            );
-            safeAwait(masterKnowsItsReElectedListener);
-
-            assertEquals(masterNodeName, internalCluster().getMasterName());
-            logger.info("Master has been re-elected");
-
-            try {
-                // Await for N to be in the cluster state of all nodes
-                for (String nodeName : namesOfAllNodesInOriginalCluster) {
-                    ClusterServiceUtils.awaitClusterState(
-                        logger,
-                        clusterState -> nodeExistsWithName(clusterState.nodes(), newNodeName),
-                        internalCluster().clusterService(nodeName)
-                    );
-                }
-            } catch (Exception e) {
-                throw new RuntimeException(e);
-            }
+            // Assert all nodes have accepted N into their cluster state
+            assertNewNodeIsInAllClusterStates(newNodeName);
 
             // If the WARN log was thrown, then the connection to N was disconnected so fail the test
             mockLog.assertAllExpectationsMatched();
 
             // Assert the new data node was added
             DiscoveryNodes discoveryNodes = internalCluster().clusterService().state().nodes();
             assertEquals(numberOfNodesOriginallyInCluster + 1, discoveryNodes.getSize());
-            assertEquals(numberOfDataNodesOriginallyInCluster + 1, discoveryNodes.getDataNodes().size());
             assertTrue(getListOfDataNodeNamesFromCluster(masterNodeName).contains(newNodeName));
-            assertEquals(numberOfMasterNodesOriginallyInCluster, discoveryNodes.getMasterNodes().size());
+        }
+    }
+
+    private long getTerm(String masterNodeName) { // coordination term recorded in the named node's current cluster state
+        return internalCluster().clusterService(masterNodeName).state().coordinationMetadata().term();
+    }
+
+    private void assertNewNodeIsInAllClusterStates(String newNodeName) { // checks the state held by every node's ClusterService
+        for (ClusterService service : internalCluster().getInstances(ClusterService.class)) {
+            assertTrue(service.state().nodes().getAllNodes().stream().map(DiscoveryNode::getName).anyMatch(newNodeName::equals));
         }
     }
 
@@ -273,25 +234,13 @@ private List<String> getListOfDataNodeNamesFromCluster(String nodeName) {
             .toList();
     }
 
-    private boolean nodeExistsWithName(DiscoveryNodes nodes, String nodeName) {
-        for (DiscoveryNode node : nodes.getAllNodes()) {
-            if (node.getName().equals(nodeName)) {
-                return true;
-            }
-        }
-        return false;
-    }
-
     private void addJoiningNodeDisconnectedWarnLogFalseExpectation(MockLog mockLog) {
         mockLog.addExpectation(
             new MockLog.UnseenEventExpectation(
                 "warn message with troubleshooting link",
                 "org.elasticsearch.cluster.coordination.NodeJoinExecutor",
                 Level.WARN,
-                "node-join: [*] with reason ["
-                    + ReferenceDocs.UNSTABLE_CLUSTER_TROUBLESHOOTING
-                    + "]; for troubleshooting guidance, see "
-                    + "https://www.elastic.co/docs/troubleshoot/elasticsearch/troubleshooting-unstable-cluster*"
+                "*" // bare wildcard: presumably any WARN logged by NodeJoinExecutor now trips this unseen-event expectation
             )
         );
     }

diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
@@ -668,51 +668,57 @@ private void handleJoinRequest(JoinRequest joinRequest, ActionListener<Void> joi
         transportService.connectToNode(joinRequest.getSourceNode(), new ActionListener<>() {
             @Override
             public void onResponse(Releasable response) {
-                validateJoinRequest(
-                    joinRequest,
-                    ActionListener.runBefore(joinListener, () -> Releasables.close(response))
-                        .delegateFailure((l, ignored) -> processJoinRequest(joinRequest, l.delegateResponse((ignoredListener, e) -> {
-
-                            /*
-                                This prevents a corner case, explained in #ES-11449, occurring as follows:
-                                - Master M is in term T and has cluster state (T, V).
-                                - Node N tries to join the cluster.
-                                - M proposes cluster state (T, V+1) with N in the cluster.
-                                - M accepts its own proposal and commits it to disk.
-                                - M receives no responses. M doesn't know whether the state was accepted by a majority of nodes,
-                                    rejected, or did not reach any nodes.
-                                - There is a re-election and M wins. M publishes cluster state (T+1, V+2).
-                                  Since it's built from the cluster state on disk, N is still in the cluster.
-                                - Since (T, V+1) failed, a FailedToCommitClusterStateException is thrown and N's connection is dropped,
-                                    even though its inclusion in the cluster may have been committed on a majority of master nodes.
-                                - It can rejoin, but this throws a WARN log since it did not restart.
-
-                                The above situation occurs here when a FailedToCommitClusterStateException is thrown.
-                                When we catch this exception, we keep the connection open until the next cluster state update.
-                                N is accepted -> By waiting, we did not close the connection to N unnecessarily
-                                N is rejected -> A new cluster state is published without N in it. Closing is correct here.
-                             */
-                            if (e instanceof FailedToCommitClusterStateException) {
-                                ClusterStateListener clusterStateListener = new ClusterStateListener() {
-                                    @Override
-                                    public void clusterChanged(ClusterChangedEvent event) {
-                                        // Keep the connection open until the next committed state
-                                        if (event.state().nodes().getMasterNode() != null) {
-                                            Releasables.close(response);
-                                            // Remove this listener to avoid memory leaks
-                                            clusterService.removeListener(this);
-                                            joinListener.onResponse(null);
+                SubscribableListener
+                    // Validates the join request: can the remote node deserialize our cluster state and does it respond to pings?
+                    .<Void>newForked(l -> validateJoinRequest(joinRequest, l))
+
+                    // Adds the joining node to the cluster state
+                    .<Void>andThen(l -> processJoinRequest(joinRequest, l.delegateResponse((ll, e) -> {
+                        // #ES-11449
+                        if (e instanceof FailedToCommitClusterStateException) {
+                            // The commit failed (i.e. master is failing over) but this does not imply that the join has actually failed:
+                            // the next master may have already accepted the state that we just published and will therefore include the
+                            // joining node in its future states too. Thus we need to wait for the next committed state before we know the
+                            // eventual outcome, and we need to wait for that before we can release (our ref to) the connection and complete
+                            // the listener.
+
+                            // NB we are on the master update thread here at the end of processing the failed cluster state update, so this
+                            // all happens before any cluster state update that re-elects a master
+                            assert ThreadPool.assertCurrentThreadPool(MasterService.MASTER_UPDATE_THREAD_NAME);
+
+                            final ClusterStateListener clusterStateListener = new ClusterStateListener() {
+                                @Override
+                                public void clusterChanged(ClusterChangedEvent event) {
+                                    final var discoveryNodes = event.state().nodes();
+                                    // Keep the connection open until the next committed state
+                                    if (discoveryNodes.getMasterNode() != null) {
+                                        // Remove this listener to avoid memory leaks
+                                        clusterService.removeListener(this);
+
+                                        if (discoveryNodes.nodeExists(joinRequest.getSourceNode().getId())) {
+                                            ll.onResponse(null);
+                                        } else {
+                                            ll.onFailure(e);
                                         }
                                     }
-                                };
+                                }
+                            };
+                            clusterService.addListener(clusterStateListener);
 
-                                clusterService.addListener(clusterStateListener);
-                            } else {
-                                Releasables.close(response);
-                                joinListener.onFailure(e);
+                            // Immediate condition check in case another node is elected master
+                            if (clusterService.state().nodes().nodeExists(joinRequest.getSourceNode().getId())) {
+                                // Remove this listener to avoid memory leaks (NB SubscribableListener discards all but the first result)
+                                clusterService.removeListener(clusterStateListener);
+
+                                ll.onResponse(null);
                             }
-                        })))
-                );
+                        } else {
+                            ll.onFailure(e);
+                        }
+                    })))
+
+                    // Whatever the outcome, release (our ref to) the connection we just opened and notify the joining node.
+                    .addListener(ActionListener.runBefore(joinListener, () -> Releasables.close(response)));
             }
 
             @Override