@@ -669,65 +669,56 @@ private void handleJoinRequest(JoinRequest joinRequest, ActionListener<Void> joi
669
669
transportService .connectToNode (joinRequest .getSourceNode (), new ActionListener <>() {
670
670
@ Override
671
671
public void onResponse (Releasable response ) {
672
- validateJoinRequest (
673
- joinRequest ,
674
- new ActionListener <>() {
675
- @ Override
676
- public void onResponse (Void unused ) {
677
- processJoinRequest (
678
- joinRequest ,
679
- ActionListener .runBefore (
680
- joinListener ,
681
- () -> Releasables .close (response )
682
- )
683
- );
684
- }
672
+ validateJoinRequest (joinRequest , new ActionListener <>() {
673
+ @ Override
674
+ public void onResponse (Void unused ) {
675
+ processJoinRequest (joinRequest , ActionListener .runBefore (joinListener , () -> Releasables .close (response )));
676
+ }
685
677
686
- /*
687
- This prevents a corner case, explained in #ES-11449, occurring as follows:
688
- - Master M is in term T and has cluster state (T, V).
689
- - Node N tries to join the cluster.
690
- - M proposes cluster state (T, V+1) with N in the cluster.
691
- - M accepts its own proposal and commits it to disk.
692
- - M receives no responses. M doesn't know whether the state was accepted by a majority of nodes,
693
- rejected, or did not reach any nodes.
694
- - There is a re-election and M wins. M publishes cluster state (T+1, V+2).
695
- Since it's built from the cluster state on disk, N is still in the cluster.
696
- - Since (T, V+1) failed, a FailedToCommitClusterStateException is thrown and N's connection is dropped,
697
- even though its inclusion in the cluster may have been committed on a majority of master nodes.
698
- - It can rejoin, but this throws a WARN log since it did not restart.
699
-
700
- To mitigate this, we optionally listen for the next committed cluster state update:
701
- 1. (T, V+1) is accepted -> NodeConnectionsService now stores an open connection to N.
702
- The connection can be closed as soon as the node has joined. This is handled by onResponse above.
703
- 2. (T, V+1) is rejected -> A new cluster state is published without N in it
704
- It is right to close the connection and retry. This is handled by onResponse above.
705
- 3. The above scenario occurs, and a FailedToCommitClusterStateException is thrown for state (T, V+1).
706
- Now, we keep the connection open until the next committed cluster state, rather than disconnecting:
707
- 3.1 (T+1, V+2) is accepted -> By waiting, we did not close the connection to N unnecessarily
708
- 3.2 (T+1, V+2) is rejected -> A new cluster state is published without N in it. Closing is correct here.
709
- */
710
- @ Override
711
- public void onFailure (Exception e ) {
712
- if (e instanceof FailedToCommitClusterStateException ) {
713
- ClusterStateListener clusterStateListener = new ClusterStateListener () {
714
- @ Override
715
- public void clusterChanged (ClusterChangedEvent event ) {
716
- // Keep the connection open until the next committed state
717
- if (event .state ().nodes ().getMasterNode () != null ) {
718
- Releasables .close (response );
719
- // Remove this listener to avoid memory leaks
720
- clusterService .removeListener (this );
721
- }
678
+ /*
679
+ This prevents a corner case, explained in #ES-11449, occurring as follows:
680
+ - Master M is in term T and has cluster state (T, V).
681
+ - Node N tries to join the cluster.
682
+ - M proposes cluster state (T, V+1) with N in the cluster.
683
+ - M accepts its own proposal and commits it to disk.
684
+ - M receives no responses. M doesn't know whether the state was accepted by a majority of nodes,
685
+ rejected, or did not reach any nodes.
686
+ - There is a re-election and M wins. M publishes cluster state (T+1, V+2).
687
+ Since it's built from the cluster state on disk, N is still in the cluster.
688
+ - Since (T, V+1) failed, a FailedToCommitClusterStateException is thrown and N's connection is dropped,
689
+ even though its inclusion in the cluster may have been committed on a majority of master nodes.
690
+ - It can rejoin, but this throws a WARN log since it did not restart.
691
+
692
+ To mitigate this, we optionally listen for the next committed cluster state update:
693
+ 1. (T, V+1) is accepted -> NodeConnectionsService now stores an open connection to N.
694
+ The connection can be closed as soon as the node has joined. This is handled by onResponse above.
695
+ 2. (T, V+1) is rejected -> A new cluster state is published without N in it
696
+ It is right to close the connection and retry. This is handled by onResponse above.
697
+ 3. The above scenario occurs, and a FailedToCommitClusterStateException is thrown for state (T, V+1).
698
+ Now, we keep the connection open until the next committed cluster state, rather than disconnecting:
699
+ 3.1 (T+1, V+2) is accepted -> By waiting, we did not close the connection to N unnecessarily
700
+ 3.2 (T+1, V+2) is rejected -> A new cluster state is published without N in it. Closing is correct here.
701
+ */
702
+ @ Override
703
+ public void onFailure (Exception e ) {
704
+ if (e instanceof FailedToCommitClusterStateException ) {
705
+ ClusterStateListener clusterStateListener = new ClusterStateListener () {
706
+ @ Override
707
+ public void clusterChanged (ClusterChangedEvent event ) {
708
+ // Keep the connection open until the next committed state
709
+ if (event .state ().nodes ().getMasterNode () != null ) {
710
+ Releasables .close (response );
711
+ // Remove this listener to avoid memory leaks
712
+ clusterService .removeListener (this );
722
713
}
723
- };
714
+ }
715
+ };
724
716
725
- clusterService .addListener (clusterStateListener );
726
- }
727
- joinListener .onFailure (e );
717
+ clusterService .addListener (clusterStateListener );
728
718
}
719
+ joinListener .onFailure (e );
729
720
}
730
- );
721
+ } );
731
722
}
732
723
733
724
@ Override
0 commit comments