Skip to content

Commit 5650240

Browse files
committed
NRG: Disjoint majorities during membership changes and network partitions
This commit fixes the following bugs: - Inconsistent Cluster Size: When a leader was partitioned from the cluster immediately after proposing an EntryAddPeer, the remaining nodes could end up with a different view of the cluster size and quorum. Followers could have a cluster size that did not match the number of peers in their peer set. A subsequent leader election, electing one of the followers, could break the quorum system. - Incorrect Leader Election: It was possible for a new leader to be elected without a proper quorum. This could happen if a partition occurred after a new peer was proposed but before that change was committed. A follower could add the uncommitted peer to its peer set but would not update its cluster size and quorum, leading to an invalid election. Both issues are solved by making sure that when a peer is added to or removed from the membership, the cluster size and quorum are adjusted accordingly, at the same time. Previously, followers would first add peers when receiving the EntryAddPeer, and then adjust the cluster size only after commit. This patch changes this behavior such that the cluster size and quorum are recomputed upon receiving the EntryAddPeer / EntryRemovePeer proposals. This is in line with the membership protocol proposed in Ongaro's dissertation, section 4.1. This patch also removes the concept of a "known" peer from the Raft layer. A node would add a peer to its peer set when first receiving the corresponding appendEntry, and on commit it would be marked as "known". This distinction no longer applies. Signed-off-by: Daniele Sciascia <daniele@nats.io>
1 parent 9c3ed25 commit 5650240

File tree

4 files changed

+214
-54
lines changed

4 files changed

+214
-54
lines changed

server/monitor.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4190,8 +4190,10 @@ func (s *Server) Raftz(opts *RaftzOptions) *RaftzStatus {
41904190
}
41914191
peer := RaftzGroupPeer{
41924192
Name: s.serverNameForNode(id),
4193-
Known: p.kp,
41944193
LastReplicatedIndex: p.li,
4194+
// The Raft layer no longer distinguishes between
4195+
// 'known' and 'unknown' peers.
4196+
Known: true,
41954197
}
41964198
if !p.ts.IsZero() {
41974199
peer.LastSeen = time.Since(p.ts).String()

server/raft.go

Lines changed: 26 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,6 @@ type catchupState struct {
251251
type lps struct {
252252
ts time.Time // Last timestamp
253253
li uint64 // Last index replicated
254-
kp bool // Known peer
255254
}
256255

257256
const (
@@ -541,13 +540,13 @@ func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabel
541540
}
542541

543542
// Make sure to track ourselves.
544-
n.peers[n.id] = &lps{time.Now(), 0, true}
543+
n.peers[n.id] = &lps{time.Now(), 0}
545544

546545
// Track known peers
547546
for _, peer := range ps.knownPeers {
548547
if peer != n.id {
549548
// Set these to 0 to start but mark as known peer.
550-
n.peers[peer] = &lps{time.Time{}, 0, true}
549+
n.peers[peer] = &lps{time.Time{}, 0}
551550
}
552551
}
553552

@@ -2620,13 +2619,10 @@ func (n *raft) addPeer(peer string) {
26202619
delete(n.removed, peer)
26212620
}
26222621

2623-
if lp, ok := n.peers[peer]; !ok {
2622+
if _, ok := n.peers[peer]; !ok {
26242623
// We are not tracking this one automatically so we need
26252624
// to bump cluster size.
2626-
n.peers[peer] = &lps{time.Time{}, 0, true}
2627-
} else {
2628-
// Mark as added.
2629-
lp.kp = true
2625+
n.peers[peer] = &lps{time.Time{}, 0}
26302626
}
26312627

26322628
// Adjust cluster size and quorum if needed.
@@ -3191,29 +3187,15 @@ func (n *raft) applyCommit(index uint64) error {
31913187
}
31923188
}
31933189
case EntryAddPeer:
3194-
newPeer := string(e.Data)
3195-
n.debug("Added peer %q", newPeer)
3196-
3197-
// Store our peer in our global peer map for all peers.
3198-
peers.LoadOrStore(newPeer, newPeer)
3199-
3200-
n.addPeer(newPeer)
3201-
32023190
// We pass these up as well.
32033191
committed = append(committed, e)
32043192

32053193
// We are done with this membership change
32063194
n.membChanging = false
3207-
32083195
case EntryRemovePeer:
32093196
peer := string(e.Data)
32103197
n.debug("Removing peer %q", peer)
32113198

3212-
n.removePeer(peer)
3213-
3214-
// Remove from string intern map.
3215-
peers.Delete(peer)
3216-
32173199
// We pass these up as well.
32183200
committed = append(committed, e)
32193201

@@ -3301,25 +3283,20 @@ func (n *raft) trackResponse(ar *appendEntryResponse) bool {
33013283
// Used to adjust cluster size and peer count based on added official peers.
33023284
// lock should be held.
33033285
func (n *raft) adjustClusterSizeAndQuorum() {
3304-
pcsz, ncsz := n.csz, 0
3305-
for _, peer := range n.peers {
3306-
if peer.kp {
3307-
ncsz++
3308-
}
3309-
}
3310-
n.csz = ncsz
3286+
pcsz := n.csz
3287+
n.csz = len(n.peers)
33113288
n.qn = n.csz/2 + 1
33123289

3313-
if ncsz > pcsz {
3314-
n.debug("Expanding our clustersize: %d -> %d", pcsz, ncsz)
3290+
if n.csz > pcsz {
3291+
n.debug("Expanding our clustersize: %d -> %d", pcsz, n.csz)
33153292
n.lsut = time.Now()
3316-
} else if ncsz < pcsz {
3317-
n.debug("Decreasing our clustersize: %d -> %d", pcsz, ncsz)
3293+
} else if n.csz < pcsz {
3294+
n.debug("Decreasing our clustersize: %d -> %d", pcsz, n.csz)
33183295
if n.State() == Leader {
33193296
go n.sendHeartbeat()
33203297
}
33213298
}
3322-
if ncsz != pcsz {
3299+
if n.csz != pcsz {
33233300
n.recreateInternalSubsLocked()
33243301
}
33253302
}
@@ -3337,7 +3314,7 @@ func (n *raft) trackPeer(peer string) error {
33373314
}
33383315
}
33393316
if n.State() == Leader {
3340-
if lp, ok := n.peers[peer]; !ok || !lp.kp {
3317+
if _, ok := n.peers[peer]; !ok {
33413318
// Check if this peer had been removed previously.
33423319
needPeerAdd = !isRemoved
33433320
}
@@ -3937,14 +3914,17 @@ CONTINUE:
39373914
}
39383915
}
39393916
case EntryAddPeer:
3940-
if newPeer := string(e.Data); len(newPeer) == idLen {
3941-
// Track directly, but wait for commit to be official
3942-
if _, ok := n.peers[newPeer]; !ok {
3943-
n.peers[newPeer] = &lps{time.Time{}, 0, false}
3944-
}
3945-
// Store our peer in our global peer map for all peers.
3946-
peers.LoadOrStore(newPeer, newPeer)
3947-
}
3917+
peer := string(e.Data)
3918+
// Store our peer in our global peer map for all peers.
3919+
peers.LoadOrStore(peer, peer)
3920+
n.addPeer(peer)
3921+
n.debug("Added peer %q", peer)
3922+
case EntryRemovePeer:
3923+
peer := string(e.Data)
3924+
// Remove from string intern map.
3925+
peers.Delete(peer)
3926+
n.removePeer(peer)
3927+
n.debug("Removed peer %q", peer)
39483928
}
39493929
}
39503930

@@ -4006,10 +3986,9 @@ func (n *raft) processPeerState(ps *peerState) {
40063986
n.peers = make(map[string]*lps)
40073987
for _, peer := range ps.knownPeers {
40083988
if lp := old[peer]; lp != nil {
4009-
lp.kp = true
40103989
n.peers[peer] = lp
40113990
} else {
4012-
n.peers[peer] = &lps{time.Time{}, 0, true}
3991+
n.peers[peer] = &lps{time.Time{}, 0}
40133992
}
40143993
}
40153994
n.debug("Update peers from leader to %+v", n.peers)
@@ -4251,10 +4230,8 @@ func decodePeerState(buf []byte) (*peerState, error) {
42514230
// Lock should be held.
42524231
func (n *raft) peerNames() []string {
42534232
var peers []string
4254-
for name, peer := range n.peers {
4255-
if peer.kp {
4256-
peers = append(peers, name)
4257-
}
4233+
for name := range n.peers {
4234+
peers = append(peers, name)
42584235
}
42594236
return peers
42604237
}

server/raft_helpers_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func (sg smGroup) leader() stateMachine {
5757
return nil
5858
}
5959

60-
func (sg smGroup) followers() []stateMachine {
60+
func (sg smGroup) followers() smGroup {
6161
var f []stateMachine
6262
for _, sm := range sg {
6363
if sm.node().Leader() {

server/raft_test.go

Lines changed: 184 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3870,9 +3870,6 @@ func TestNRGQuorumAfterLeaderStepdown(t *testing.T) {
38703870
require_NoError(t, n.trackPeer(nats1))
38713871
require_True(t, n.Quorum())
38723872
require_Len(t, len(n.peers), 3)
3873-
for _, ps := range n.peers {
3874-
ps.kp = true
3875-
}
38763873

38773874
// If we hand off leadership to another server, we should
38783875
// still be reporting we have quorum.
@@ -4680,3 +4677,187 @@ func TestNRGPartitionedPeerRemove(t *testing.T) {
46804677
return nil
46814678
})
46824679
}
4680+
4681+
func TestNRGPeerAddAndPartitionLeader(t *testing.T) {
4682+
c := createJetStreamClusterExplicit(t, "R3S", 3)
4683+
defer c.shutdown()
4684+
4685+
hub, rtf, rg := c.createMockMemRaftGroup("MOCK", 3, newStateAdder)
4686+
4687+
leader := rg.waitOnLeader()
4688+
followers := rg.followers()
4689+
4690+
// When the leader sends an EntryAddPeer, isolate it from
4691+
// the rest of the cluster.
4692+
hub.setAfterMsgHook(func(subject, reply string, msg []byte) {
4693+
if subject != "$NRG.AE.MOCK" {
4694+
return
4695+
}
4696+
ae, _ := decodeAppendEntry(msg, nil, reply)
4697+
if ae == nil || len(ae.entries) != 1 {
4698+
return
4699+
}
4700+
if ae.leader != leader.node().ID() {
4701+
return
4702+
}
4703+
if ae.entries[0].Type == EntryAddPeer {
4704+
hub.partition(leader.node().ID(), 1)
4705+
}
4706+
})
4707+
4708+
// Add a new node and expect a new leader to be elected
4709+
newNode := c.addMockMemRaftNode("MOCK", rtf, newStateAdder)
4710+
newGroup := append(followers, newNode)
4711+
newLeader := newGroup.waitOnLeader()
4712+
require_True(t, newLeader != nil)
4713+
4714+
// If bug is present: The new leader has not yet committed the
4715+
// appendEntry containing EntryAddPeer. The new leader (and
4716+
// followers) would not update their cluster size until the
4717+
// EntryAddPeer is committed. So that we have the following
4718+
// sequence of events:
4719+
// 1) the new leader has initially cluster size of 3
4720+
// 2) it sends a peerState message with cluster size 3
4721+
// 3) the leader commits the EntryAddPeer from the previous
4722+
// leader applies it. Now cluster size is 4.
4723+
// 4) the followers commit the EntryAddPeer, incrementing
4724+
// their cluster size to 4. Next, the EntryPeerState is
4725+
// committed and the cluster size goes back to 3.
4726+
// 5) At this point cluster size from the leader has diverged
4727+
// from the cluster size of its followers.
4728+
4729+
// Expect all nodes to report cluster size of 4
4730+
for _, n := range newGroup {
4731+
checkFor(t, 1*time.Second, 10*time.Millisecond, func() error {
4732+
if n.node().ClusterSize() != 4 {
4733+
return errors.New("node addition still in progress")
4734+
}
4735+
return nil
4736+
})
4737+
}
4738+
4739+
// Finally bring back the old leader
4740+
hub.healPartitions()
4741+
checkFor(t, 1*time.Second, 10*time.Millisecond, func() error {
4742+
if leader.node().MembershipChangeInProgress() {
4743+
return errors.New("membership still in progress")
4744+
}
4745+
if leader.node().ClusterSize() != 4 {
4746+
return errors.New("node addition still in progress")
4747+
}
4748+
return nil
4749+
})
4750+
}
4751+
4752+
func TestNRGPeerRemoveAndPartitionLeader(t *testing.T) {
4753+
c := createJetStreamClusterExplicit(t, "R5S", 5)
4754+
defer c.shutdown()
4755+
4756+
hub, _, rg := c.createMockMemRaftGroup("MOCK", 5, newStateAdder)
4757+
4758+
leader := rg.waitOnLeader()
4759+
followers := rg.followers()
4760+
4761+
// When the leader sends an EntryRemovePeer, isolate it from
4762+
// the rest of the cluster.
4763+
hub.setAfterMsgHook(func(subject, reply string, msg []byte) {
4764+
if subject != "$NRG.AE.MOCK" {
4765+
return
4766+
}
4767+
ae, _ := decodeAppendEntry(msg, nil, reply)
4768+
if ae == nil || len(ae.entries) != 1 {
4769+
return
4770+
}
4771+
if ae.leader != leader.node().ID() {
4772+
return
4773+
}
4774+
if ae.entries[0].Type == EntryRemovePeer {
4775+
hub.partition(leader.node().ID(), 1)
4776+
}
4777+
})
4778+
4779+
leader.node().ProposeRemovePeer(leader.node().ID())
4780+
4781+
// Expect followers to elect a new leader
4782+
newLeader := followers.waitOnLeader()
4783+
require_True(t, newLeader != nil)
4784+
4785+
// Expect all nodes to report cluster size 4
4786+
for _, n := range followers {
4787+
checkFor(t, 1*time.Second, 10*time.Millisecond, func() error {
4788+
if n.node().ClusterSize() != 4 {
4789+
return errors.New("node addition still in progress")
4790+
}
4791+
return nil
4792+
})
4793+
}
4794+
}
4795+
4796+
func TestNRGLeaderWithoutQuorumAfterPeerAdd(t *testing.T) {
4797+
c := createJetStreamClusterExplicit(t, "R3S", 3)
4798+
defer c.shutdown()
4799+
4800+
hub, rtf, rg := c.createMockMemRaftGroup("MOCK", 3, newStateAdder)
4801+
defer hub.healPartitions()
4802+
4803+
leader := rg.waitOnLeader()
4804+
followers := rg.followers()
4805+
4806+
// Set up an after-message hook to create a partition as soon as
4807+
// the leader publishes an EntryAddPeer. The partition will
4808+
// prevent committing the entry.
4809+
hub.setAfterMsgHook(func(subject, reply string, msg []byte) {
4810+
if subject != "$NRG.AE.MOCK" {
4811+
return
4812+
}
4813+
ae, _ := decodeAppendEntry(msg, nil, reply)
4814+
if ae == nil || len(ae.entries) != 1 {
4815+
return
4816+
}
4817+
if ae.leader != leader.node().ID() {
4818+
return
4819+
}
4820+
4821+
// After EntryAddPeer is published, partition the
4822+
// leader and one of the followers. This partition
4823+
// can't commit the entry.
4824+
if ae.entries[0].Type == EntryAddPeer {
4825+
hub.partition(leader.node().ID(), 1)
4826+
hub.partition(followers[0].node().ID(), 1)
4827+
}
4828+
})
4829+
4830+
newNode := c.addMockMemRaftNode("MOCK", rtf, newStateAdder)
4831+
4832+
// At some point here the cluster gets partitioned in two
4833+
// parts: {leader, followers[0]} and {newNode, followers[1]}.
4834+
// Neither side should be able to make progress.
4835+
newGroup := smGroup{newNode, followers[1]}
4836+
newLeader := newGroup.waitOnLeader()
4837+
4838+
// If the bug is present: we managed to elect a new leader,
4839+
// in a 4 node cluster, with only two nodes in the partition!
4840+
// This is because of the following sequence of events:
4841+
// 1) the follower has received the EntryAddPeer
4842+
// 2) the leader and the other follower have partitioned away
4843+
// 3) the entry is uncommitted, however the follower has added
4844+
// the new peer to its peer set, but won't adjust cluster
4845+
// size and quorum until after the entry is committed.
4846+
// 4) follower becomes a candidate and will become leader
4847+
// with a single vote from the new node
4848+
require_Equal(t, newLeader, nil)
4849+
4850+
// Check that node addition completes after healing the partition
4851+
hub.healPartitions()
4852+
rg = append(rg, newNode)
4853+
newLeader = rg.waitOnLeader()
4854+
require_True(t, newLeader != nil)
4855+
for _, n := range rg {
4856+
checkFor(t, 1*time.Second, 10*time.Millisecond, func() error {
4857+
if n.node().ClusterSize() != 4 {
4858+
return errors.New("node addition still in progress")
4859+
}
4860+
return nil
4861+
})
4862+
}
4863+
}

0 commit comments

Comments
 (0)