Skip to content

Commit 6b4e320

Browse files
Dhia Ayachi and ncabatoff authored
check if server is in configuration when receiving a voteRequest (#526)
* check if server is in configuration, and does not have vote rights, when receiving a voteRequest * change test term to 20 to reduce the chance that the test passes when the bug manifests Co-authored-by: Nick Cabatoff <ncabatoff@hashicorp.com> * a vote request from a non-voter with a higher term makes the node step down but does not grant a vote * fix test to check we use the right term * add more details in the comment and a reference to the PR Co-authored-by: Nick Cabatoff <ncabatoff@hashicorp.com>
1 parent 24e68f8 commit 6b4e320

File tree

3 files changed

+115
-2
lines changed

3 files changed

+115
-2
lines changed

configuration.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,17 @@ func hasVote(configuration Configuration, id ServerID) bool {
177177
return false
178178
}
179179

180+
// inConfiguration returns true if the server identified by 'id' is in in the
181+
// provided Configuration.
182+
func inConfiguration(configuration Configuration, id ServerID) bool {
183+
for _, server := range configuration.Servers {
184+
if server.ID == id {
185+
return true
186+
}
187+
}
188+
return false
189+
}
190+
180191
// checkConfiguration tests a cluster membership configuration for common
181192
// errors.
182193
func checkConfiguration(configuration Configuration) error {

raft.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1566,8 +1566,8 @@ func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
15661566
candidateID := ServerID(req.ID)
15671567
// if the Servers list is empty, that means the cluster is very likely trying to bootstrap,
15681568
// Grant the vote
1569-
if len(r.configurations.latest.Servers) > 0 && !hasVote(r.configurations.latest, candidateID) {
1570-
r.logger.Warn("rejecting vote request since node is not a voter",
1569+
if len(r.configurations.latest.Servers) > 0 && !inConfiguration(r.configurations.latest, candidateID) {
1570+
r.logger.Warn("rejecting vote request since node is not in configuration",
15711571
"from", candidate)
15721572
return
15731573
}
@@ -1594,6 +1594,18 @@ func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
15941594
resp.Term = req.Term
15951595
}
15961596

1597+
// if we get a request for vote from a nonVoter and the request term is higher,
1598+
// step down and update term, but reject the vote request
1599+
// This could happen when a node, previously voter, is converted to non-voter
1600+
// We need to step in here so that the cluster can still make progress in such a scenario
1601+
// More details about that in https://github.com/hashicorp/raft/pull/526
1602+
if len(req.ID) > 0 {
1603+
candidateID := ServerID(req.ID)
1604+
if len(r.configurations.latest.Servers) > 0 && !hasVote(r.configurations.latest, candidateID) {
1605+
r.logger.Warn("rejecting vote request since node is not a voter", "from", candidate)
1606+
return
1607+
}
1608+
}
15971609
// Check if we have voted yet
15981610
lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm)
15991611
if err != nil && err.Error() != "not found" {

raft_test.go

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2644,6 +2644,96 @@ func TestRaft_VoteNotGranted_WhenNodeNotInCluster(t *testing.T) {
26442644
}
26452645
}
26462646

2647+
func TestRaft_ClusterCanRegainStability_WhenNonVoterWithHigherTermJoin(t *testing.T) {
2648+
// Make a cluster
2649+
c := MakeCluster(3, t, nil)
2650+
2651+
defer c.Close()
2652+
2653+
// Get the leader
2654+
leader := c.Leader()
2655+
2656+
// Wait until we have 2 followers
2657+
limit := time.Now().Add(c.longstopTimeout)
2658+
var followers []*Raft
2659+
for time.Now().Before(limit) && len(followers) != 2 {
2660+
c.WaitEvent(nil, c.conf.CommitTimeout)
2661+
followers = c.GetInState(Follower)
2662+
}
2663+
if len(followers) != 2 {
2664+
t.Fatalf("expected two followers: %v", followers)
2665+
}
2666+
2667+
// Remove a follower
2668+
followerRemoved := followers[0]
2669+
c.Disconnect(followerRemoved.localAddr)
2670+
time.Sleep(c.propagateTimeout)
2671+
2672+
future := leader.RemoveServer(followerRemoved.localID, 0, 0)
2673+
if err := future.Error(); err != nil {
2674+
t.Fatalf("err: %v", err)
2675+
}
2676+
2677+
//set that follower term to higher term to faster simulate a partitioning
2678+
newTerm := leader.getCurrentTerm() + 20
2679+
followerRemoved.setCurrentTerm(newTerm)
2680+
//Add the node back as NonVoter
2681+
future = leader.AddNonvoter(followerRemoved.localID, followerRemoved.localAddr, 0, 0)
2682+
if err := future.Error(); err != nil {
2683+
t.Fatalf("err: %v", err)
2684+
}
2685+
2686+
c.FullyConnect()
2687+
2688+
// Wait a while
2689+
time.Sleep(c.propagateTimeout)
2690+
// Check the term is now a new term
2691+
leader = c.Leader()
2692+
currentTerm := leader.getCurrentTerm()
2693+
if newTerm > currentTerm {
2694+
t.Fatalf("term should have changed,%d < %d", newTerm, currentTerm)
2695+
}
2696+
2697+
// check nonVoter is not elected
2698+
if leader.localID == followerRemoved.localID {
2699+
t.Fatalf("Should not be leader %s", followerRemoved.localID)
2700+
}
2701+
2702+
//Write some logs to ensure they replicate
2703+
for i := 0; i < 100; i++ {
2704+
future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
2705+
if err := future.Error(); err != nil {
2706+
t.Fatalf("[ERR] apply err: %v", err)
2707+
}
2708+
}
2709+
c.WaitForReplication(100)
2710+
2711+
//Remove the server and add it back as Voter
2712+
future = leader.RemoveServer(followerRemoved.localID, 0, 0)
2713+
if err := future.Error(); err != nil {
2714+
t.Fatalf("err: %v", err)
2715+
}
2716+
leader.AddVoter(followerRemoved.localID, followerRemoved.localAddr, 0, 0)
2717+
2718+
// Wait a while
2719+
time.Sleep(c.propagateTimeout * 10)
2720+
2721+
//Write some logs to ensure they replicate
2722+
for i := 100; i < 200; i++ {
2723+
future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
2724+
if err := future.Error(); err != nil {
2725+
t.Fatalf("[ERR] apply err: %v", err)
2726+
}
2727+
}
2728+
c.WaitForReplication(200)
2729+
2730+
// Check leader stable
2731+
newLeader := c.Leader()
2732+
if newLeader.leaderID != leader.leaderID {
2733+
t.Fatalf("leader changed")
2734+
}
2735+
}
2736+
26472737
// TestRaft_FollowerRemovalNoElection ensures that a leader election is not
26482738
// started when a standby is shut down and restarted.
26492739
func TestRaft_FollowerRemovalNoElection(t *testing.T) {

0 commit comments

Comments
 (0)