Skip to content

Commit 856c3dd

Browse files
authored
Merge pull request #20071 from ahrtr/20250530_force_new_cluster_3.6
[release-3.6] Fix the issue that `--force-new-cluster` can't remove all other members in a corner case
2 parents 322b79c + 78e0e23 commit 856c3dd

File tree

3 files changed

+117
-2
lines changed

3 files changed

+117
-2
lines changed

server/etcdserver/bootstrap.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ func bootstrap(cfg config.ServerConfig) (b *bootstrappedServer, err error) {
8787
if err = fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
8888
return nil, fmt.Errorf("cannot write to WAL directory: %w", err)
8989
}
90-
bwal = bootstrapWALFromSnapshot(cfg, backend.snapshot)
90+
cfg.Logger.Info("Bootstrapping WAL from snapshot")
91+
bwal = bootstrapWALFromSnapshot(cfg, backend.snapshot, backend.ci)
9192
}
9293

9394
cfg.Logger.Info("bootstrapping cluster")
@@ -556,7 +557,7 @@ func (b *bootstrappedRaft) newRaftNode(ss *snap.Snapshotter, wal *wal.WAL, cl *m
556557
)
557558
}
558559

559-
func bootstrapWALFromSnapshot(cfg config.ServerConfig, snapshot *raftpb.Snapshot) *bootstrappedWAL {
560+
func bootstrapWALFromSnapshot(cfg config.ServerConfig, snapshot *raftpb.Snapshot, ci cindex.ConsistentIndexer) *bootstrappedWAL {
560561
wal, st, ents, snap, meta := openWALFromSnapshot(cfg, snapshot)
561562
bwal := &bootstrappedWAL{
562563
lg: cfg.Logger,
@@ -569,6 +570,19 @@ func bootstrapWALFromSnapshot(cfg config.ServerConfig, snapshot *raftpb.Snapshot
569570
}
570571

571572
if cfg.ForceNewCluster {
573+
consistentIndex := ci.ConsistentIndex()
574+
oldCommitIndex := bwal.st.Commit
575+
// If only `HardState.Commit` increases, HardState won't be persisted
576+
// to disk, even though the committed entries might have already been
577+
// applied. This can result in consistent_index > CommitIndex.
578+
//
579+
// When restarting etcd with `--force-new-cluster`, all uncommitted
580+
// entries are dropped. To avoid losing entries that were actually
581+
// committed, we reset Commit to max(HardState.Commit, consistent_index).
582+
//
583+
// See: https://github.com/etcd-io/raft/pull/300 for more details.
584+
bwal.st.Commit = max(oldCommitIndex, consistentIndex)
585+
572586
// discard the previously uncommitted entries
573587
bwal.ents = bwal.CommitedEntries()
574588
entries := bwal.NewConfigChangeEntries()
@@ -578,6 +592,7 @@ func bootstrapWALFromSnapshot(cfg config.ServerConfig, snapshot *raftpb.Snapshot
578592
"forcing restart member",
579593
zap.String("cluster-id", meta.clusterID.String()),
580594
zap.String("local-member-id", meta.nodeID.String()),
595+
zap.Uint64("wal-commit-index", oldCommitIndex),
581596
zap.Uint64("commit-index", bwal.st.Commit),
582597
)
583598
} else {

tests/e2e/force_new_cluster_test.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,22 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//go:build !cluster_proxy
16+
1517
package e2e
1618

1719
import (
1820
"context"
21+
"encoding/json"
1922
"testing"
23+
"time"
2024

2125
"github.com/stretchr/testify/require"
2226

27+
"go.etcd.io/bbolt"
28+
"go.etcd.io/etcd/server/v3/etcdserver/api/membership"
29+
"go.etcd.io/etcd/server/v3/storage/datadir"
30+
"go.etcd.io/etcd/server/v3/storage/schema"
2331
"go.etcd.io/etcd/tests/v3/framework/config"
2432
"go.etcd.io/etcd/tests/v3/framework/e2e"
2533
)
@@ -70,3 +78,63 @@ func TestForceNewCluster(t *testing.T) {
7078
})
7179
}
7280
}
81+
82+
func TestForceNewCluster_MemberCount(t *testing.T) {
83+
e2e.BeforeTest(t)
84+
85+
ctx := context.Background()
86+
87+
epc, promotedMembers := mustCreateNewClusterByPromotingMembers(t, e2e.CurrentVersion, 3, e2e.WithKeepDataDir(true))
88+
require.Len(t, promotedMembers, 2)
89+
90+
// Wait for the backend TXN to sync/commit the data to disk, to ensure
91+
// the consistent-index is persisted. Another way is to issue a snapshot
92+
// command to forcibly commit the backend TXN.
93+
time.Sleep(time.Second)
94+
95+
t.Log("Killing all the members")
96+
require.NoError(t, epc.Kill())
97+
require.NoError(t, epc.Wait(ctx))
98+
99+
m := epc.Procs[0]
100+
t.Logf("Forcibly create a one-member cluster with member: %s", m.Config().Name)
101+
m.Config().Args = append(m.Config().Args, "--force-new-cluster")
102+
require.NoError(t, m.Start(ctx))
103+
104+
t.Log("Online checking the member count")
105+
mresp, merr := m.Etcdctl().MemberList(ctx, false)
106+
require.NoError(t, merr)
107+
require.Len(t, mresp.Members, 1)
108+
109+
t.Log("Closing the member")
110+
require.NoError(t, m.Close())
111+
require.NoError(t, m.Wait(ctx))
112+
113+
t.Log("Offline checking the member count")
114+
members := mustReadMembersFromBoltDB(t, m.Config().DataDirPath)
115+
require.Len(t, members, 1)
116+
}
117+
118+
func mustReadMembersFromBoltDB(t *testing.T, dataDir string) []*membership.Member {
119+
dbPath := datadir.ToBackendFileName(dataDir)
120+
db, err := bbolt.Open(dbPath, 0o400, &bbolt.Options{ReadOnly: true})
121+
require.NoError(t, err)
122+
defer func() {
123+
require.NoError(t, db.Close())
124+
}()
125+
126+
var members []*membership.Member
127+
_ = db.View(func(tx *bbolt.Tx) error {
128+
b := tx.Bucket(schema.Members.Name())
129+
_ = b.ForEach(func(k, v []byte) error {
130+
m := membership.Member{}
131+
err := json.Unmarshal(v, &m)
132+
require.NoError(t, err)
133+
members = append(members, &m)
134+
return nil
135+
})
136+
return nil
137+
})
138+
139+
return members
140+
}

tests/framework/e2e/cluster.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,6 +1002,38 @@ func (epc *EtcdProcessCluster) rollingStart(f func(ep EtcdProcess) error) error
10021002
return nil
10031003
}
10041004

1005+
func (epc *EtcdProcessCluster) Kill() (err error) {
1006+
for _, p := range epc.Procs {
1007+
if p == nil {
1008+
continue
1009+
}
1010+
if curErr := p.Kill(); curErr != nil {
1011+
if err != nil {
1012+
err = fmt.Errorf("%w; %w", err, curErr)
1013+
} else {
1014+
err = curErr
1015+
}
1016+
}
1017+
}
1018+
return err
1019+
}
1020+
1021+
func (epc *EtcdProcessCluster) Wait(ctx context.Context) error {
1022+
closedC := make(chan error, len(epc.Procs))
1023+
for i := range epc.Procs {
1024+
go func(n int) {
1025+
epc.Procs[n].Wait(ctx)
1026+
closedC <- epc.Procs[n].Wait(ctx)
1027+
}(i)
1028+
}
1029+
for range epc.Procs {
1030+
if err := <-closedC; err != nil {
1031+
return err
1032+
}
1033+
}
1034+
return nil
1035+
}
1036+
10051037
func (epc *EtcdProcessCluster) Stop() (err error) {
10061038
for _, p := range epc.Procs {
10071039
if p == nil {

0 commit comments

Comments
 (0)