Skip to content

Commit 3c497ef

Browse files
chideatclaude
andcommitted
fix: use SHUTDOWN NOSAVE when dbsize is 0 to prevent data loss during cross-version rolling upgrade
During a rolling upgrade (e.g. 7.2 → 9.0), a node demoted to replica may attempt a fullsync from the new-version master. If the RDB format is incompatible, the load fails after memory has already been cleared, leaving an empty in-memory state. A subsequent SHUTDOWN (with save) would then overwrite the valid dump.rdb with empty data. Fix: re-check Dbsize right before SHUTDOWN. If 0, use SHUTDOWN NOSAVE to preserve the existing dump.rdb (which holds valid pre-upgrade data). If >0, use normal SHUTDOWN. Applies to both failover/sentinel and cluster shutdown paths. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent c1f2c3f commit 3c497ef

File tree

4 files changed

+151
-6
lines changed

4 files changed

+151
-6
lines changed

cmd/helper/commands/cluster/shutdown.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,8 +203,18 @@ func Shutdown(ctx context.Context, c *cli.Context, client *kubernetes.Clientset,
203203
time.Sleep(time.Second * 10)
204204

205205
logger.Info("do shutdown node")
206-
if _, err = valkeyClient.Do(ctx, "SHUTDOWN"); err != nil && !errors.Is(err, io.EOF) {
207-
logger.Error(err, "graceful shutdown failed")
206+
// Re-check dbsize: if 0, memory may have been cleared by a failed fullsync
207+
// (e.g., cross-version RDB incompatibility). Use NOSAVE to preserve existing dump.rdb.
208+
latestInfo, _ := valkeyClient.Info(ctx)
209+
if latestInfo != nil && latestInfo.Dbsize == 0 {
210+
logger.Info("dbsize is 0, using SHUTDOWN NOSAVE to preserve existing dump.rdb")
211+
if _, err = valkeyClient.Do(ctx, "SHUTDOWN", "NOSAVE"); err != nil && !errors.Is(err, io.EOF) {
212+
logger.Error(err, "graceful shutdown failed")
213+
}
214+
} else {
215+
if _, err = valkeyClient.Do(ctx, "SHUTDOWN"); err != nil && !errors.Is(err, io.EOF) {
216+
logger.Error(err, "graceful shutdown failed")
217+
}
208218
}
209219
return nil
210220
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
Copyright 2024 chideat.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package cluster
18+
19+
import (
20+
"context"
21+
"testing"
22+
23+
"github.com/alicebob/miniredis/v2"
24+
"github.com/chideat/valkey-operator/pkg/valkey"
25+
)
26+
27+
// Test_shutdownNosaveDecision verifies the shutdown save-mode decision logic:
28+
// Dbsize == 0 (e.g. failed cross-version fullsync) → SHUTDOWN NOSAVE to preserve dump.rdb.
29+
// Dbsize > 0 → normal SHUTDOWN.
30+
func Test_shutdownNosaveDecision(t *testing.T) {
31+
tests := []struct {
32+
name string
33+
dbsize int64
34+
wantNosave bool
35+
}{
36+
{
37+
name: "empty database - use SHUTDOWN NOSAVE",
38+
dbsize: 0,
39+
wantNosave: true,
40+
},
41+
{
42+
name: "non-empty database - use normal SHUTDOWN",
43+
dbsize: 2,
44+
wantNosave: false,
45+
},
46+
}
47+
48+
for _, tt := range tests {
49+
t.Run(tt.name, func(t *testing.T) {
50+
gotNosave := tt.dbsize == 0
51+
if gotNosave != tt.wantNosave {
52+
t.Errorf("nosave decision for dbsize=%d: got %v, want %v", tt.dbsize, gotNosave, tt.wantNosave)
53+
}
54+
})
55+
}
56+
}
57+
58+
// Test_shutdownInfoEmptyDB verifies that Info() returns Dbsize==0 for an empty miniredis instance,
59+
// which is the scenario triggered by a failed cross-version fullsync.
60+
func Test_shutdownInfoEmptyDB(t *testing.T) {
61+
s := miniredis.RunT(t)
62+
client := valkey.NewValkeyClient(s.Addr(), valkey.AuthInfo{})
63+
defer client.Close()
64+
65+
info, err := client.Info(context.Background())
66+
if err != nil {
67+
t.Fatalf("Info() error: %v", err)
68+
}
69+
if info.Dbsize != 0 {
70+
t.Errorf("expected Dbsize=0 for empty DB, got %d", info.Dbsize)
71+
}
72+
}

cmd/helper/commands/failover/shutdown.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,10 +362,20 @@ func Shutdown(ctx context.Context, c *cli.Context, client *kubernetes.Clientset,
362362
time.Sleep(time.Second * 30)
363363

364364
logger.Info("do shutdown node")
365-
// NOTE: here set timeout to 300s, which will try best to do a shutdown snapshot
366-
// if the data is too large, this snapshot may not be completed
367-
if _, err := valkeyClient.DoWithTimeout(ctx, time.Second*300, "SHUTDOWN"); err != nil && !errors.Is(err, io.EOF) {
368-
logger.Error(err, "graceful shutdown failed")
365+
// Re-check dbsize: if 0, memory may have been cleared by a failed fullsync
366+
// (e.g., cross-version RDB incompatibility). Use NOSAVE to preserve existing dump.rdb.
367+
latestInfo, _ := getValkeyInfo(ctx, valkeyClient, logger)
368+
if latestInfo != nil && latestInfo.Dbsize == 0 {
369+
logger.Info("dbsize is 0, using SHUTDOWN NOSAVE to preserve existing dump.rdb")
370+
if _, err := valkeyClient.DoWithTimeout(ctx, time.Second*300, "SHUTDOWN", "NOSAVE"); err != nil && !errors.Is(err, io.EOF) {
371+
logger.Error(err, "graceful shutdown failed")
372+
}
373+
} else {
374+
// NOTE: here set timeout to 300s, which will try best to do a shutdown snapshot
375+
// if the data is too large, this snapshot may not be completed
376+
if _, err := valkeyClient.DoWithTimeout(ctx, time.Second*300, "SHUTDOWN"); err != nil && !errors.Is(err, io.EOF) {
377+
logger.Error(err, "graceful shutdown failed")
378+
}
369379
}
370380
return err
371381
}

cmd/helper/commands/failover/shutdown_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@ limitations under the License.
1717
package failover
1818

1919
import (
20+
"context"
2021
"os"
2122
"testing"
2223

24+
"github.com/alicebob/miniredis/v2"
25+
"github.com/chideat/valkey-operator/pkg/valkey"
2326
"github.com/go-logr/logr"
2427
)
2528

@@ -76,3 +79,53 @@ replica-announce-port 31095`,
7679
})
7780
}
7881
}
82+
83+
// Test_shutdownNosaveDecision verifies the shutdown save-mode decision logic:
84+
// Dbsize == 0 (e.g. failed cross-version fullsync) → SHUTDOWN NOSAVE to preserve dump.rdb.
85+
// Dbsize > 0 → normal SHUTDOWN.
86+
func Test_shutdownNosaveDecision(t *testing.T) {
87+
tests := []struct {
88+
name string
89+
dbsize int64
90+
wantNosave bool
91+
}{
92+
{
93+
name: "empty database - use SHUTDOWN NOSAVE",
94+
dbsize: 0,
95+
wantNosave: true,
96+
},
97+
{
98+
name: "non-empty database - use normal SHUTDOWN",
99+
dbsize: 2,
100+
wantNosave: false,
101+
},
102+
}
103+
104+
for _, tt := range tests {
105+
t.Run(tt.name, func(t *testing.T) {
106+
gotNosave := tt.dbsize == 0
107+
if gotNosave != tt.wantNosave {
108+
t.Errorf("nosave decision for dbsize=%d: got %v, want %v", tt.dbsize, gotNosave, tt.wantNosave)
109+
}
110+
})
111+
}
112+
}
113+
114+
// Test_shutdownInfoEmptyDB verifies that Info() returns Dbsize==0 for an empty miniredis instance,
115+
// which is the scenario triggered by a failed cross-version fullsync.
116+
func Test_shutdownInfoEmptyDB(t *testing.T) {
117+
s := miniredis.RunT(t)
118+
client := valkey.NewValkeyClient(s.Addr(), valkey.AuthInfo{})
119+
defer client.Close()
120+
121+
info, err := client.Info(context.Background())
122+
if err != nil {
123+
t.Fatalf("Info() error: %v", err)
124+
}
125+
if info.Dbsize != 0 {
126+
t.Errorf("expected Dbsize=0 for empty DB, got %d", info.Dbsize)
127+
}
128+
if info.Dbsize != 0 {
129+
t.Error("expected NOSAVE decision for empty DB")
130+
}
131+
}

0 commit comments

Comments
 (0)