Skip to content

Commit bf03b0c

Browse files
gmbnomiszuiderkwast
authored andcommitted
Sentinel: fix regression requiring "+failover" ACL in failover path (#2780)
Since Valkey Sentinel 9.0, sentinel tries to abort an ongoing failover when changing the role of a monitored instance. Since the result of the command is ignored, the "FAILOVER ABORT" command is sent irrespective of the actual failover status. However, when using the documented pre 9.0 ACLs for a dedicated sentinel user, the FAILOVER command is not allowed and _all_ failover cases fail. (Additionally, the necessary ACL adaptation was not communicated well.) Address this by: - Updating the documentation in "sentinel.conf" to reflect the need for an adapted ACL - Only abort a failover when sentinel detected an ongoing (probably stuck) failover. This means that standard failover and manual failover continue to work with unchanged pre 9.0 ACLs. Only the new "SENTINEL FAILOVER COORDINATED" requires to adapt the ACL on all Valkey nodes. - Actually use a dedicated sentinel user and ACLs when testing standard failover, manual failover, and manual coordinated failover. Fixes #2779 Signed-off-by: Simon Baatz <[email protected]>
1 parent 1ca63e1 commit bf03b0c

File tree

7 files changed

+83
-27
lines changed

7 files changed

+83
-27
lines changed

sentinel.conf

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,11 @@ sentinel monitor mymaster 127.0.0.1 6379 2
119119
# In the Valkey servers side, the ACL to provide just minimal access to
120120
# Sentinel instances, should be configured along the following lines:
121121
#
122-
# user sentinel-user >somepassword +client +subscribe +publish \
122+
# user sentinel-user >somepassword +subscribe +publish +failover +script|kill \
123123
# +ping +info +multi +slaveof +config +client +exec &__sentinel__:hello on
124+
#
125+
# Since Valkey Sentinel 9.0, the sentinel user requires the +failover permission
126+
# on all monitored Valkey instances for proper operation.
124127

125128
# sentinel down-after-milliseconds <master-name> <milliseconds>
126129
#

src/sentinel.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4856,7 +4856,7 @@ int sentinelSendReplicaOf(sentinelValkeyInstance *ri, const sentinelAddr *addr)
48564856
if (retval == C_ERR) return retval;
48574857
ri->link->pending_commands++;
48584858

4859-
if (ri->monitored_instance_failover_state != SENTINEL_MONITORED_INSTANCE_FAILOVER_NS) {
4859+
if (ri->monitored_instance_failover_state == SENTINEL_MONITORED_INSTANCE_FAILOVER) {
48604860
retval = valkeyAsyncCommand(ri->link->cc, sentinelDiscardReplyCallback, ri, "%s ABORT",
48614861
sentinelInstanceMapCommand(ri, "FAILOVER"));
48624862
if (retval == C_ERR) return retval;

tests/sentinel/tests/00-base.tcl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ foreach_sentinel_id id {
66
S $id sentinel debug default-down-after 1000
77
}
88

9+
set ::user "sentinel-user"
10+
set ::password "sentinel-password"
11+
912
if {$::simulate_error} {
1013
test "This test will fail" {
1114
fail "Simulated error"
@@ -76,6 +79,10 @@ test "SENTINEL SIMULATE-FAILURE HELP list supported flags" {
7679
}
7780

7881
test "Basic failover works if the primary is down" {
82+
# Explicitly forbid the FAILOVER command to ensure backward compatibility with
83+
# ACLs that were documented for Valkey < 9.0
84+
configure_sentinel_user_acl $::user $::password 0
85+
7986
set old_port [RPort $master_id]
8087
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
8188
assert {[lindex $addr 1] == $old_port}
@@ -116,6 +123,7 @@ test "The old primary eventually gets reconfigured as a slave" {
116123
} else {
117124
fail "Old master not reconfigured as slave of new master"
118125
}
126+
reset_sentinel_user_acl $::user
119127
}
120128

121129
test "ODOWN is not possible without N (quorum) Sentinels reports" {

tests/sentinel/tests/03-runtime-reconf.tcl

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -38,31 +38,6 @@ proc server_reset_acl {id} {
3838
assert_equal {OK} [R $id CONFIG SET primaryauth ""]
3939
}
4040

41-
proc verify_sentinel_connect_replicas {id} {
42-
foreach replica [S $id SENTINEL REPLICAS mymaster] {
43-
if {[string match "*disconnected*" [dict get $replica flags]]} {
44-
return 0
45-
}
46-
}
47-
return 1
48-
}
49-
50-
proc wait_for_sentinels_connect_servers { {is_connect 1} } {
51-
foreach_sentinel_id id {
52-
wait_for_condition 1000 50 {
53-
[string match "*disconnected*" [dict get [S $id SENTINEL PRIMARY mymaster] flags]] != $is_connect
54-
} else {
55-
fail "At least some sentinel can't connect to master"
56-
}
57-
58-
wait_for_condition 1000 50 {
59-
[verify_sentinel_connect_replicas $id] == $is_connect
60-
} else {
61-
fail "At least some sentinel can't connect to replica"
62-
}
63-
}
64-
}
65-
6641
test "Sentinels (re)connection following SENTINEL SET myprimary auth-pass" {
6742
# 3 types of sentinels to test:
6843
# (re)started while primary changed pwd. Manage to connect only after setting pwd

tests/sentinel/tests/05-manual.tcl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@ foreach_sentinel_id id {
88
S $id sentinel debug publish-period 1000
99
}
1010

11+
set ::user "sentinel-user"
12+
set ::password "sentinel-password"
13+
14+
1115
test "Manual failover works" {
16+
configure_sentinel_user_acl $::user $::password
17+
1218
set old_port [RPort $master_id]
1319
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
1420
assert {[lindex $addr 1] == $old_port}
@@ -60,6 +66,7 @@ test "The old primary eventually gets reconfigured as a slave" {
6066
} else {
6167
fail "Old master not reconfigured as slave of new master"
6268
}
69+
reset_sentinel_user_acl $::user
6370
}
6471

6572
foreach flag {crash-after-election crash-after-promotion} {

tests/sentinel/tests/17-manual-coordinated.tcl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22

33
source "../tests/includes/init-tests.tcl"
44

5+
set ::user "sentinel-user"
6+
set ::password "sentinel-password"
7+
58
foreach_sentinel_id id {
69
S $id sentinel debug info-period 2000
710
S $id sentinel debug publish-period 1000
811
}
912

1013
test "Manual coordinated failover works" {
14+
configure_sentinel_user_acl $::user $::password
15+
1116
set old_port [RPort $master_id]
1217
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
1318
assert {[lindex $addr 1] == $old_port}
@@ -133,6 +138,7 @@ test "No change after failed failover: All sentinels agree on primary" {
133138
foreach_sentinel_id id {
134139
assert {[lindex [S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] 1] == [lindex $addr 1]}
135140
}
141+
reset_sentinel_user_acl $::user
136142
}
137143

138144
foreach flag {crash-after-election crash-after-promotion} {

tests/sentinel/tests/includes/utils.tcl

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,60 @@ proc verify_sentinel_auto_discovery {} {
3434
}
3535
}
3636
}
37+
38+
proc verify_sentinel_connect_replicas {id} {
39+
foreach replica [S $id SENTINEL REPLICAS mymaster] {
40+
if {[string match "*disconnected*" [dict get $replica flags]]} {
41+
return 0
42+
}
43+
}
44+
return 1
45+
}
46+
47+
proc wait_for_sentinels_connect_servers { {is_connect 1} } {
48+
foreach_sentinel_id id {
49+
wait_for_condition 1000 50 {
50+
[string match "*disconnected*" [dict get [S $id SENTINEL PRIMARY mymaster] flags]] != $is_connect
51+
} else {
52+
fail "At least some sentinel can't connect to master"
53+
}
54+
55+
wait_for_condition 1000 50 {
56+
[verify_sentinel_connect_replicas $id] == $is_connect
57+
} else {
58+
fail "At least some sentinel can't connect to replica"
59+
}
60+
}
61+
}
62+
63+
proc configure_sentinel_user_acl {user password {allow_failover 1}} {
64+
foreach_valkey_id id {
65+
R $id ACL SETUSER $user >$password +subscribe +publish [expr {$allow_failover ? "+failover" : "-failover"}] +script|kill +ping +info +multi +slaveof +config +client +exec &__sentinel__:hello ON
66+
# Ensure default user cannot be used for failover
67+
R $id ACL SETUSER default -failover -slaveof
68+
}
69+
foreach_sentinel_id id {
70+
S $id SENTINEL SET mymaster auth-user $user
71+
S $id SENTINEL SET mymaster auth-pass $password
72+
S $id SENTINEL FLUSHCONFIG
73+
}
74+
foreach_valkey_id id {
75+
R $id CLIENT KILL USER default SKIPME yes
76+
}
77+
wait_for_sentinels_connect_servers
78+
}
79+
80+
proc reset_sentinel_user_acl {user} {
81+
foreach_sentinel_id id {
82+
S $id SENTINEL SET mymaster auth-user ""
83+
S $id SENTINEL SET mymaster auth-pass ""
84+
S $id SENTINEL FLUSHCONFIG
85+
}
86+
foreach_valkey_id id {
87+
R $id ACL SETUSER default +failover +slaveof
88+
R $id ACL DELUSER $user
89+
R $id CONFIG REWRITE
90+
}
91+
wait_for_sentinels_connect_servers
92+
}
93+

0 commit comments

Comments
 (0)