Commit 58c09ee

test: add a case for node recovery in a broken cluster (#388)
1 parent bf83b04 commit 58c09ee

3 files changed: +46 additions, −64 deletions

.github/workflows/test.yaml

Lines changed: 1 addition & 1 deletion

@@ -28,6 +28,7 @@ jobs:
       matrix:
         include:
           - {redis: '7.2', ruby: '3.3'}
+          - {task: test_cluster_broken, restart: 'no', startup: '6'}
           - {redis: '7.2', ruby: '3.3', compose: compose.ssl.yaml}
           - {redis: '7.2', ruby: '3.3', driver: 'hiredis'}
           - {redis: '7.2', ruby: '3.3', driver: 'hiredis', compose: compose.ssl.yaml}
@@ -39,7 +40,6 @@ jobs:
           - {task: test_cluster_state, pattern: 'ScaleReadLatency', compose: compose.valkey.yaml, redis: '8', replica: '2', startup: '9'}
           - {ruby: 'jruby'}
           - {ruby: 'truffleruby'}
-          - {task: test_cluster_broken, restart: 'no', startup: '6'}
           - {task: test_cluster_down}
           - {redis: '8', ruby: '3.3', compose: compose.valkey.yaml, replica: '2'}
           - {redis: '7.2', ruby: '3.2', compose: compose.auth.yaml}

compose.valkey.yaml

Lines changed: 0 additions & 19 deletions

@@ -86,22 +86,3 @@ services:
         condition: service_healthy
       node9:
         condition: service_healthy
-  ruby:
-    image: "ruby:${RUBY_VERSION:-3}"
-    restart: always
-    working_dir: /client
-    volumes:
-      - .:/client
-    command:
-      - ruby
-      - "-e"
-      - 'Signal.trap(:INT, "EXIT"); Signal.trap(:TERM, "EXIT"); loop { sleep 1 }'
-    environment:
-      REDIS_HOST: node1
-    cap_drop:
-      - ALL
-    healthcheck:
-      test: ["CMD", "ruby", "-e", "'puts 1'"]
-      interval: "5s"
-      timeout: "3s"
-      retries: 3

test/test_against_cluster_broken.rb

Lines changed: 45 additions & 44 deletions

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
 
+require 'json'
 require 'testing_helper'
 
 class TestAgainstClusterBroken < TestingWrapper
@@ -37,20 +38,36 @@ def teardown
       "ClusterDownError: #{@cluster_down_error_count} = "
   end
 
-  def test_a_replica_is_down
-    sacrifice = @controller.select_sacrifice_of_replica
-    do_test_a_node_is_down(sacrifice, number_of_keys: NUMBER_OF_KEYS)
+  def test_client_patience
+    prepare_test_data(number_of_keys: NUMBER_OF_KEYS)
+
+    # a replica
+    kill_a_node(@controller.select_sacrifice_of_replica)
+    wait_for_cluster_to_be_ready(wait_attempts: MAX_ATTEMPTS)
+    do_assertions(number_of_keys: NUMBER_OF_KEYS)
+    refute(@captured_commands.count('cluster', 'nodes').zero?, @captured_commands.to_a.map(&:command))
+
+    # a primary
+    kill_a_node(@controller.select_sacrifice_of_primary)
+    wait_for_cluster_to_be_ready(wait_attempts: MAX_ATTEMPTS)
+    do_assertions(number_of_keys: NUMBER_OF_KEYS)
     refute(@captured_commands.count('cluster', 'nodes').zero?, @captured_commands.to_a.map(&:command))
-  end
 
-  def test_a_primary_is_down
-    sacrifice = @controller.select_sacrifice_of_primary
-    do_test_a_node_is_down(sacrifice, number_of_keys: NUMBER_OF_KEYS)
+    # recovery
+    revive_dead_nodes
+    wait_for_cluster_to_be_ready(wait_attempts: MAX_ATTEMPTS)
+    do_assertions(number_of_keys: NUMBER_OF_KEYS)
     refute(@captured_commands.count('cluster', 'nodes').zero?, @captured_commands.to_a.map(&:command))
   end
 
   private
 
+  def prepare_test_data(number_of_keys:)
+    number_of_keys.times { |i| @client.call('SET', "pre-#{i}", i) }
+    number_of_keys.times { |i| @client.pipelined { |pi| pi.call('SET', "pre-pipelined-#{i}", i) } }
+    wait_for_replication
+  end
+
   def wait_for_replication
     client_side_timeout = TEST_TIMEOUT_SEC + 1.0
     server_side_timeout = (TEST_TIMEOUT_SEC * 1000).to_i
@@ -59,61 +76,45 @@ def wait_for_replication
     end
   end
 
-  def do_test_a_node_is_down(sacrifice, number_of_keys:)
-    prepare_test_data(number_of_keys: number_of_keys)
-
-    kill_a_node(sacrifice, kill_attempts: MAX_ATTEMPTS)
-    wait_for_cluster_to_be_ready(wait_attempts: MAX_ATTEMPTS)
-
-    assert_equal('PONG', @client.call('PING'), 'Case: PING')
-    do_assertions_without_pipelining(number_of_keys: number_of_keys)
-    do_assertions_with_pipelining(number_of_keys: number_of_keys)
-  end
+  def wait_for_cluster_to_be_ready(wait_attempts:)
+    loop do
+      raise MaxRetryExceeded if wait_attempts <= 0
 
-  def prepare_test_data(number_of_keys:)
-    number_of_keys.times { |i| @client.call('SET', "pre-#{i}", i) }
-    number_of_keys.times { |i| @client.pipelined { |pi| pi.call('SET', "pre-pipelined-#{i}", i) } }
-    wait_for_replication
+      wait_attempts -= 1
+      break if @client.call('PING') == 'PONG'
+    rescue ::RedisClient::Cluster::NodeMightBeDown
+      @cluster_down_error_count += 1
+    ensure
+      sleep WAIT_SEC
+    end
   end
 
-  def kill_a_node(sacrifice, kill_attempts:)
+  def kill_a_node(sacrifice)
     refute_nil(sacrifice, "#{sacrifice.config.host}:#{sacrifice.config.port}")
 
-    loop do
-      raise MaxRetryExceeded if kill_attempts <= 0
+    `docker compose ps --format json`.lines.map { |line| JSON.parse(line) }.each do |service|
+      published_ports = service.fetch('Publishers').map { |e| e.fetch('PublishedPort') }.uniq
+      next unless published_ports.include?(sacrifice.config.port)
 
-      kill_attempts -= 1
-      sacrifice.call('SHUTDOWN', 'NOSAVE')
-    rescue ::RedisClient::CommandError => e
-      raise unless e.message.include?('Errors trying to SHUTDOWN')
-    rescue ::RedisClient::ConnectionError
+      service_name = service.fetch('Service')
+      system("docker compose --progress quiet pause #{service_name}", exception: true)
       break
-    ensure
-      sleep WAIT_SEC
     end
 
     assert_raises(::RedisClient::ConnectionError) { sacrifice.call('PING') }
   end
 
-  def wait_for_cluster_to_be_ready(wait_attempts:)
-    loop do
-      raise MaxRetryExceeded if wait_attempts <= 0
-
-      wait_attempts -= 1
-      break if @client.call('PING') == 'PONG'
-    rescue ::RedisClient::Cluster::NodeMightBeDown
-      @cluster_down_error_count += 1
-    ensure
-      sleep WAIT_SEC
+  def revive_dead_nodes
+    `docker compose ps --format json --status paused`.lines.map { |line| JSON.parse(line) }.each do |service|
+      service_name = service.fetch('Service')
+      system("docker compose --progress quiet unpause #{service_name}", exception: true)
     end
   end
 
-  def do_assertions_without_pipelining(number_of_keys:)
+  def do_assertions(number_of_keys:)
     number_of_keys.times { |i| assert_equal(i.to_s, @client.call('GET', "pre-#{i}"), "Case: pre-#{i}: GET") }
     number_of_keys.times { |i| assert_equal('OK', @client.call('SET', "post-#{i}", i), "Case: post-#{i}: SET") }
-  end
 
-  def do_assertions_with_pipelining(number_of_keys:)
     want = Array.new(number_of_keys, &:to_s)
     got = @client.pipelined { |pi| number_of_keys.times { |i| pi.call('GET', "pre-pipelined-#{i}") } }
     assert_equal(want, got, 'Case: pre-pipelined: GET')
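
The rewritten helpers no longer send SHUTDOWN to the sacrificed node; instead they match the node's port against the published ports reported by `docker compose ps --format json`, pause the matching Compose service, and later unpause every paused service so the cluster can recover. A minimal standalone sketch of that approach, assuming Docker Compose v2 (one JSON object per line) and a hypothetical target port, might look like this:

# pause_and_revive.rb -- illustrative sketch only, not part of the test suite
require 'json'

TARGET_PORT = 6379 # hypothetical published port of the node to take down

def compose_services(extra_args = '')
  `docker compose ps --format json #{extra_args}`.lines.map { |line| JSON.parse(line) }
end

# Pause the service whose published ports include the target port,
# simulating an unresponsive node.
compose_services.each do |service|
  ports = service.fetch('Publishers').map { |e| e.fetch('PublishedPort') }.uniq
  next unless ports.include?(TARGET_PORT)

  system("docker compose --progress quiet pause #{service.fetch('Service')}", exception: true)
  break
end

# Later, revive every paused service so the cluster can recover.
compose_services('--status paused').each do |service|
  system("docker compose --progress quiet unpause #{service.fetch('Service')}", exception: true)
end

Pausing freezes the container's processes rather than terminating them, so the same node can be revived in place with unpause once the recovery phase of the test begins.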
