Use the WAIT command instead of an arbitrary sleep when waiting out replication delay

supercaracal · supercaracal · commit 7a470e8889b2 · 2018-12-02T15:57:51.000+09:00
diff --git a/lib/redis/cluster.rb b/lib/redis/cluster.rb
@@ -132,6 +132,7 @@ def send_command(command, &block)
         @node.call_all(command, &block).first
       when 'flushall', 'flushdb'
         @node.call_master(command, &block).first
+      when 'wait'     then @node.call_master(command, &block).reduce(:+)
       when 'keys'     then @node.call_slave(command, &block).flatten.sort
       when 'dbsize'   then @node.call_slave(command, &block).reduce(:+)
       when 'lastsave' then @node.call_all(command, &block).sort
diff --git a/test/cluster_client_transactions_test.rb b/test/cluster_client_transactions_test.rb
@@ -40,7 +40,7 @@ def test_transaction_with_replicas
       100.times { |i| cli.set("{key}#{i}", i) }
     end
 
-    sleep 0.5
+    rc1.wait(1, TIMEOUT.to_i * 1000)
 
     100.times { |i| assert_equal i.to_s, rc1.get("{key}#{i}") }
     100.times { |i| assert_equal i.to_s, rc2.get("{key}#{i}") }
diff --git a/test/cluster_commands_on_keys_test.rb b/test/cluster_commands_on_keys_test.rb
@@ -115,7 +115,7 @@ def test_unlink
 
   def test_wait
     set_some_keys
-    assert_equal 1, redis.wait(1, 0)
+    assert_equal 3, redis.wait(1, TIMEOUT.to_i * 1000)
   end
 
   def test_scan
diff --git a/test/helper.rb b/test/helper.rb
@@ -297,7 +297,7 @@ def redis_cluster_mock(commands, options = {})
     end
 
     def redis_cluster_down
-      trib = ClusterOrchestrator.new(_default_nodes)
+      trib = ClusterOrchestrator.new(_default_nodes, timeout: TIMEOUT)
       trib.down
       yield
     ensure
@@ -306,7 +306,7 @@ def redis_cluster_down
     end
 
     def redis_cluster_failover
-      trib = ClusterOrchestrator.new(_default_nodes)
+      trib = ClusterOrchestrator.new(_default_nodes, timeout: TIMEOUT)
       trib.failover
       yield
     ensure
@@ -318,7 +318,7 @@ def redis_cluster_failover
     # @param src [String] <ip>:<port>
     # @param dest [String] <ip>:<port>
     def redis_cluster_resharding(slot, src:, dest:)
-      trib = ClusterOrchestrator.new(_default_nodes)
+      trib = ClusterOrchestrator.new(_default_nodes, timeout: TIMEOUT)
       trib.start_resharding(slot, src, dest)
       yield
       trib.finish_resharding(slot, dest)
diff --git a/test/support/cluster/orchestrator.rb b/test/support/cluster/orchestrator.rb
@@ -5,10 +5,10 @@
 class ClusterOrchestrator
   SLOT_SIZE = 16384
 
-  def initialize(node_addrs)
+  def initialize(node_addrs, timeout: 30.0)
     raise 'Redis Cluster requires at least 3 master nodes.' if node_addrs.size < 3
-    timeout_sec = Float(ENV['TIMEOUT'] || 30.0)
-    @clients = node_addrs.map { |addr| Redis.new(url: addr, timeout: timeout_sec) }
+    @clients = node_addrs.map { |addr| Redis.new(url: addr, timeout: timeout) }
+    @timeout = timeout
   end
 
   def rebuild
@@ -21,6 +21,8 @@ def rebuild
     replicate(@clients)
     save_config(@clients)
     wait_cluster_building(@clients)
+    wait_replication(@clients)
+    wait_cluster_recovering(@clients)
   end
 
   def down
@@ -30,8 +32,11 @@ def down
 
   def failover
     master, slave = take_replication_pairs(@clients)
+    wait_replication_delay(@clients, @timeout)
     slave.cluster(:failover, :takeover)
     wait_failover(to_node_key(master), to_node_key(slave), @clients)
+    wait_replication_delay(@clients, @timeout)
+    wait_cluster_recovering(@clients)
   end
 
   def start_resharding(slot, src_node_key, dest_node_key)
@@ -117,14 +122,12 @@ def meet_each_other(clients)
     end
   end
 
-  def wait_meeting(clients)
-    first_cliient = clients.first
-    size = clients.size
+  def wait_meeting(clients, max_attempts: 600)
+    size = clients.size.to_s
 
-    loop do
-      info = hashify_cluster_info(first_cliient)
-      break if info['cluster_known_nodes'].to_i == size
-      sleep 0.1
+    wait_for_state(clients, max_attempts) do |client|
+      info = hashify_cluster_info(client)
+      info['cluster_known_nodes'] == size
     end
   end
 
@@ -157,27 +160,61 @@ def save_config(clients)
     clients.each { |c| c.cluster(:saveconfig) }
   end
 
-  def wait_cluster_building(clients, max_attempts: 200)
-    attempt_count = 0
+  def wait_cluster_building(clients, max_attempts: 600)
+    wait_for_state(clients, max_attempts) do |client|
+      info = hashify_cluster_info(client)
+      info['cluster_state'] == 'ok'
+    end
+  end
 
-    clients.each do |client|
-      loop do
-        info = hashify_cluster_info(client)
-        attempt_count += 1
-        break if info['cluster_state'] == 'ok' || attempt_count > max_attempts
-        sleep 0.1
-      end
+  def wait_replication(clients, max_attempts: 600)
+    wait_for_state(clients, max_attempts) do |client|
+      flags = hashify_cluster_node_flags(client)
+      flags.values.select { |f| f == 'slave' }.size == 3
+    end
+  end
+
+  def wait_failover(master_key, slave_key, clients, max_attempts: 600)
+    wait_for_state(clients, max_attempts) do |client|
+      flags = hashify_cluster_node_flags(client)
+      flags[master_key] == 'slave' && flags[slave_key] == 'master'
     end
   end
 
-  def wait_failover(master_key, slave_key, clients, max_attempts: 200)
-    attempt_count = 0
+  def wait_replication_delay(clients, timeout_sec)
+    timeout_msec = timeout_sec.to_i * 1000
+    wait_for_state(clients, clients.size + 1) do |client|
+      client.wait(1, timeout_msec) if client.role.first == 'master'
+      true
+    end
+  end
 
+  def wait_cluster_recovering(clients, max_attempts: 600)
+    key = 0
+    wait_for_state(clients, max_attempts) do |client|
+      begin
+        client.get(key) if client.role.first == 'master'
+        true
+      rescue Redis::CommandError => err
+        if err.message.start_with?('CLUSTERDOWN')
+          false
+        elsif err.message.start_with?('MOVED')
+          key += 1
+          false
+        else
+          true
+        end
+      end
+    end
+  end
+
+  def wait_for_state(clients, max_attempts)
+    attempt_count = 1
     clients.each do |client|
-      loop do
-        flags = hashify_cluster_node_flags(client)
+      attempt_count.step(max_attempts) do |i|
+        break if i >= max_attempts
         attempt_count += 1
-        break if (flags[master_key] == 'slave' && flags[slave_key] == 'master') || attempt_count > max_attempts
+        break if yield(client)
         sleep 0.1
       end
     end