Skip to content

Commit 09484ea

Browse files
author
KJ Tsanaktsidis
authored
Fix underlying RedisClient circuit breakers never firing (#309)
1 parent 38311a8 commit 09484ea

18 files changed

+547
-489
lines changed

lib/redis_client/cluster/node.rb

Lines changed: 160 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ class Node
2222
SLOT_SIZE = 16_384
2323
MIN_SLOT = 0
2424
MAX_SLOT = SLOT_SIZE - 1
25-
IGNORE_GENERIC_CONFIG_KEYS = %i[url host port path].freeze
2625
DEAD_FLAGS = %w[fail? fail handshake noaddr noflags].freeze
2726
ROLE_FLAGS = %w[master slave].freeze
2827
EMPTY_ARRAY = [].freeze
28+
EMPTY_HASH = {}.freeze
2929

3030
ReloadNeeded = Class.new(::RedisClient::Error)
3131

@@ -92,119 +92,19 @@ def build_connection_prelude
9292
end
9393
end
9494

95-
class << self
96-
def load_info(options, concurrent_worker, slow_command_timeout: -1, **kwargs) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
97-
raise ::RedisClient::Cluster::InitialSetupError, [] if options.nil? || options.empty?
98-
99-
startup_size = options.size > MAX_STARTUP_SAMPLE ? MAX_STARTUP_SAMPLE : options.size
100-
startup_options = options.to_a.sample(startup_size).to_h
101-
startup_nodes = ::RedisClient::Cluster::Node.new(startup_options, concurrent_worker, **kwargs)
102-
work_group = concurrent_worker.new_group(size: startup_size)
103-
104-
startup_nodes.each_with_index do |raw_client, i|
105-
work_group.push(i, raw_client) do |client|
106-
regular_timeout = client.read_timeout
107-
client.read_timeout = slow_command_timeout > 0.0 ? slow_command_timeout : regular_timeout
108-
reply = client.call('CLUSTER', 'NODES')
109-
client.read_timeout = regular_timeout
110-
parse_cluster_node_reply(reply)
111-
rescue StandardError => e
112-
e
113-
ensure
114-
client&.close
115-
end
116-
end
117-
118-
node_info_list = errors = nil
119-
120-
work_group.each do |i, v|
121-
case v
122-
when StandardError
123-
errors ||= Array.new(startup_size)
124-
errors[i] = v
125-
else
126-
node_info_list ||= Array.new(startup_size)
127-
node_info_list[i] = v
128-
end
129-
end
130-
131-
work_group.close
132-
133-
raise ::RedisClient::Cluster::InitialSetupError, errors if node_info_list.nil?
134-
135-
grouped = node_info_list.compact.group_by do |info_list|
136-
info_list.sort_by!(&:id)
137-
info_list.each_with_object(String.new(capacity: 128 * info_list.size)) do |e, a|
138-
a << e.id << e.node_key << e.role << e.primary_id << e.config_epoch
139-
end
140-
end
141-
142-
grouped.max_by { |_, v| v.size }[1].first.freeze
143-
end
144-
145-
private
146-
147-
# @see https://redis.io/commands/cluster-nodes/
148-
# @see https://github.com/redis/redis/blob/78960ad57b8a5e6af743d789ed8fd767e37d42b8/src/cluster.c#L4660-L4683
149-
def parse_cluster_node_reply(reply) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
150-
reply.each_line("\n", chomp: true).filter_map do |line|
151-
fields = line.split
152-
flags = fields[2].split(',')
153-
next unless fields[7] == 'connected' && (flags & DEAD_FLAGS).empty?
154-
155-
slots = if fields[8].nil?
156-
EMPTY_ARRAY
157-
else
158-
fields[8..].reject { |str| str.start_with?('[') }
159-
.map { |str| str.split('-').map { |s| Integer(s) } }
160-
.map { |a| a.size == 1 ? a << a.first : a }
161-
.map(&:sort)
162-
end
163-
164-
::RedisClient::Cluster::Node::Info.new(
165-
id: fields[0],
166-
node_key: parse_node_key(fields[1]),
167-
role: (flags & ROLE_FLAGS).first,
168-
primary_id: fields[3],
169-
ping_sent: fields[4],
170-
pong_recv: fields[5],
171-
config_epoch: fields[6],
172-
link_state: fields[7],
173-
slots: slots
174-
)
175-
end
176-
end
177-
178-
# As redirection node_key is dependent on `cluster-preferred-endpoint-type` config,
179-
# node_key should use hostname if present in CLUSTER NODES output.
180-
#
181-
# See https://redis.io/commands/cluster-nodes/ for details on the output format.
182-
# node_address matches fhe format: <ip:port@cport[,hostname[,auxiliary_field=value]*]>
183-
def parse_node_key(node_address)
184-
ip_chunk, hostname, _auxiliaries = node_address.split(',')
185-
ip_port_string = ip_chunk.split('@').first
186-
return ip_port_string if hostname.nil? || hostname.empty?
187-
188-
port = ip_port_string.split(':')[1]
189-
"#{hostname}:#{port}"
190-
end
191-
end
192-
19395
def initialize(
194-
options,
19596
concurrent_worker,
196-
node_info_list: [],
197-
with_replica: false,
198-
replica_affinity: :random,
97+
config:,
19998
pool: nil,
20099
**kwargs
201100
)
202101

203102
@concurrent_worker = concurrent_worker
204-
@slots = build_slot_node_mappings(node_info_list)
205-
@replications = build_replication_mappings(node_info_list)
206-
klass = make_topology_class(with_replica, replica_affinity)
207-
@topology = klass.new(@replications, options, pool, @concurrent_worker, **kwargs)
103+
@slots = build_slot_node_mappings(EMPTY_ARRAY)
104+
@replications = build_replication_mappings(EMPTY_ARRAY)
105+
klass = make_topology_class(config.use_replica?, config.replica_affinity)
106+
@topology = klass.new(pool, @concurrent_worker, **kwargs)
107+
@config = config
208108
@mutex = Mutex.new
209109
end
210110

@@ -255,6 +155,14 @@ def clients_for_scanning(seed: nil)
255155
@topology.clients_for_scanning(seed: seed).values.sort_by { |c| "#{c.config.host}-#{c.config.port}" }
256156
end
257157

158+
def clients
159+
@topology.clients.values
160+
end
161+
162+
def primary_clients
163+
@topology.primary_clients.values
164+
end
165+
258166
def replica_clients
259167
@topology.replica_clients.values
260168
end
@@ -292,6 +200,20 @@ def update_slot(slot, node_key)
292200
end
293201
end
294202

203+
def reload!
204+
with_reload_lock do
205+
with_startup_clients(MAX_STARTUP_SAMPLE) do |startup_clients|
206+
@node_info = refetch_node_info_list(startup_clients)
207+
@node_configs = @node_info.to_h do |node_info|
208+
[node_info.node_key, @config.client_config_for_node(node_info.node_key)]
209+
end
210+
@slots = build_slot_node_mappings(@node_info)
211+
@replications = build_replication_mappings(@node_info)
212+
@topology.process_topology_update!(@replications, @node_configs)
213+
end
214+
end
215+
end
216+
295217
private
296218

297219
def make_topology_class(with_replica, replica_affinity)
@@ -378,6 +300,137 @@ def try_map(clients, &block) # rubocop:disable Metrics/AbcSize, Metrics/Cyclomat
378300

379301
[results, errors]
380302
end
303+
304+
def refetch_node_info_list(startup_clients) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
305+
startup_size = startup_clients.size
306+
work_group = @concurrent_worker.new_group(size: startup_size)
307+
308+
startup_clients.each_with_index do |raw_client, i|
309+
work_group.push(i, raw_client) do |client|
310+
regular_timeout = client.read_timeout
311+
client.read_timeout = @config.slow_command_timeout > 0.0 ? @config.slow_command_timeout : regular_timeout
312+
reply = client.call('CLUSTER', 'NODES')
313+
client.read_timeout = regular_timeout
314+
parse_cluster_node_reply(reply)
315+
rescue StandardError => e
316+
e
317+
ensure
318+
client&.close
319+
end
320+
end
321+
322+
node_info_list = errors = nil
323+
324+
work_group.each do |i, v|
325+
case v
326+
when StandardError
327+
errors ||= Array.new(startup_size)
328+
errors[i] = v
329+
else
330+
node_info_list ||= Array.new(startup_size)
331+
node_info_list[i] = v
332+
end
333+
end
334+
335+
work_group.close
336+
337+
raise ::RedisClient::Cluster::InitialSetupError, errors if node_info_list.nil?
338+
339+
grouped = node_info_list.compact.group_by do |info_list|
340+
info_list.sort_by!(&:id)
341+
info_list.each_with_object(String.new(capacity: 128 * info_list.size)) do |e, a|
342+
a << e.id << e.node_key << e.role << e.primary_id << e.config_epoch
343+
end
344+
end
345+
346+
grouped.max_by { |_, v| v.size }[1].first
347+
end
348+
349+
def parse_cluster_node_reply(reply) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
350+
reply.each_line("\n", chomp: true).filter_map do |line|
351+
fields = line.split
352+
flags = fields[2].split(',')
353+
next unless fields[7] == 'connected' && (flags & DEAD_FLAGS).empty?
354+
355+
slots = if fields[8].nil?
356+
EMPTY_ARRAY
357+
else
358+
fields[8..].reject { |str| str.start_with?('[') }
359+
.map { |str| str.split('-').map { |s| Integer(s) } }
360+
.map { |a| a.size == 1 ? a << a.first : a }
361+
.map(&:sort)
362+
end
363+
364+
::RedisClient::Cluster::Node::Info.new(
365+
id: fields[0],
366+
node_key: parse_node_key(fields[1]),
367+
role: (flags & ROLE_FLAGS).first,
368+
primary_id: fields[3],
369+
ping_sent: fields[4],
370+
pong_recv: fields[5],
371+
config_epoch: fields[6],
372+
link_state: fields[7],
373+
slots: slots
374+
)
375+
end
376+
end
377+
378+
# As redirection node_key is dependent on `cluster-preferred-endpoint-type` config,
379+
# node_key should use hostname if present in CLUSTER NODES output.
380+
#
381+
# See https://redis.io/commands/cluster-nodes/ for details on the output format.
382+
# node_address matches fhe format: <ip:port@cport[,hostname[,auxiliary_field=value]*]>
383+
def parse_node_key(node_address)
384+
ip_chunk, hostname, _auxiliaries = node_address.split(',')
385+
ip_port_string = ip_chunk.split('@').first
386+
return ip_port_string if hostname.nil? || hostname.empty?
387+
388+
port = ip_port_string.split(':')[1]
389+
"#{hostname}:#{port}"
390+
end
391+
392+
def with_startup_clients(count) # rubocop:disable Metrics/AbcSize
393+
if @config.connect_with_original_config
394+
# If connect_with_original_config is set, that means we need to build actual client objects
395+
# and close them, so that we e.g. re-resolve a DNS entry with the cluster nodes in it.
396+
begin
397+
# Memoize the startup clients, so we maintain RedisClient's internal circuit breaker configuration
398+
# if it's set.
399+
@startup_clients ||= @config.startup_nodes.values.sample(count).map do |node_config|
400+
::RedisClient::Cluster::Node::Config.new(**node_config).new_client
401+
end
402+
yield @startup_clients
403+
ensure
404+
# Close the startup clients when we're done, so we don't maintain pointless open connections to
405+
# the cluster though
406+
@startup_clients&.each(&:close)
407+
end
408+
else
409+
# (re-)connect using nodes we already know about.
410+
# If this is the first time we're connecting to the cluster, we need to seed the topology with the
411+
# startup clients though.
412+
@topology.process_topology_update!({}, @config.startup_nodes) if @topology.clients.empty?
413+
yield @topology.clients.values.sample(count)
414+
end
415+
end
416+
417+
def with_reload_lock
418+
# What should happen with concurrent calls #reload? This is a realistic possibility if the cluster goes into
419+
# a CLUSTERDOWN state, and we're using a pooled backend. Every thread will independently discover this, and
420+
# call reload!.
421+
# For now, if a reload is in progress, wait for that to complete, and consider that the same as us having
422+
# performed the reload.
423+
# Probably in the future we should add a circuit breaker to #reload itself, and stop trying if the cluster is
424+
# obviously not working.
425+
wait_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
426+
@mutex.synchronize do
427+
return if @last_reloaded_at && @last_reloaded_at > wait_start
428+
429+
r = yield
430+
@last_reloaded_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
431+
r
432+
end
433+
end
381434
end
382435
end
383436
end
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# frozen_string_literal: true
2+
3+
class RedisClient
4+
class Cluster
5+
class Node
6+
class BaseTopology
7+
IGNORE_GENERIC_CONFIG_KEYS = %i[url host port path].freeze
8+
attr_reader :clients, :primary_clients, :replica_clients
9+
10+
def initialize(pool, concurrent_worker, **kwargs)
11+
@pool = pool
12+
@clients = {}
13+
@client_options = kwargs.reject { |k, _| IGNORE_GENERIC_CONFIG_KEYS.include?(k) }
14+
@concurrent_worker = concurrent_worker
15+
@replications = EMPTY_HASH
16+
@primary_node_keys = EMPTY_ARRAY
17+
@replica_node_keys = EMPTY_ARRAY
18+
@primary_clients = EMPTY_ARRAY
19+
@replica_clients = EMPTY_ARRAY
20+
end
21+
22+
def any_primary_node_key(seed: nil)
23+
random = seed.nil? ? Random : Random.new(seed)
24+
@primary_node_keys.sample(random: random)
25+
end
26+
27+
def process_topology_update!(replications, options) # rubocop:disable Metrics/AbcSize
28+
@replications = replications.freeze
29+
@primary_node_keys = @replications.keys.sort.select { |k| options.key?(k) }.freeze
30+
@replica_node_keys = @replications.values.flatten.sort.select { |k| options.key?(k) }.freeze
31+
32+
# Disconnect from nodes that we no longer want, and connect to nodes we're not connected to yet
33+
disconnect_from_unwanted_nodes(options)
34+
connect_to_new_nodes(options)
35+
36+
@primary_clients, @replica_clients = @clients.partition { |k, _| @primary_node_keys.include?(k) }.map(&:to_h)
37+
@primary_clients.freeze
38+
@replica_clients.freeze
39+
end
40+
41+
private
42+
43+
def disconnect_from_unwanted_nodes(options)
44+
(@clients.keys - options.keys).each do |node_key|
45+
@clients.delete(node_key).close
46+
end
47+
end
48+
49+
def connect_to_new_nodes(options)
50+
(options.keys - @clients.keys).each do |node_key|
51+
option = options[node_key].merge(@client_options)
52+
config = ::RedisClient::Cluster::Node::Config.new(scale_read: !@primary_node_keys.include?(node_key), **option)
53+
client = @pool.nil? ? config.new_client : config.new_pool(**@pool)
54+
@clients[node_key] = client
55+
end
56+
end
57+
end
58+
end
59+
end
60+
end

0 commit comments

Comments
 (0)