Skip to content

Commit dfcf08c

Browse files
Fix handling of circular MOVED redirects in cluster slot mapping (#3899)
* Detect circular MOVED redirects and avoids updating the slot map when the redirected node is already the shard primary. * Addressing PR comment --------- Co-authored-by: petyaslavova <petya.slavova@redis.com>
1 parent 334cb3b commit dfcf08c

File tree

4 files changed

+77
-31
lines changed

4 files changed

+77
-31
lines changed

redis/asyncio/cluster.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1456,27 +1456,30 @@ def _update_moved_slots(self) -> None:
14561456
e.host, e.port, PRIMARY, **self.connection_kwargs
14571457
)
14581458
self.set_nodes(self.nodes_cache, {redirected_node.name: redirected_node})
1459-
if redirected_node in self.slots_cache[e.slot_id]:
1459+
slot_nodes = self.slots_cache[e.slot_id]
1460+
if redirected_node not in slot_nodes:
1461+
# The new slot owner is a new server, or a server from a different
1462+
# shard. We need to remove all current nodes from the slot's list
1463+
# (including replications) and add just the new node.
1464+
self.slots_cache[e.slot_id] = [redirected_node]
1465+
elif redirected_node is not slot_nodes[0]:
14601466
# The MOVED error resulted from a failover, and the new slot owner
14611467
# had previously been a replica.
1462-
old_primary = self.slots_cache[e.slot_id][0]
1468+
old_primary = slot_nodes[0]
14631469
# Update the old primary to be a replica and add it to the end of
14641470
# the slot's node list
14651471
old_primary.server_type = REPLICA
1466-
self.slots_cache[e.slot_id].append(old_primary)
1472+
slot_nodes.append(old_primary)
14671473
# Remove the old replica, which is now a primary, from the slot's
14681474
# node list
1469-
self.slots_cache[e.slot_id].remove(redirected_node)
1475+
slot_nodes.remove(redirected_node)
14701476
# Override the old primary with the new one
1471-
self.slots_cache[e.slot_id][0] = redirected_node
1477+
slot_nodes[0] = redirected_node
14721478
if self.default_node == old_primary:
14731479
# Update the default node with the new primary
14741480
self.default_node = redirected_node
1475-
else:
1476-
# The new slot owner is a new server, or a server from a different
1477-
# shard. We need to remove all current nodes from the slot's list
1478-
# (including replications) and add just the new node.
1479-
self.slots_cache[e.slot_id] = [redirected_node]
1481+
# else: circular MOVED to current primary -> no-op
1482+
14801483
# Reset moved_exception
14811484
self._moved_exception = None
14821485

redis/cluster.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1693,27 +1693,30 @@ def _update_moved_slots(self):
16931693
# This is a new node, we will add it to the nodes cache
16941694
redirected_node = ClusterNode(e.host, e.port, PRIMARY)
16951695
self.nodes_cache[redirected_node.name] = redirected_node
1696-
if redirected_node in self.slots_cache[e.slot_id]:
1696+
slot_nodes = self.slots_cache[e.slot_id]
1697+
if redirected_node not in slot_nodes:
1698+
# The new slot owner is a new server, or a server from a different
1699+
# shard. We need to remove all current nodes from the slot's list
1700+
# (including replications) and add just the new node.
1701+
self.slots_cache[e.slot_id] = [redirected_node]
1702+
elif redirected_node is not slot_nodes[0]:
16971703
# The MOVED error resulted from a failover, and the new slot owner
16981704
# had previously been a replica.
1699-
old_primary = self.slots_cache[e.slot_id][0]
1705+
old_primary = slot_nodes[0]
17001706
# Update the old primary to be a replica and add it to the end of
17011707
# the slot's node list
17021708
old_primary.server_type = REPLICA
1703-
self.slots_cache[e.slot_id].append(old_primary)
1709+
slot_nodes.append(old_primary)
17041710
# Remove the old replica, which is now a primary, from the slot's
17051711
# node list
1706-
self.slots_cache[e.slot_id].remove(redirected_node)
1712+
slot_nodes.remove(redirected_node)
17071713
# Override the old primary with the new one
1708-
self.slots_cache[e.slot_id][0] = redirected_node
1714+
slot_nodes[0] = redirected_node
17091715
if self.default_node == old_primary:
17101716
# Update the default node with the new primary
17111717
self.default_node = redirected_node
1712-
else:
1713-
# The new slot owner is a new server, or a server from a different
1714-
# shard. We need to remove all current nodes from the slot's list
1715-
# (including replications) and add just the new node.
1716-
self.slots_cache[e.slot_id] = [redirected_node]
1718+
# else: circular MOVED to current primary -> no-op
1719+
17171720
# Reset moved_exception
17181721
self._moved_exception = None
17191722

tests/test_asyncio/test_cluster.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,19 +233,25 @@ def mock_all_nodes_resp(rc: RedisCluster, response: Any) -> RedisCluster:
233233

234234

235235
async def moved_redirection_helper(
236-
create_redis: Callable[..., RedisCluster], failover: bool = False
236+
create_redis: Callable[..., RedisCluster],
237+
failover: bool = False,
238+
circular_moved=False,
237239
) -> None:
238240
"""
239-
Test that the client handles MOVED response after a failover.
240-
Redirection after a failover means that the redirection address is of a
241-
replica that was promoted to a primary.
241+
Test that the client correctly handles MOVED responses in the following scenarios:
242+
1. Slot migration to a different shard (failover=False, circular_moved=False) —
243+
a standard slot move between shards.
244+
2. Failover event (failover=True, circular_moved=False) —
245+
the redirect target is a replica that has just been promoted to primary.
246+
3. Circular MOVED (failover=False, circular_moved=True) —
247+
the redirect points to a node already known to be the primary of its shard.
242248
243249
At first call it should return a MOVED ResponseError that will point
244250
the client to the next server it should talk to.
245251
246252
Verify that:
247253
1. it tries to talk to the redirected node
248-
2. it updates the slot's primary to the redirected node
254+
2. it updates the slot's primary to the redirected node, if required
249255
250256
For a failover, also verify:
251257
3. the redirected node's server type updated to 'primary'
@@ -261,6 +267,8 @@ async def moved_redirection_helper(
261267
warnings.warn("Skipping this test since it requires to have a replica")
262268
return
263269
redirect_node = rc.nodes_manager.slots_cache[slot][1]
270+
elif circular_moved:
271+
redirect_node = prev_primary
264272
else:
265273
# Use one of the primaries to be the redirected node
266274
redirect_node = rc.get_primaries()[0]
@@ -287,6 +295,10 @@ def ok_response(self, *args, **options):
287295
if failover:
288296
assert rc.get_node(host=r_host, port=r_port).server_type == PRIMARY
289297
assert prev_primary.server_type == REPLICA
298+
elif circular_moved:
299+
fetched_node = rc.get_node(host=r_host, port=r_port)
300+
assert fetched_node == prev_primary
301+
assert fetched_node.server_type == PRIMARY
290302

291303

292304
class TestRedisClusterObj:
@@ -613,6 +625,17 @@ async def test_moved_redirection_after_failover(
613625
"""
614626
await moved_redirection_helper(create_redis, failover=True)
615627

628+
async def test_moved_redirection_circular_moved(
629+
self, create_redis: Callable[..., RedisCluster]
630+
) -> None:
631+
"""
632+
Verify that the client does not update its slot map when receiving a circular MOVED response
633+
(i.e., a MOVED redirect pointing back to the same node), and retries again the same node.
634+
"""
635+
await moved_redirection_helper(
636+
create_redis, failover=False, circular_moved=True
637+
)
638+
616639
async def test_refresh_using_specific_nodes(
617640
self, create_redis: Callable[..., RedisCluster]
618641
) -> None:

tests/test_cluster.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -273,18 +273,22 @@ def find_node_ip_based_on_port(cluster_client, port):
273273
return node.host
274274

275275

276-
def moved_redirection_helper(request, failover=False):
276+
def moved_redirection_helper(request, failover=False, circular_moved=False):
277277
"""
278-
Test that the client handles MOVED response after a failover.
279-
Redirection after a failover means that the redirection address is of a
280-
replica that was promoted to a primary.
278+
Test that the client correctly handles MOVED responses in the following scenarios:
279+
1. Slot migration to a different shard (failover=False, circular_moved=False) —
280+
a standard slot move between shards.
281+
2. Failover event (failover=True, circular_moved=False) —
282+
the redirect target is a replica that has just been promoted to primary.
283+
3. Circular MOVED (failover=False, circular_moved=True) —
284+
the redirect points to a node already known to be the primary of its shard.
281285
282286
At first call it should return a MOVED ResponseError that will point
283287
the client to the next server it should talk to.
284288
285289
Verify that:
286290
1. it tries to talk to the redirected node
287-
2. it updates the slot's primary to the redirected node
291+
2. it updates the slot's primary to the redirected node, if required
288292
289293
For a failover, also verify:
290294
3. the redirected node's server type updated to 'primary'
@@ -300,8 +304,10 @@ def moved_redirection_helper(request, failover=False):
300304
warnings.warn("Skipping this test since it requires to have a replica")
301305
return
302306
redirect_node = rc.nodes_manager.slots_cache[slot][1]
307+
elif circular_moved:
308+
redirect_node = prev_primary
303309
else:
304-
# Use one of the primaries to be the redirected node
310+
# Use one of the other primaries to be the redirected node
305311
redirect_node = rc.get_primaries()[0]
306312
r_host = redirect_node.host
307313
r_port = redirect_node.port
@@ -324,6 +330,10 @@ def ok_response(connection, *args, **options):
324330
if failover:
325331
assert rc.get_node(host=r_host, port=r_port).server_type == PRIMARY
326332
assert prev_primary.server_type == REPLICA
333+
elif circular_moved:
334+
fetched_node = rc.get_node(host=r_host, port=r_port)
335+
assert fetched_node == prev_primary
336+
assert fetched_node.server_type == PRIMARY
327337

328338

329339
@pytest.mark.onlycluster
@@ -547,6 +557,13 @@ def test_moved_redirection_after_failover(self, request):
547557
"""
548558
moved_redirection_helper(request, failover=True)
549559

560+
def test_moved_redirection_circular_moved(self, request):
561+
"""
562+
Verify that the client does not update its slot map when receiving a circular MOVED response
563+
(i.e., a MOVED redirect pointing back to the same node), and retries again the same node.
564+
"""
565+
moved_redirection_helper(request, failover=False, circular_moved=True)
566+
550567
def test_refresh_using_specific_nodes(self, request):
551568
"""
552569
Test making calls on specific nodes when the cluster has failed over to

0 commit comments

Comments
 (0)