test

dumbbell · dumbbell · commit f95aa4bd4774 · 2025-09-30T10:32:56.000+02:00
diff --git a/deps/rabbit/test/clustering_recovery_SUITE.erl b/deps/rabbit/test/clustering_recovery_SUITE.erl
@@ -861,30 +861,7 @@ temporary_queue_after_partition_recovery_2(Config, QueueDeclare) ->
                         Conn1, Ch1)
               end),
 
-    KhepriTimeout = rabbit_ct_broker_helpers:rpc(Config, Node2, khepri_app, get_default_timeout, []),
-    ct:pal("Sleep > ~b ms", [KhepriTimeout]),
-    timer:sleep(KhepriTimeout + 10000),
-
-    %% Close the second consuming client to trigger the queue deletion during
-    %% the network partition. This time, the partition is solved while the
-    %% queue process tries to delete the record.
-    ct:pal("Close connection 2"),
-    _ = spawn(fun() ->
-                      rabbit_ct_client_helpers:close_connection_and_channel(
-                        Conn2, Ch2)
-              end),
-
-    %% We resolve the network partition.
-    lists:foreach(
-      fun(Node) ->
-              ct:pal("Allow traffic with ~s", [Node]),
-              rabbit_ct_broker_helpers:allow_traffic_between(
-                Node2, Node)
-      end, Majority),
-    ct:pal("Cluster status"),
-    clustering_utils:assert_cluster_status({Nodes, Nodes}, Nodes),
-
-    ct:pal("Wait for connection DOWN"),
+    ct:pal("Wait for connection 1 DOWN"),
     receive
         {'DOWN', CMRef1, _, _, Reason1_1} ->
             ct:pal("Connection ~p exited: ~p", [Conn1, Reason1_1]),
@@ -893,7 +870,7 @@ temporary_queue_after_partition_recovery_2(Config, QueueDeclare) ->
     after Timeout ->
               ct:fail("Connection ~p still running", [Conn1])
     end,
-    ct:pal("Wait for queue DOWN"),
+    ct:pal("Wait for queue 1 DOWN"),
     receive
         {'DOWN', QMRef1, _, _, Reason1_2} ->
             ct:pal("Queue ~p exited: ~p", [QPid1, Reason1_2]),
@@ -903,19 +880,35 @@ temporary_queue_after_partition_recovery_2(Config, QueueDeclare) ->
               ct:fail("Queue ~p still running", [QPid1])
     end,
 
-    %% The first queue was deleted from the metadata store on all nodes.
-    lists:foreach(
-      fun(Node) ->
-              ?awaitMatch(
-                 {error, not_found},
-                 begin
-                     Ret = rabbit_ct_broker_helpers:rpc(
-                             Config, Node, rabbit_amqqueue, lookup, [QName1]),
-                     ct:pal("Queue lookup on node ~0p: ~p", [Node, Ret]),
-                     Ret
-                 end, Timeout)
-      end, Nodes),
+    %% We sleep long enough for the queue record deletion to hit the Khepri
+    %% timeout. The deletion is expected to be retried indefinitely.
+    KhepriTimeout = rabbit_ct_broker_helpers:rpc(
+                      Config, Node2, khepri_app, get_default_timeout, []),
+    ct:pal("Sleep > ~b ms", [KhepriTimeout]),
+    timer:sleep(KhepriTimeout + 10000),
+
+    %% The queue process exited but the queue record should still be there. The
+    %% temporary process is still trying to delete it but can't during the
+    %% network partition.
+    ?awaitMatch(
+       {ok, _},
+       begin
+           Ret = rabbit_ct_broker_helpers:rpc(
+                   Config, Node2, rabbit_amqqueue, lookup, [QName1]),
+           ct:pal("Queue lookup on node ~0p: ~p", [Node2, Ret]),
+           Ret
+       end, Timeout),
 
+    %% Close the second consuming client to trigger the queue deletion during
+    %% the network partition. This time, the partition is resolved while the
+    %% queue process tries to delete the record.
+    ct:pal("Close connection 2"),
+    _ = spawn(fun() ->
+                      rabbit_ct_client_helpers:close_connection_and_channel(
+                        Conn2, Ch2)
+              end),
+
+    ct:pal("Wait for connection 2 DOWN"),
     receive
         {'DOWN', CMRef2, _, _, Reason2_1} ->
             ct:pal("Connection ~p exited: ~p", [Conn2, Reason2_1]),
@@ -924,6 +917,7 @@ temporary_queue_after_partition_recovery_2(Config, QueueDeclare) ->
     after Timeout ->
               ct:fail("Connection ~p still running", [Conn2])
     end,
+    ct:pal("Wait for queue 2 DOWN"),
     receive
         {'DOWN', QMRef2, _, _, Reason2_2} ->
             ct:pal("Queue ~p exited: ~p", [QPid2, Reason2_2]),
@@ -933,6 +927,41 @@ temporary_queue_after_partition_recovery_2(Config, QueueDeclare) ->
               ct:fail("Queue ~p still running", [QPid2])
     end,
 
+    %% Again, the queue process exited but the queue record should still be
+    %% there. The temporary process is still trying to delete it but can't
+    %% during the network partition.
+    ?awaitMatch(
+       {ok, _},
+       begin
+           Ret = rabbit_ct_broker_helpers:rpc(
+                   Config, Node2, rabbit_amqqueue, lookup, [QName2]),
+           ct:pal("Queue lookup on node ~0p: ~p", [Node2, Ret]),
+           Ret
+       end, Timeout),
+
+    %% We resolve the network partition.
+    lists:foreach(
+      fun(Node) ->
+              ct:pal("Allow traffic with ~s", [Node]),
+              rabbit_ct_broker_helpers:allow_traffic_between(
+                Node2, Node)
+      end, Majority),
+    ct:pal("Cluster status"),
+    clustering_utils:assert_cluster_status({Nodes, Nodes}, Nodes),
+
+    %% The first queue was deleted from the metadata store on all nodes.
+    lists:foreach(
+      fun(Node) ->
+              ?awaitMatch(
+                 {error, not_found},
+                 begin
+                     Ret = rabbit_ct_broker_helpers:rpc(
+                             Config, Node, rabbit_amqqueue, lookup, [QName1]),
+                     ct:pal("Queue lookup on node ~0p: ~p", [Node, Ret]),
+                     Ret
+                 end, Timeout)
+      end, Nodes),
+
     %% The second queue was deleted from the metadata store on all nodes.
     lists:foreach(
       fun(Node) ->