From febe42d742176b71613dc4c454924b66dc20df20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Fri, 7 Feb 2025 16:44:37 +0100 Subject: [PATCH 01/10] rabbit_db: `force_reset` command is unsupported with Khepri [Why] The `force_reset` command simply removes local files on disk for the local node. In the case of Ra, this can't work because the rest of the cluster does not know about the forced-reset node. Therefore the leader will continue to send `append_entry` commands to the reset node. If that forced-reset node restarts and receives these messages, it will either join the cluster again (because it's on an older Raft term) or it will hit an assertion and exit (because it's on the same Raft term). [How] Given we can't really support this scenario and it has little value, the command will now return an error if someone attempts a `force_reset` with a node running Khepri. This also deprecates the command: once Mnesia support is removed, the command will be removed at the same time. This is noted in the rabbitmqctl.8 manpage. (cherry picked from commit c78aec7d48e1c07ea6fcdf078bf2c4fe10df379b) --- deps/rabbit/docs/rabbitmqctl.8 | 9 +++++++- deps/rabbit/src/rabbit_db.erl | 8 ++++--- deps/rabbit/src/rabbit_khepri.erl | 20 +----------------- .../test/clustering_management_SUITE.erl | 21 +++++-------------- .../src/rabbit_ct_broker_helpers.erl | 5 ----- 5 files changed, 19 insertions(+), 44 deletions(-) diff --git a/deps/rabbit/docs/rabbitmqctl.8 b/deps/rabbit/docs/rabbitmqctl.8 index fd7b5f31ef60..a61fc9348999 100644 --- a/deps/rabbit/docs/rabbitmqctl.8 +++ b/deps/rabbit/docs/rabbitmqctl.8 @@ -346,7 +346,7 @@ next time it is started: .sp .Dl rabbitmqctl force_boot .\" ------------------------------------------------------------------ -.It Cm force_reset +.It Cm force_reset Em (deprecated) .Pp Forcefully returns a RabbitMQ node to its virgin state. .Pp @@ -359,6 +359,13 @@ management database state and cluster configuration. 
It should only be used as a last resort if the database or cluster configuration has been corrupted. .Pp +The +.Cm force_reset +command is +.Sy deprecated . +It remains available when the Mnesia metadata store is used. +It is unsupported with the Khepri metadata store. +.Pp For .Cm reset and diff --git a/deps/rabbit/src/rabbit_db.erl b/deps/rabbit/src/rabbit_db.erl index a506c91259a2..2bf52b3a01c8 100644 --- a/deps/rabbit/src/rabbit_db.erl +++ b/deps/rabbit/src/rabbit_db.erl @@ -163,11 +163,13 @@ force_reset_using_mnesia() -> #{domain => ?RMQLOG_DOMAIN_DB}), rabbit_mnesia:force_reset(). +-spec force_reset_using_khepri() -> no_return(). + force_reset_using_khepri() -> - ?LOG_DEBUG( - "DB: resetting node forcefully (using Khepri)", + ?LOG_ERROR( + "DB: resetting node forcefully is unsupported with Khepri", #{domain => ?RMQLOG_DOMAIN_DB}), - rabbit_khepri:force_reset(). + throw({error, "Forced reset is unsupported with Khepri"}). -spec force_load_on_next_boot() -> Ret when Ret :: ok. diff --git a/deps/rabbit/src/rabbit_khepri.erl b/deps/rabbit/src/rabbit_khepri.erl index a370914a3a40..5ad603665adb 100644 --- a/deps/rabbit/src/rabbit_khepri.erl +++ b/deps/rabbit/src/rabbit_khepri.erl @@ -168,8 +168,7 @@ -export([check_cluster_consistency/0, check_cluster_consistency/2, node_info/0]). --export([reset/0, - force_reset/0]). +-export([reset/0]). -export([cluster_status_from_khepri/0, cli_cluster_status/0]). @@ -601,23 +600,6 @@ reset() -> %% @private -force_reset() -> - case rabbit:is_running() of - false -> - ok = khepri:stop(?RA_CLUSTER_NAME), - DataDir = maps:get(data_dir, ra_system:fetch(?RA_SYSTEM)), - ok = rabbit_ra_systems:ensure_ra_system_stopped(?RA_SYSTEM), - ok = rabbit_file:recursive_delete( - filelib:wildcard(DataDir ++ "/*")), - - _ = file:delete(rabbit_guid:filename()), - ok; - true -> - throw({error, rabbitmq_unexpectedly_running}) - end. 
- -%% @private - force_shrink_member_to_current_member() -> ok = ra_server_proc:force_shrink_members_to_current_member( {?RA_CLUSTER_NAME, node()}). diff --git a/deps/rabbit/test/clustering_management_SUITE.erl b/deps/rabbit/test/clustering_management_SUITE.erl index b3ebd74eb080..bacd813a2ea9 100644 --- a/deps/rabbit/test/clustering_management_SUITE.erl +++ b/deps/rabbit/test/clustering_management_SUITE.erl @@ -947,22 +947,11 @@ force_reset_node_in_khepri(Config) -> stop_join_start(Config, Rabbit, Hare), stop_app(Config, Rabbit), - ok = force_reset(Config, Rabbit), - assert_cluster_status({[Rabbit, Hare], [Rabbit, Hare], [Hare]}, [Hare]), - %% Khepri is stopped, so it won't report anything. - assert_status({[Rabbit], [], [Rabbit], [Rabbit], []}, [Rabbit]), - %% Hare thinks that Rabbit is still clustered - assert_cluster_status({[Rabbit, Hare], [Rabbit, Hare], [Hare]}, - [Hare]), - ok = start_app(Config, Rabbit), - assert_not_clustered(Rabbit), - %% We can rejoin Rabbit and Hare. Unlike with Mnesia, we try to solve the - %% inconsistency instead of returning an error. - ok = stop_app(Config, Rabbit), - ?assertEqual(ok, join_cluster(Config, Rabbit, Hare, false)), - ok = start_app(Config, Rabbit), - assert_cluster_status({[Rabbit, Hare], [Rabbit, Hare], [Rabbit, Hare]}, - [Rabbit, Hare]). + {error, 69, Msg} = force_reset(Config, Rabbit), + ?assertEqual( + match, + re:run( + Msg, "Forced reset is unsupported with Khepri", [{capture, none}])). 
status_with_alarm(Config) -> [Rabbit, Hare] = rabbit_ct_broker_helpers:get_node_configs(Config, diff --git a/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl b/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl index 87fee4f5ae1d..3b3176d2fee8 100644 --- a/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl +++ b/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl @@ -55,7 +55,6 @@ kill_node_after/3, reset_node/2, - force_reset_node/2, forget_cluster_node/3, forget_cluster_node/4, @@ -2055,10 +2054,6 @@ reset_node(Config, Node) -> Name = get_node_config(Config, Node, nodename), rabbit_control_helper:command(reset, Name). -force_reset_node(Config, Node) -> - Name = get_node_config(Config, Node, nodename), - rabbit_control_helper:command(force_reset, Name). - forget_cluster_node(Config, Node, NodeToForget) -> forget_cluster_node(Config, Node, NodeToForget, []). forget_cluster_node(Config, Node, NodeToForget, Opts) -> From d1a1f97971123e57adae1c7a5c353a982d885069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Tue, 11 Feb 2025 14:50:54 +0100 Subject: [PATCH 02/10] rabbit_stream_queue_SUITE: Swap uses of node 2 and 3 in `format` [Why] We hit some transient errors with the previous order when doing mixed-version testing. Swapping the nodes seems to fix the problem. 
(cherry picked from commit 5cbda4c838591373b254d091f9775f1cf6e6ba40) --- deps/rabbit/test/rabbit_stream_queue_SUITE.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl index 2111e8e51cbf..9b7f8c2c3896 100644 --- a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl +++ b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl @@ -1563,13 +1563,13 @@ format(Config) -> case length(Nodes) of 3 -> [_, Server2, Server3] = Nodes, - ok = rabbit_control_helper:command(stop_app, Server2), ok = rabbit_control_helper:command(stop_app, Server3), + ok = rabbit_control_helper:command(stop_app, Server2), Fmt2 = rabbit_ct_broker_helpers:rpc(Config, Server, rabbit_stream_queue, ?FUNCTION_NAME, [QRecord, #{}]), - ok = rabbit_control_helper:command(start_app, Server2), ok = rabbit_control_helper:command(start_app, Server3), + ok = rabbit_control_helper:command(start_app, Server2), ?assertEqual(stream, proplists:get_value(type, Fmt2)), ?assertEqual(minority, proplists:get_value(state, Fmt2)), ?assertEqual(Server, proplists:get_value(leader, Fmt2)), From 3061b4409929a4ee9046cbb11e0ebe90c586358d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Wed, 12 Feb 2025 17:13:24 +0100 Subject: [PATCH 03/10] Skip peer discovery clustering tests if multiple Khepri machine versions ... are being used at the same time. [Why] Depending on which node clusters with which, a node running an older version of the Khepri Ra machine may not be able to apply Ra commands and could be stuck. There is no real solution and this is clearly an unsupported scenario. An old node won't always be able to join a newer cluster. [How] In the testsuites, we skip clustering tests if we detect that multiple Khepri Ra machine versions are being used. 
(cherry picked from commit 1f1a13521b5c26904673faac1384ad28199c2fdf) --- .../src/rabbit_ct_broker_helpers.erl | 9 +++++- .../test/system_SUITE.erl | 24 ++++++++++++++-- .../test/system_SUITE.erl | 28 ++++++++++++++++--- 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl b/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl index 3b3176d2fee8..ef1de028a9e2 100644 --- a/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl +++ b/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl @@ -173,7 +173,8 @@ user/1, configured_metadata_store/1, - await_metadata_store_consistent/2 + await_metadata_store_consistent/2, + do_nodes_run_same_ra_machine_version/2 ]). %% Internal functions exported to be used by rpc:call/4. @@ -1070,6 +1071,12 @@ ra_last_applied(ServerId) -> #{last_applied := LastApplied} = ra:key_metrics(ServerId), LastApplied. +do_nodes_run_same_ra_machine_version(Config, RaMachineMod) -> + [MacVer1 | MacVerN] = MacVers = rpc_all(Config, RaMachineMod, version, []), + ct:pal("Ra machine versions of ~s: ~0p", [RaMachineMod, MacVers]), + is_integer(MacVer1) andalso + lists:all(fun(MacVer) -> MacVer =:= MacVer1 end, MacVerN). 
+ rewrite_node_config_file(Config, Node) -> NodeConfig = get_node_config(Config, Node), I = if diff --git a/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl b/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl index 194d6b2e4132..044860906269 100644 --- a/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl +++ b/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl @@ -81,9 +81,27 @@ init_per_testcase(Testcase, Config) case Config3 of _ when is_list(Config3) -> try - _ = rabbit_ct_broker_helpers:rpc_all( - Config3, rabbit_peer_discovery_backend, api_version, []), - Config3 + SameMacVer = ( + rabbit_ct_broker_helpers: + do_nodes_run_same_ra_machine_version( + Config3, khepri_machine)), + case SameMacVer of + true -> + _ = rabbit_ct_broker_helpers:rpc_all( + Config3, + rabbit_peer_discovery_backend, api_version, []), + Config3; + false -> + Config5 = rabbit_ct_helpers:run_steps( + Config3, + rabbit_ct_client_helpers:teardown_steps() + ++ + rabbit_ct_broker_helpers:teardown_steps()), + rabbit_ct_helpers:testcase_finished(Config5, Testcase), + {skip, + "Nodes are using different Khepri Ra machine " + "versions; clustering will likely fail"} + end catch error:{exception, undef, [{rabbit_peer_discovery_backend, api_version, _, _} diff --git a/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl b/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl index 2f7c0bcda85e..3d68526c25a4 100644 --- a/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl +++ b/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl @@ -90,9 +90,27 @@ init_per_testcase(Testcase, Config) case Config3 of _ when is_list(Config3) -> try - _ = rabbit_ct_broker_helpers:rpc_all( - Config3, rabbit_peer_discovery_backend, api_version, []), - Config3 + SameMacVer = ( + rabbit_ct_broker_helpers: + do_nodes_run_same_ra_machine_version( + Config3, khepri_machine)), + case SameMacVer of + true -> + _ = rabbit_ct_broker_helpers:rpc_all( + Config3, + 
rabbit_peer_discovery_backend, api_version, []), + Config3; + false -> + Config5 = rabbit_ct_helpers:run_steps( + Config3, + rabbit_ct_client_helpers:teardown_steps() + ++ + rabbit_ct_broker_helpers:teardown_steps()), + rabbit_ct_helpers:testcase_finished(Config5, Testcase), + {skip, + "Nodes are using different Khepri Ra machine " + "versions; clustering will likely fail"} + end catch error:{exception, undef, [{rabbit_peer_discovery_backend, api_version, _, _} @@ -237,7 +255,9 @@ wait_for_etcd(EtcdEndpoints) -> Timeout = 60000, rabbit_ct_helpers:await_condition( fun() -> - case eetcd:open(test, EtcdEndpoints) of + Ret = eetcd:open(test, EtcdEndpoints), + ct:pal("Ret = ~p", [Ret]), + case Ret of {ok, _Pid} -> true; _ -> false end From 1dbe0d68ec6ce7019d3683287b5fe44eedffd17d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Thu, 13 Feb 2025 10:25:07 +0100 Subject: [PATCH 04/10] clustering_management_SUITE: Use old node as seed node [Why] During mixed-version testing, the old node might not be able to join or rejoin a cluster if the other nodes run a newer Khepri machine version. [How] The old node is used as the cluster seed node and is never touched otherwise. Other nodes are restarted or join the cluster later. (cherry picked from commit e76233a222990ac7575d1a0217ef58e7e20efce8) --- .../test/clustering_management_SUITE.erl | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/deps/rabbit/test/clustering_management_SUITE.erl b/deps/rabbit/test/clustering_management_SUITE.erl index bacd813a2ea9..0289ca64bbe4 100644 --- a/deps/rabbit/test/clustering_management_SUITE.erl +++ b/deps/rabbit/test/clustering_management_SUITE.erl @@ -331,7 +331,7 @@ restart_cluster_node(Config) -> assert_clustered([Rabbit, Hare]). 
join_and_part_cluster_in_khepri(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), @@ -441,38 +441,38 @@ join_to_start_interval(Config) -> assert_clustered([Rabbit, Hare]). join_cluster_in_minority(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), - stop_join_start(Config, Bunny, Rabbit), + stop_join_start(Config, Rabbit, Bunny), assert_clustered([Rabbit, Bunny]), - ok = rabbit_ct_broker_helpers:stop_node(Config, Bunny), + ok = rabbit_ct_broker_helpers:stop_node(Config, Rabbit), ok = stop_app(Config, Hare), - ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)), + ?assertEqual(ok, join_cluster(Config, Hare, Bunny, false)), - ok = rabbit_ct_broker_helpers:start_node(Config, Bunny), + ok = rabbit_ct_broker_helpers:start_node(Config, Rabbit), ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)), ?assertEqual(ok, start_app(Config, Hare)), assert_clustered([Rabbit, Bunny, Hare]). 
join_cluster_with_rabbit_stopped(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), - stop_join_start(Config, Bunny, Rabbit), + stop_join_start(Config, Rabbit, Bunny), assert_clustered([Rabbit, Bunny]), - ok = stop_app(Config, Bunny), + ok = stop_app(Config, Rabbit), ok = stop_app(Config, Hare), - ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)), + ?assertEqual(ok, join_cluster(Config, Hare, Bunny, false)), - ok = start_app(Config, Bunny), + ok = start_app(Config, Rabbit), ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)), ?assertEqual(ok, start_app(Config, Hare)), @@ -1113,7 +1113,7 @@ await_running_count_in_khepri(Config) -> await_running_count, [5, 1000])). start_nodes_in_reverse_order(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), @@ -1136,7 +1136,7 @@ start_nodes_in_reverse_order(Config) -> %% Test booting nodes in the wrong order for Mnesia. Interesting... start_nodes_in_stop_order(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), @@ -1161,7 +1161,7 @@ start_nodes_in_stop_order(Config) -> end. 
start_nodes_in_stop_order_in_khepri(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), @@ -1184,7 +1184,7 @@ start_nodes_in_stop_order_in_khepri(Config) -> %% TODO test force_boot with Khepri involved start_nodes_in_stop_order_with_force_boot(Config) -> - [Rabbit, Hare, Bunny] = cluster_members(Config), + [Rabbit, Bunny, Hare] = cluster_members(Config), assert_not_clustered(Rabbit), assert_not_clustered(Hare), assert_not_clustered(Bunny), From d32de9143db6900cef0eb48995e8a1dbc34b284d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Thu, 13 Feb 2025 10:39:54 +0100 Subject: [PATCH 05/10] clustering_management_SUITE: Skip `start_with_invalid_schema_in_path` with Khepri [Why] This test plays with the Mnesia database explicitly. (cherry picked from commit f088c4f5444f123cdbd8e08fc73cd48390fe0765) --- deps/rabbit/test/clustering_management_SUITE.erl | 1 - 1 file changed, 1 deletion(-) diff --git a/deps/rabbit/test/clustering_management_SUITE.erl b/deps/rabbit/test/clustering_management_SUITE.erl index 0289ca64bbe4..2fc9be09fe54 100644 --- a/deps/rabbit/test/clustering_management_SUITE.erl +++ b/deps/rabbit/test/clustering_management_SUITE.erl @@ -76,7 +76,6 @@ groups() -> status_with_alarm, pid_file_and_await_node_startup_in_khepri, await_running_count_in_khepri, - start_with_invalid_schema_in_path, persistent_cluster_id, stop_start_cluster_node, restart_cluster_node, From c4eb581cf3e0917ba05dea7d9520edf0ababaddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Thu, 13 Feb 2025 15:37:39 +0100 Subject: [PATCH 06/10] Increase the TCP ports range used by parallel-ct-set-* [Why] We see nodes trying to use busy ports in CI from time to time. 
(cherry picked from commit e76c2271317075c28b0c8dfd97fe28b50c157001) --- deps/rabbit/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deps/rabbit/Makefile b/deps/rabbit/Makefile index c8da33210061..5d6ac4f6183a 100644 --- a/deps/rabbit/Makefile +++ b/deps/rabbit/Makefile @@ -241,10 +241,10 @@ define ct_master.erl peer:call(Pid2, net_kernel, set_net_ticktime, [5]), peer:call(Pid3, net_kernel, set_net_ticktime, [5]), peer:call(Pid4, net_kernel, set_net_ticktime, [5]), - peer:call(Pid1, persistent_term, put, [rabbit_ct_tcp_port_base, 23000]), - peer:call(Pid2, persistent_term, put, [rabbit_ct_tcp_port_base, 25000]), - peer:call(Pid3, persistent_term, put, [rabbit_ct_tcp_port_base, 27000]), - peer:call(Pid4, persistent_term, put, [rabbit_ct_tcp_port_base, 29000]), + peer:call(Pid1, persistent_term, put, [rabbit_ct_tcp_port_base, 16000]), + peer:call(Pid2, persistent_term, put, [rabbit_ct_tcp_port_base, 20000]), + peer:call(Pid3, persistent_term, put, [rabbit_ct_tcp_port_base, 24000]), + peer:call(Pid4, persistent_term, put, [rabbit_ct_tcp_port_base, 28000]), [{[_], {ok, Results}}] = ct_master_fork:run("$1"), peer:stop(Pid4), peer:stop(Pid3), From 7f6a797816320927b63cd5c227c279b7ebcd82b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Fri, 14 Feb 2025 11:41:57 +0100 Subject: [PATCH 07/10] rabbit_stream_queue_SUITE: Fix recursion issue ... in retry_if_coordinator_unavailable(). 
(cherry picked from commit ee0b5b5f323abd23f1ec758aea5b5ab344b3c393) --- deps/rabbit/test/rabbit_stream_queue_SUITE.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl index 9b7f8c2c3896..3183a8783ebf 100644 --- a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl +++ b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl @@ -2741,7 +2741,7 @@ retry_if_coordinator_unavailable(Config, Server, Cmd, Retry) -> case re:run(Msg, ".*coordinator_unavailable.*", [{capture, none}]) of match -> ct:pal("Attempt to execute command ~p failed, coordinator unavailable", [Cmd]), - retry_if_coordinator_unavailable(Config, Ch, Cmd, Retry - 1); + retry_if_coordinator_unavailable(Config, Server, Cmd, Retry - 1); _ -> exit(Error) end From a1f918a5b0642103d864e5b736b9c36962c58f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Fri, 14 Feb 2025 14:56:20 +0100 Subject: [PATCH 08/10] amqp_auth_SUITE: Handle error in init_per_group/2 (cherry picked from commit b7c9e648ea7f72d9ede3cfa2efec1d9f25f97c9e) --- deps/rabbit/test/amqp_auth_SUITE.erl | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/deps/rabbit/test/amqp_auth_SUITE.erl b/deps/rabbit/test/amqp_auth_SUITE.erl index f3cbdbf7d996..f272f6ce22b7 100644 --- a/deps/rabbit/test/amqp_auth_SUITE.erl +++ b/deps/rabbit/test/amqp_auth_SUITE.erl @@ -112,12 +112,17 @@ init_per_group(Group, Config0) -> Config1, rabbit_ct_broker_helpers:setup_steps() ++ rabbit_ct_client_helpers:setup_steps()), - Vhost = <<"test vhost">>, - User = <<"test user">>, - ok = rabbit_ct_broker_helpers:add_vhost(Config, Vhost), - ok = rabbit_ct_broker_helpers:add_user(Config, User), - [{test_vhost, Vhost}, - {test_user, User}] ++ Config. 
+ case Config of + _ when is_list(Config) -> + Vhost = <<"test vhost">>, + User = <<"test user">>, + ok = rabbit_ct_broker_helpers:add_vhost(Config, Vhost), + ok = rabbit_ct_broker_helpers:add_user(Config, User), + [{test_vhost, Vhost}, + {test_user, User}] ++ Config; + {skip, _} = Skip -> + Skip + end. end_per_group(_Group, Config) -> ok = rabbit_ct_broker_helpers:delete_user(Config, ?config(test_user, Config)), From 3c0d8924a044574778db61b32be3ae989189124d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Fri, 14 Feb 2025 15:23:50 +0100 Subject: [PATCH 09/10] unit_credit_flow_SUITE: Greatly reduce time trap (cherry picked from commit 64b68e5d9ceb85bf7b6fb3391c4ed0136b361b8d) --- deps/rabbit/test/unit_credit_flow_SUITE.erl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deps/rabbit/test/unit_credit_flow_SUITE.erl b/deps/rabbit/test/unit_credit_flow_SUITE.erl index 189d0287290d..bdc3a0679b85 100644 --- a/deps/rabbit/test/unit_credit_flow_SUITE.erl +++ b/deps/rabbit/test/unit_credit_flow_SUITE.erl @@ -11,6 +11,9 @@ -compile(export_all). +suite() -> + [{timetrap, {minutes, 3}}]. + all() -> [ {group, sequential_tests} From 02c7b04d4e56b10be44f3ff876ea816291467eca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Fri, 14 Feb 2025 15:36:07 +0100 Subject: [PATCH 10/10] GitHub workflows: List open TCP ports This may help debug nodes that try to open busy ports. 
(cherry picked from commit a5f30ea02ea1576e432c4e6086e0093b80db4b6d) --- .github/workflows/test-make-target.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-make-target.yaml b/.github/workflows/test-make-target.yaml index bce876e24b44..ebf88b3ae108 100644 --- a/.github/workflows/test-make-target.yaml +++ b/.github/workflows/test-make-target.yaml @@ -82,6 +82,7 @@ jobs: - name: RUN TESTS if: inputs.plugin != 'rabbitmq_cli' run: | + sudo netstat -ntp make -C deps/${{ inputs.plugin }} ${{ inputs.make_target }} RABBITMQ_METADATA_STORE=${{ inputs.metadata_store }} # rabbitmq_cli needs a correct broker version for two of its tests.