rabbitmq · Ayanda-D · Jun 27, 2024 · Jun 27, 2024 · Jun 28, 2024 · Jun 28, 2024
diff --git a/deps/rabbit/src/amqqueue.erl b/deps/rabbit/src/amqqueue.erl
@@ -70,6 +70,7 @@
          set_immutable/1,
          qnode/1,
          to_printable/1,
+         to_printable/2,
          macros/0]).
 
 -define(record_version, amqqueue_v2).
@@ -564,6 +565,14 @@ to_printable(#amqqueue{name = QName = #resource{name = Name},
        <<"virtual_host">> => VHost,
        <<"type">> => Type}.
 
+%% Builds the printable (binary-keyed map) description of a queue from its
+%% resource name and queue type, for callers that only hold the name rather
+%% than a full amqqueue record.
+%% The result of rabbit_queue_type:discover/1 is discarded; the call is kept
+%% only for whatever checking it performs on Type (NOTE(review): presumably it
+%% fails for unknown queue types - confirm).
+-spec to_printable(rabbit_types:r(queue), atom() | binary()) -> #{binary() => any()}.
+to_printable(QName = #resource{name = Name, virtual_host = VHost}, Type) ->
+    _ = rabbit_queue_type:discover(Type),
+    #{<<"readable_name">> => rabbit_data_coercion:to_binary(rabbit_misc:rs(QName)),
+      <<"name">> => Name,
+      <<"virtual_host">> => VHost,
+      <<"type">> => Type}.
+
 % private
 
 macros() ->

diff --git a/deps/rabbit/src/rabbit_quorum_queue.erl b/deps/rabbit/src/rabbit_quorum_queue.erl
@@ -82,6 +82,9 @@
          file_handle_other_reservation/0,
          file_handle_release_reservation/0]).
 
+-export([leader_health_check/2,
+         run_leader_health_check/4]).
+
 -ifdef(TEST).
 -export([filter_promotable/2,
          ra_machine_config/1]).
@@ -144,6 +147,8 @@
 -define(SNAPSHOT_INTERVAL, 8192). %% the ra default is 4096
 % -define(UNLIMITED_PREFETCH_COUNT, 2000). %% something large for ra
 -define(MIN_CHECKPOINT_INTERVAL, 8192). %% the ra default is 16384
+-define(LEADER_HEALTH_CHECK_TIMEOUT, 1_000). %% timeout handed to ra_server_proc:ping/2 for each leader (presumably ms)
+-define(GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT, 60_000). %% ms the collector waits for each next reply (receive ... after), not a total budget
 
 %%----------- QQ policies ---------------------------------------------------
 
@@ -2145,3 +2150,76 @@ file_handle_other_reservation() ->
 file_handle_release_reservation() ->
     ok.
 
+%% Entry point for the quorum queue leader health check.
+%% Derives the process-count ceiling (40% of the VM process limit) and delegates
+%% to leader_health_check/3; above that ceiling no checker processes are started.
+leader_health_check(QueueNameOrRegEx, VHost) ->
+    VmProcessLimit = erlang:system_info(process_limit),
+    leader_health_check(QueueNameOrRegEx, VHost, round(0.4 * VmProcessLimit)).
+
+%% Starts one checker process per quorum queue in scope whose name matches
+%% QueueNameOrRegEx, collects their replies and returns the printable
+%% descriptions of the queues whose leader check reported an error.
+%% VHost is either the atom `global` (all virtual hosts) or a virtual host
+%% name as a binary. The unhealthy list is also logged from a separate process.
+leader_health_check(QueueNameOrRegEx, VHost, ProcessLimitThreshold) ->
+    Queues = case VHost of
+                 global -> rabbit_amqqueue:list();
+                 _ when is_binary(VHost) -> rabbit_amqqueue:list(VHost)
+             end,
+    %% The safety check counts every listed queue, not only the matching quorum queues.
+    check_process_limit_safety(length(Queues), ProcessLimitThreshold),
+    Caller = self(),
+    Ref = make_ref(),
+    StartChecker =
+        fun(Q) ->
+            case amqqueue:get_type(Q) == ?MODULE of
+                false ->
+                    false;
+                true ->
+                    {resource, _, queue, Name} = Resource = amqqueue:get_name(Q),
+                    case re:run(Name, QueueNameOrRegEx, [{capture, none}]) of
+                        match ->
+                            {Cluster, _} = rabbit_amqqueue:pid_of(Q),
+                            {true, spawn(fun() -> run_leader_health_check(Cluster, Resource, Ref, Caller) end)};
+                        _ ->
+                            false
+                    end
+            end
+        end,
+    Checkers = lists:filtermap(StartChecker, Queues),
+    Unhealthy = wait_for_leader_health_checks(Ref, length(Checkers), []),
+    _ = spawn(fun() -> maybe_log_leader_health_check_result(Unhealthy) end),
+    Unhealthy.
+
+%% Body of a single checker process: looks up the leader recorded for
+%% ClusterName, pings it, and reports the outcome to From as
+%% {ok | error, HealthCheckRef, QResource}. Only a `{pong, leader}` answer
+%% counts as healthy; every other ping result is reported as `error`.
+run_leader_health_check(ClusterName, QResource, HealthCheckRef, From) ->
+    Leader = ra_leaderboard:lookup_leader(ClusterName),
+    Outcome = case ra_server_proc:ping(Leader, ?LEADER_HEALTH_CHECK_TIMEOUT) of
+                  {pong, leader} -> ok;
+                  _ -> error
+              end,
+    From ! {Outcome, HealthCheckRef, QResource},
+    ok.
+
+%% Collects the replies of N checker processes tagged with Ref and returns the
+%% printable descriptions of the queues that reported `error`.
+%% The timeout applies to each wait for the next reply: if none arrives within
+%% ?GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT ms, collection stops and the entries
+%% accumulated so far are returned, so checkers that never replied are not
+%% represented in the result.
+wait_for_leader_health_checks(_Ref, 0, UnhealthyAcc) -> UnhealthyAcc;
+wait_for_leader_health_checks(Ref, N, UnhealthyAcc) ->
+    receive
+        {ok, Ref, _QResource} ->
+            wait_for_leader_health_checks(Ref, N - 1, UnhealthyAcc);
+        {error, Ref, QResource} ->
+            wait_for_leader_health_checks(Ref, N - 1, [amqqueue:to_printable(QResource, ?MODULE) | UnhealthyAcc])
+    after
+        ?GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT ->
+            UnhealthyAcc
+    end.
+
+%% Refuses to run the health check when the current process count plus one
+%% process per queue would reach ProcessLimitThreshold: logs a warning and
+%% throws {error, leader_health_check_process_limit_exceeded}. Returns `ok`
+%% otherwise.
+check_process_limit_safety(QCount, ProcessLimitThreshold) ->
+    ProjectedCount = erlang:system_info(process_count) + QCount,
+    if
+        ProjectedCount < ProcessLimitThreshold ->
+            ok;
+        true ->
+            rabbit_log:warning("Leader health check not permitted, process limit threshold will be exceeded."),
+            throw({error, leader_health_check_process_limit_exceeded})
+    end.
+
+%% Logs a warning listing the readable names of the queues whose leader failed
+%% the health check. Nothing is logged for an empty result.
+maybe_log_leader_health_check_result([]) -> ok;
+maybe_log_leader_health_check_result(Result) ->
+    %% maps:get/3 with a default replaces an old-style `catch`, which would have
+    %% written an {'EXIT', ...} tuple into the log line if the key were missing.
+    Qs = [maps:get(<<"readable_name">>, R, undefined) || R <- Result],
+    rabbit_log:warning("Leader health check result (unhealthy leaders detected): ~tp", [Qs]).
diff --git a/deps/rabbit/test/quorum_queue_SUITE.erl b/deps/rabbit/test/quorum_queue_SUITE.erl
@@ -192,7 +192,8 @@ all_tests() ->
      priority_queue_2_1_ratio,
      requeue_multiple_true,
      requeue_multiple_false,
-     subscribe_from_each
+     subscribe_from_each,
+     leader_health_check
     ].
 
 memory_tests() ->
@@ -4145,6 +4146,129 @@ amqpl_headers(Config) ->
     ok = amqp_channel:cast(Ch, #'basic.ack'{delivery_tag = DeliveryTag,
                                             multiple = true}).
 
+%% Exercises rabbit_quorum_queue:leader_health_check/2 on node 0:
+%%  1. empty virtual hosts report no unhealthy leaders;
+%%  2. with three quorum queues declared in each of two virtual hosts, every
+%%     pattern/scope combination reports no unhealthy leaders;
+%%  3. after clearing the ra leaderboard entry of every quorum queue, the same
+%%     combinations report exactly the affected queues.
+leader_health_check(Config) ->
+    VHost1 = <<"vhost1">>,
+    VHost2 = <<"vhost2">>,
+
+    set_up_vhost(Config, VHost1),
+    set_up_vhost(Config, VHost2),
+
+    %% check empty vhost
+    ?assertEqual([],
+        rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+            [<<".*">>, VHost1])),
+    ?assertEqual([],
+        rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+            [<<".*">>, global])),
+
+    Conn1 = rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0, VHost1),
+    {ok, Ch1} = amqp_connection:open_channel(Conn1),
+
+    Conn2 = rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0, VHost2),
+    {ok, Ch2} = amqp_connection:open_channel(Conn2),
+
+    Qs1 = [<<"Q.1">>, <<"Q.2">>, <<"Q.3">>],
+    Qs2 = [<<"Q.4">>, <<"Q.5">>, <<"Q.6">>],
+
+    %% in vhost1
+    [?assertEqual({'queue.declare_ok', Q, 0, 0},
+                 declare(Ch1, Q, [{<<"x-queue-type">>, longstr, <<"quorum">>}]))
+        || Q <- Qs1],
+
+    %% in vhost2
+    [?assertEqual({'queue.declare_ok', Q, 0, 0},
+                 declare(Ch2, Q, [{<<"x-queue-type">>, longstr, <<"quorum">>}]))
+        || Q <- Qs2],
+
+    %% test successful health checks in vhost1, vhost2, global
+    ?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<".*">>, VHost1])),
+    ?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.*">>, VHost1])),
+    [?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [Q, VHost1])) || Q <- Qs1],
+
+    ?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<".*">>, VHost2])),
+    ?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.*">>, VHost2])),
+    [?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [Q, VHost2])) || Q <- Qs2],
+
+    ?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<".*">>, global])),
+    ?assertEqual([], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.*">>, global])),
+
+    %% clear leaderboard
+    Qs = rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_amqqueue, list, []),
+
+    %% The six-element match asserts that exactly the queues declared above exist.
+    [{_Q1_ClusterName, _Q1Res},
+     {_Q2_ClusterName, _Q2Res},
+     {_Q3_ClusterName, _Q3Res},
+     {_Q4_ClusterName, _Q4Res},
+     {_Q5_ClusterName, _Q5Res},
+     {_Q6_ClusterName, _Q6Res}] = QQ_Clusters =
+        lists:usort(
+            [begin
+                {ClusterName, _} = amqqueue:get_pid(Q),
+                {ClusterName, amqqueue:get_name(Q)}
+            end
+                || Q <- Qs, amqqueue:get_type(Q) == rabbit_quorum_queue]),
+
+    [Q1Data, Q2Data, Q3Data, Q4Data, Q5Data, Q6Data] = QQ_Data =
+        [begin
+            rabbit_ct_broker_helpers:rpc(Config, 0, ra_leaderboard, clear, [Q_ClusterName]),
+            _QData = amqqueue:to_printable(Q_Res, rabbit_quorum_queue)
+         end
+            || {Q_ClusterName, Q_Res} <- QQ_Clusters],
+
+    %% test failed health checks in vhost1, vhost2, global
+    ?assertEqual([Q1Data], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.1">>, VHost1])),
+    ?assertEqual([Q2Data], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.2">>, VHost1])),
+    ?assertEqual([Q3Data], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.3">>, VHost1])),
+    ?assertEqual([Q1Data, Q2Data, Q3Data],
+        lists:usort(rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                        [<<".*">>, VHost1]))),
+    ?assertEqual([Q1Data, Q2Data, Q3Data],
+        lists:usort(rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                        [<<"Q.*">>, VHost1]))),
+
+    ?assertEqual([Q4Data], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.4">>, VHost2])),
+    ?assertEqual([Q5Data], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.5">>, VHost2])),
+    ?assertEqual([Q6Data], rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                                      [<<"Q.6">>, VHost2])),
+    ?assertEqual([Q4Data, Q5Data, Q6Data],
+        lists:usort(rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                        [<<".*">>, VHost2]))),
+    ?assertEqual([Q4Data, Q5Data, Q6Data],
+        lists:usort(rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                        [<<"Q.*">>, VHost2]))),
+
+    %% Both patterns are checked globally, mirroring the success section above.
+    ?assertEqual(QQ_Data,
+        lists:usort(rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                        [<<".*">>, global]))),
+    ?assertEqual(QQ_Data,
+        lists:usort(rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_quorum_queue, leader_health_check,
+                        [<<"Q.*">>, global]))),
+
+    %% cleanup: each queue is deleted through the channel of the vhost it lives in
+    [?assertMatch(#'queue.delete_ok'{},
+                 amqp_channel:call(Ch1, #'queue.delete'{queue = Q}))
+        || Q <- Qs1],
+    [?assertMatch(#'queue.delete_ok'{},
+                 amqp_channel:call(Ch2, #'queue.delete'{queue = Q}))
+        || Q <- Qs2],
+
+    amqp_connection:close(Conn1),
+    amqp_connection:close(Conn2).
+
+
 leader_locator_client_local(Config) ->
     [Server1 | _] = Servers = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
     Q = ?config(queue_name, Config),
@@ -4465,6 +4589,11 @@ declare_passive(Ch, Q, Args) ->
                                            auto_delete = false,
                                            passive = true,
                                            arguments = Args}).
+
+%% Adds VHost to the test broker and gives the "guest" user full permissions on it.
+set_up_vhost(Config, VHost) ->
+    User = <<"guest">>,
+    rabbit_ct_broker_helpers:add_vhost(Config, VHost),
+    rabbit_ct_broker_helpers:set_full_permissions(Config, User, VHost).
+
 assert_queue_type(Server, Q, Expected) ->
     assert_queue_type(Server, <<"/">>, Q, Expected).
 

diff --git a/deps/rabbitmq_cli/lib/rabbitmq/cli/core/output.ex b/deps/rabbitmq_cli/lib/rabbitmq/cli/core/output.ex
@@ -18,6 +18,10 @@ defmodule RabbitMQ.CLI.Core.Output do
     :ok
   end
 
+  # A result tagged :check_passed that carries a payload is formatted like a
+  # plain {:ok, output}; the :check_passed marker is dropped from the result.
+  def format_output({:ok, :check_passed, output}, formatter, options) do
+    formatted = formatter.format_output(output, options)
+    {:ok, formatted}
+  end
+
   def format_output({:ok, output}, formatter, options) do
     {:ok, formatter.format_output(output, options)}
   end

diff --git a/deps/rabbitmq_cli/lib/rabbitmq/cli/diagnostics/commands/leader_health_check_command.ex b/deps/rabbitmq_cli/lib/rabbitmq/cli/diagnostics/commands/leader_health_check_command.ex
@@ -0,0 +1,105 @@
+## This Source Code Form is subject to the terms of the Mozilla Public
+## License, v. 2.0. If a copy of the MPL was not distributed with this
+## file, You can obtain one at https://mozilla.org/MPL/2.0/.
+##
+## Copyright (c) 2007-2025 VMware, Inc. or its affiliates.  All rights reserved.
+
+# Diagnostics command that asks a node to health-check the leaders of the
+# quorum queues whose names match a regular expression, either in a single
+# virtual host or, with --global, across all virtual hosts.
+defmodule RabbitMQ.CLI.Diagnostics.Commands.LeaderHealthCheckCommand do
+  alias RabbitMQ.CLI.Core.DocGuide
+
+  @behaviour RabbitMQ.CLI.CommandBehaviour
+
+  import RabbitMQ.CLI.Core.Platform, only: [line_separator: 0]
+
+  def switches(), do: [global: :boolean]
+
+  def scopes(), do: [:diagnostics]
+
+  # User-supplied options take precedence over the defaults
+  # (no --global, virtual host "/").
+  def merge_defaults(args, opts) do
+    defaults = %{global: false, vhost: "/"}
+    {args, Map.merge(defaults, opts)}
+  end
+
+  use RabbitMQ.CLI.Core.AcceptsOnePositionalArgument
+  use RabbitMQ.CLI.Core.RequiresRabbitAppRunning
+
+  # Calls rabbit_quorum_queue:leader_health_check/2 on the target node.
+  # An empty list becomes :ok; any other reply is wrapped in {:error, reply}.
+  # NOTE(review): a non-list reply (e.g. an RPC failure) is wrapped the same way,
+  # and output/2 in this module only has {:error, list} clauses - confirm how
+  # such replies are meant to be rendered.
+  def run([pattern] = _args, %{node: node_name, vhost: vhost, global: global_opt}) do
+    scope =
+      if global_opt do
+        :global
+      else
+        vhost
+      end
+
+    rpc_args = [pattern, scope]
+
+    case :rabbit_misc.rpc_call(node_name, :rabbit_quorum_queue, :leader_health_check, rpc_args) do
+      [] -> :ok
+      reply -> {:error, reply}
+    end
+  end
+
+  # Clause order matters: the "json" formatter is matched first, then silent
+  # mode, then the plain-text fallback.
+  def output(:ok, %{node: node_name, formatter: "json"}) do
+    message = "Node #{node_name} reported all quorum queue leaders as healthy"
+    {:ok, %{"result" => "ok", "message" => message}}
+  end
+
+  def output(:ok, %{silent: true}), do: {:ok, :check_passed}
+
+  def output(:ok, %{node: node_name}) do
+    {:ok, "Node #{node_name} reported all quorum queue leaders as healthy"}
+  end
+
+  # NOTE(review): unhealthy results are also returned tagged :check_passed -
+  # confirm that this is the intended outcome for a failed check.
+  def output({:error, unhealthy_queues}, %{node: node_name, formatter: "json"}) when is_list(unhealthy_queues) do
+    message = "Node #{node_name} reported unhealthy quorum queue leaders"
+
+    payload = %{
+      "result" => "error",
+      "queues" => unhealthy_queues,
+      "message" => message
+    }
+
+    {:ok, :check_passed, payload}
+  end
+
+  def output({:error, unhealthy_queues}, %{silent: true}) when is_list(unhealthy_queues) do
+    {:ok, :check_passed}
+  end
+
+  def output({:error, unhealthy_queues}, %{vhost: _vhost}) when is_list(unhealthy_queues) do
+    text =
+      unhealthy_queues
+      |> queue_lines()
+      |> Enum.join(line_separator())
+
+    {:ok, :check_passed, text}
+  end
+
+  def formatter(), do: RabbitMQ.CLI.Formatters.PrettyTable
+
+  def usage(), do: "leader_health_check [--vhost <vhost>] [--global] <pattern>"
+
+  def usage_additional do
+    pattern_help = ["<pattern>", "regular expression pattern used to match quorum queues"]
+    global_help = ["--global", "run leader health check for all queues in all virtual hosts on the node"]
+
+    [pattern_help, global_help]
+  end
+
+  def help_section(), do: :observability_and_health_checks
+
+  def usage_doc_guides(), do: [DocGuide.quorum_queues()]
+
+  def description(), do: "Checks availability and health status of quorum queue leaders"
+
+  def banner([name], %{global: true}) do
+    "Checking availability and health status of leaders for quorum queues matching #{name} in all vhosts ..."
+  end
+
+  def banner([name], %{vhost: vhost}) do
+    "Checking availability and health status of leaders for quorum queues matching #{name} in vhost #{vhost} ..."
+  end
+
+  # One human-readable line per unhealthy queue, keyed on its "readable_name".
+  def queue_lines(qs) do
+    Enum.map(qs, fn q -> "Leader for #{q["readable_name"]} is unhealthy and unavailable" end)
+  end
+end