-
Notifications
You must be signed in to change notification settings - Fork 4k
CLI: new health check that detects QQs without an elected leader #13433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
c26edbe
6cc03b0
76d66a1
857e2a7
6cf9339
96b8bce
239a69b
48ba3e1
7873737
b7dec89
c7da4d5
1736845
1084179
68739a6
5f5e992
ebffd7d
663fc98
df82f12
b2acbae
7a8e166
9bdb81f
6158568
ea07938
a45aa81
bb43c0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -82,6 +82,9 @@ | |
| file_handle_other_reservation/0, | ||
| file_handle_release_reservation/0]). | ||
|
|
||
| -export([leader_health_check/2, | ||
| run_leader_health_check/4]). | ||
|
|
||
| -ifdef(TEST). | ||
| -export([filter_promotable/2, | ||
| ra_machine_config/1]). | ||
|
|
@@ -144,6 +147,8 @@ | |
| -define(SNAPSHOT_INTERVAL, 8192). %% the ra default is 4096 | ||
| % -define(UNLIMITED_PREFETCH_COUNT, 2000). %% something large for ra | ||
| -define(MIN_CHECKPOINT_INTERVAL, 8192). %% the ra default is 16384 | ||
| %% Timeout (ms) for pinging a single quorum queue leader. Kept at 5s: shorter | ||
| %% timeouts report leaders that are merely slow to answer as unhealthy. | ||
| -define(LEADER_HEALTH_CHECK_TIMEOUT, 5_000). | ||
| %% Time (ms) the collecting process waits for the next health check reply | ||
| %% before giving up and returning the results gathered so far. | ||
| -define(GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT, 60_000). | ||
|
|
||
| %%----------- QQ policies --------------------------------------------------- | ||
|
|
||
|
|
@@ -2145,3 +2150,76 @@ file_handle_other_reservation() -> | |
| file_handle_release_reservation() -> | ||
| ok. | ||
|
|
||
| %% Runs the quorum queue leader health check for queues matching | ||
| %% QueueNameOrRegEx in VHost, using a process budget of 40% of the Erlang VM | ||
| %% process limit. Past that budget no health check processes are spawned. | ||
| leader_health_check(QueueNameOrRegEx, VHost) -> | ||
|     leader_health_check(QueueNameOrRegEx, | ||
|                         VHost, | ||
|                         round(0.4 * erlang:system_info(process_limit))). | ||
|
|
||
| %% Checks the leaders of all quorum queues whose name matches QueueNameOrRegEx. | ||
| %% VHost is either a virtual host name (binary) or the atom 'global' for every | ||
| %% virtual host. One process is spawned per matching queue and the caller | ||
| %% collects one reply per spawned process. | ||
| %% Returns the printable descriptions of the queues reported as unhealthy | ||
| %% ([] when none are). check_process_limit_safety/2 throws before anything is | ||
| %% spawned if the number of listed queues would cross ProcessLimitThreshold. | ||
| leader_health_check(QueueNameOrRegEx, VHost, ProcessLimitThreshold) -> | ||
|     Queues = case VHost of | ||
|                  global -> rabbit_amqqueue:list(); | ||
|                  _ when is_binary(VHost) -> rabbit_amqqueue:list(VHost) | ||
|              end, | ||
|     %% The budget is checked against all listed queues, not only the matches. | ||
|     check_process_limit_safety(length(Queues), ProcessLimitThreshold), | ||
|     Caller = self(), | ||
|     Ref = make_ref(), | ||
|     %% Spawns a check for Q when its name matches; yields {true, Pid} or false. | ||
|     StartCheck = | ||
|         fun(Q) -> | ||
|             {resource, _VHostN, queue, QueueName} = QResource = amqqueue:get_name(Q), | ||
|             case re:run(QueueName, QueueNameOrRegEx, [{capture, none}]) of | ||
|                 match -> | ||
|                     {ClusterName, _} = rabbit_amqqueue:pid_of(Q), | ||
|                     Worker = spawn(fun() -> run_leader_health_check(ClusterName, QResource, Ref, Caller) end), | ||
|                     {true, Worker}; | ||
|                 _ -> | ||
|                     false | ||
|             end | ||
|         end, | ||
|     Workers = lists:filtermap( | ||
|                 fun(Q) -> amqqueue:get_type(Q) == ?MODULE andalso StartCheck(Q) end, | ||
|                 Queues), | ||
|     Unhealthy = wait_for_leader_health_checks(Ref, length(Workers), []), | ||
|     %% Logging happens in its own process so the caller gets the result right away. | ||
|     _ = spawn(fun() -> maybe_log_leader_health_check_result(Unhealthy) end), | ||
|     Unhealthy. | ||
|
|
||
| %% Performs the health check for one quorum queue and reports the outcome to | ||
| %% From as {ok, HealthCheckRef, QResource} or {error, HealthCheckRef, QResource}. | ||
| %% The queue is healthy only when the leader known to ra_leaderboard answers the | ||
| %% ping with {pong, leader}. Always returns ok. | ||
| run_leader_health_check(ClusterName, QResource, HealthCheckRef, From) -> | ||
|     Status = | ||
|         try | ||
|             Leader = ra_leaderboard:lookup_leader(ClusterName), | ||
|             ra_server_proc:ping(Leader, ?LEADER_HEALTH_CHECK_TIMEOUT) | ||
|         of | ||
|             {pong, leader} -> ok; | ||
|             _ -> error | ||
|         catch | ||
|             %% This process is not linked or monitored. If it died here the | ||
|             %% collector would never get a reply, would stall until its timeout | ||
|             %% and would leave this queue out of the result. Report it instead. | ||
|             _Class:_Reason -> error | ||
|         end, | ||
|     From ! {Status, HealthCheckRef, QResource}, | ||
|     ok. | ||
|
|
||
| %% Collects N health check replies tagged with Ref and returns the accumulated | ||
| %% list of printable descriptions of the queues reported as {error, Ref, _}. | ||
| %% If no reply arrives within ?GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT the results | ||
| %% gathered so far are returned. | ||
| wait_for_leader_health_checks(_Ref, 0, UnhealthyAcc) -> UnhealthyAcc; | ||
| wait_for_leader_health_checks(Ref, N, UnhealthyAcc) -> | ||
|     receive | ||
|         {ok, Ref, _QResource} -> | ||
|             wait_for_leader_health_checks(Ref, N - 1, UnhealthyAcc); | ||
|         {error, Ref, QResource} -> | ||
|             wait_for_leader_health_checks(Ref, N - 1, [amqqueue:to_printable(QResource, ?MODULE) | UnhealthyAcc]) | ||
|     after | ||
|         ?GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT -> | ||
|             UnhealthyAcc | ||
|     end. | ||
|
|
||
| %% Guards against spawning too many processes: throws | ||
| %% {error, leader_health_check_process_limit_exceeded} (after logging a warning) | ||
| %% when the current process count plus QCount reaches ProcessLimitThreshold, | ||
| %% otherwise returns ok. | ||
| check_process_limit_safety(QCount, ProcessLimitThreshold) -> | ||
|     Projected = erlang:system_info(process_count) + QCount, | ||
|     if | ||
|         Projected >= ProcessLimitThreshold -> | ||
|             rabbit_log:warning("Leader health check not permitted, process limit threshold will be exceeded."), | ||
|             throw({error, leader_health_check_process_limit_exceeded}); | ||
|         true -> | ||
|             ok | ||
|     end. | ||
|
|
||
| %% Logs a warning listing the readable names of the unhealthy queues. | ||
| %% Nothing is logged for an empty result. | ||
| maybe_log_leader_health_check_result([]) -> | ||
|     ok; | ||
| maybe_log_leader_health_check_result(Result) -> | ||
|     %% 'catch' keeps one malformed entry from aborting the whole log line. | ||
|     Names = [(catch maps:get(<<"readable_name">>, Entry)) || Entry <- Result], | ||
|     rabbit_log:warning("Leader health check result (unhealthy leaders detected): ~tp", [Names]). | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| ## This Source Code Form is subject to the terms of the Mozilla Public | ||
| ## License, v. 2.0. If a copy of the MPL was not distributed with this | ||
| ## file, You can obtain one at https://mozilla.org/MPL/2.0/. | ||
| ## | ||
| ## Copyright (c) 2007-2025 VMware, Inc. or its affiliates. All rights reserved. | ||
|
|
||
| defmodule RabbitMQ.CLI.Diagnostics.Commands.LeaderHealthCheckCommand do | ||
|   # Health check command: asks the target node which quorum queues matching a | ||
|   # pattern have an unhealthy leader, and fails the check if any are reported. | ||
|   alias RabbitMQ.CLI.Core.DocGuide | ||
|
|
||
|   @behaviour RabbitMQ.CLI.CommandBehaviour | ||
|
|
||
|   import RabbitMQ.CLI.Core.Platform, only: [line_separator: 0] | ||
|
|
||
|   def switches(), do: [global: :boolean] | ||
|
|
||
|   def scopes(), do: [:diagnostics] | ||
|
|
||
|   # By default only the "/" virtual host is checked. | ||
|   def merge_defaults(args, opts) do | ||
|     {args, Map.merge(%{global: false, vhost: "/"}, opts)} | ||
|   end | ||
|
|
||
|   use RabbitMQ.CLI.Core.AcceptsOnePositionalArgument | ||
|   use RabbitMQ.CLI.Core.RequiresRabbitAppRunning | ||
|
|
||
|   # Returns :ok when the node reports no unhealthy leaders. Any other reply | ||
|   # (a list of unhealthy queues, or an error term) is wrapped in {:error, _}. | ||
|   # NOTE(review): non-list error terms have no matching output/2 clause in this | ||
|   # module -- confirm whether one of the used modules is meant to handle them. | ||
|   def run([pattern] = _args, %{node: node_name, vhost: vhost, global: global_opt}) do | ||
|     # --global overrides --vhost: the node is asked to check every virtual host. | ||
|     vhost = if global_opt, do: :global, else: vhost | ||
|
|
||
|     case :rabbit_misc.rpc_call(node_name, :rabbit_quorum_queue, :leader_health_check, [pattern, vhost]) do | ||
|       [] -> | ||
|         :ok | ||
|
|
||
|       unhealthy_queues_or_error -> | ||
|         {:error, unhealthy_queues_or_error} | ||
|     end | ||
|   end | ||
|
|
||
|   def output(:ok, %{node: node_name, formatter: "json"}) do | ||
|     {:ok, | ||
|      %{ | ||
|        "result" => "ok", | ||
|        "message" => | ||
|          "Node #{node_name} reported all quorum queue leaders as healthy" | ||
|      }} | ||
|   end | ||
|
|
||
|   def output(:ok, %{silent: true}) do | ||
|     {:ok, :check_passed} | ||
|   end | ||
|
|
||
|   def output(:ok, %{node: node_name}) do | ||
|     {:ok, "Node #{node_name} reported all quorum queue leaders as healthy"} | ||
|   end | ||
|
|
||
|   # Unhealthy leaders mean the check failed: report an error, never a pass. | ||
|   def output({:error, unhealthy_queues}, %{node: node_name, formatter: "json"}) when is_list(unhealthy_queues) do | ||
|     {:error, :check_failed, | ||
|      %{ | ||
|        "result" => "error", | ||
|        "queues" => unhealthy_queues, | ||
|        "message" => "Node #{node_name} reported unhealthy quorum queue leaders" | ||
|      }} | ||
|   end | ||
|
|
||
|   def output({:error, unhealthy_queues}, %{silent: true}) when is_list(unhealthy_queues) do | ||
|     {:error, :check_failed} | ||
|   end | ||
|
|
||
|   def output({:error, unhealthy_queues}, %{vhost: _vhost}) when is_list(unhealthy_queues) do | ||
|     lines = queue_lines(unhealthy_queues) | ||
|
|
||
|     {:error, :check_failed, Enum.join(lines, line_separator())} | ||
|   end | ||
|
|
||
|   def formatter(), do: RabbitMQ.CLI.Formatters.PrettyTable | ||
|
|
||
|   def usage() do | ||
|     "leader_health_check [--vhost <vhost>] [--global] <pattern>" | ||
|   end | ||
|
|
||
|   def usage_additional do | ||
|     [ | ||
|       ["<pattern>", "regular expression pattern used to match quorum queues"], | ||
|       ["--global", "run leader health check for all queues in all virtual hosts on the node"] | ||
|     ] | ||
|   end | ||
|
|
||
|   def help_section(), do: :observability_and_health_checks | ||
|
|
||
|   def usage_doc_guides() do | ||
|     [ | ||
|       DocGuide.quorum_queues() | ||
|     ] | ||
|   end | ||
|
|
||
|   def description(), do: "Checks availability and health status of quorum queue leaders" | ||
|
|
||
|   def banner([name], %{global: true}), | ||
|     do: "Checking availability and health status of leaders for quorum queues matching #{name} in all vhosts ..." | ||
|
|
||
|   def banner([name], %{vhost: vhost}), | ||
|     do: "Checking availability and health status of leaders for quorum queues matching #{name} in vhost #{vhost} ..." | ||
|
|
||
|   # One human-readable line per unhealthy queue. | ||
|   def queue_lines(qs) do | ||
|     for q <- qs, do: "Leader for #{q["readable_name"]} is unhealthy and unavailable" | ||
|   end | ||
| end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Timeouts lower than 5s are guaranteed to result in false positives.