Skip to content

Commit b6006e4

Browse files
committed
rabbit_feature_flags: Improve error reporting from compat. check
[Why]
So far, no matter what the error was, and no matter whether it was an actual incompatibility or something unrelated such as a timeout or an Erlang distribution failure, the `check_node_compatibility_task()` function always logged and reported the same "nodes are incompatible" message. This made it unclear what was wrong: are my two RabbitMQ nodes really incompatible, or was there a network issue?

[How]
Now, the function logs a more precise message explaining the source of the error. It also returns one of two distinct errors:
* `incompatible_feature_flags` for an actual incompatibility
* `aborted_feature_flags_compat_check`, plus the error term, for any error not coming from the Feature flags subsystem itself.

In the end, regardless of the error, the nodes will still be considered incompatible and possibly one of them will refuse to start. But now, the user should better understand why.

Reported-by: @dcorbacho
1 parent a9a96a4 commit b6006e4

File tree

1 file changed

+85
-37
lines changed

1 file changed

+85
-37
lines changed

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 85 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -342,51 +342,99 @@ check_node_compatibility_task(NodeA, NodeB) ->
342342
[NodeA, NodeB],
343343
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
344344
NodesA = list_nodes_clustered_with(NodeA),
345-
NodesB = list_nodes_clustered_with(NodeB),
346-
AreCompatible = case collect_inventory_on_nodes(NodesA) of
347-
{ok, InventoryA} ->
348-
?LOG_DEBUG(
349-
"Feature flags: inventory of node `~ts`:~n~tp",
350-
[NodeA, InventoryA],
351-
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
352-
case collect_inventory_on_nodes(NodesB) of
353-
{ok, InventoryB} ->
354-
?LOG_DEBUG(
355-
"Feature flags: inventory of node "
356-
"`~ts`:~n~tp",
357-
[NodeB, InventoryB],
358-
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
359-
are_compatible(InventoryA, InventoryB);
360-
_ ->
361-
false
362-
end;
363-
_ ->
364-
false
365-
end,
366-
case AreCompatible of
367-
true ->
368-
?LOG_NOTICE(
369-
"Feature flags: nodes `~ts` and `~ts` are compatible",
370-
[NodeA, NodeB],
345+
case NodesA of
346+
_ when is_list(NodesA) ->
347+
NodesB = list_nodes_clustered_with(NodeB),
348+
case NodesB of
349+
_ when is_list(NodesB) ->
350+
check_node_compatibility_task1(
351+
NodeA, NodesA,
352+
NodeB, NodesB);
353+
Error ->
354+
?LOG_WARNING(
355+
"Feature flags: "
356+
"error while querying cluster members from "
357+
"node `~ts`:~n~tp",
358+
[NodeB, Error],
359+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
360+
{error, {aborted_feature_flags_compat_check, Error}}
361+
end;
362+
Error ->
363+
?LOG_WARNING(
364+
"Feature flags: "
365+
"error while querying cluster members from node `~ts`:~n~tp",
366+
[NodeA, Error],
371367
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
372-
ok;
373-
false ->
368+
{error, {aborted_feature_flags_compat_check, Error}}
369+
end.
370+
371+
check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB)
372+
when is_list(NodesA) andalso is_list(NodesB) ->
373+
case collect_inventory_on_nodes(NodesA) of
374+
{ok, InventoryA} ->
375+
?LOG_DEBUG(
376+
"Feature flags: inventory of node `~ts`:~n~tp",
377+
[NodeA, InventoryA],
378+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
379+
case collect_inventory_on_nodes(NodesB) of
380+
{ok, InventoryB} ->
381+
?LOG_DEBUG(
382+
"Feature flags: inventory of node "
383+
"`~ts`:~n~tp",
384+
[NodeB, InventoryB],
385+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
386+
case are_compatible(InventoryA, InventoryB) of
387+
true ->
388+
?LOG_NOTICE(
389+
"Feature flags: "
390+
"nodes `~ts` and `~ts` are compatible",
391+
[NodeA, NodeB],
392+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
393+
ok;
394+
false ->
395+
?LOG_WARNING(
396+
"Feature flags: "
397+
"nodes `~ts` and `~ts` are incompatible",
398+
[NodeA, NodeB],
399+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
400+
{error, incompatible_feature_flags}
401+
end;
402+
Error ->
403+
?LOG_WARNING(
404+
"Feature flags: "
405+
"error while collecting inventory from "
406+
"nodes ~0tp:~n~tp",
407+
[NodesB, Error],
408+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
409+
{error, {aborted_feature_flags_compat_check, Error}}
410+
end;
411+
Error ->
374412
?LOG_WARNING(
375-
"Feature flags: nodes `~ts` and `~ts` are incompatible",
376-
[NodeA, NodeB],
413+
"Feature flags: "
414+
"error while collecting inventory from nodes ~0tp:~n~tp",
415+
[NodesA, Error],
377416
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
378-
{error, incompatible_feature_flags}
417+
{error, {aborted_feature_flags_compat_check, Error}}
379418
end.
380419

381-
-spec list_nodes_clustered_with(Node) -> [Node] when
382-
Node :: node().
420+
-spec list_nodes_clustered_with(Node) -> Ret when
421+
Node :: node(),
422+
Ret :: Members | Error,
423+
Members :: [node()],
424+
Error :: {error, term()}.
383425

384426
list_nodes_clustered_with(Node) ->
385-
%% If Mnesia is stopped on the given node, it will return an empty list.
386-
%% In this case, only consider that stopped node.
427+
%% If `running_nodes()' returns an empty list, it means the `rabbit'
428+
%% application is not running on `Node'. In this case, we consider this
429+
%% node alone for now.
430+
%%
431+
%% It could be that RabbitMQ is starting on that node for instance;
432+
%% indeed, feature flags compatibility is checked as part of RabbitMQ
433+
%% booting. If that's not the case, collecting the feature flags inventory
434+
%% later will fail anyway.
387435
case rpc_call(Node, ?MODULE, running_nodes, [], ?TIMEOUT) of
388-
[] -> [Node];
389-
List -> List
436+
[] -> [Node];
437+
ListOrError -> ListOrError
390438
end.
391439

392440
-spec are_compatible(Inventory, Inventory) -> AreCompatible when

0 commit comments

Comments
 (0)