Skip to content

Commit 6855ebc

Browse files
authored
Merge pull request #9729 from rabbitmq/relax-feature-flag-compat-check-during-join_cluster
Relax feature flag compat check during join cluster
2 parents 297981e + f69c082 commit 6855ebc

File tree

6 files changed

+186
-29
lines changed

6 files changed

+186
-29
lines changed

deps/rabbit/src/rabbit_db_cluster.erl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ can_join(RemoteNode) ->
5757
"DB: checking if `~ts` can join cluster using remote node `~ts`",
5858
[node(), RemoteNode],
5959
#{domain => ?RMQLOG_DOMAIN_DB}),
60-
case rabbit_feature_flags:check_node_compatibility(RemoteNode) of
60+
case rabbit_feature_flags:check_node_compatibility(RemoteNode, true) of
6161
ok ->
6262
case rabbit_khepri:is_enabled(RemoteNode) of
6363
true -> can_join_using_khepri(RemoteNode);

deps/rabbit/src/rabbit_feature_flags.erl

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
init/0,
104104
get_state/1,
105105
get_stability/1,
106-
check_node_compatibility/1,
106+
check_node_compatibility/1, check_node_compatibility/2,
107107
sync_feature_flags_with_cluster/2,
108108
refresh_feature_flags_after_app_load/0,
109109
enabled_feature_flags_list_file/0
@@ -1302,7 +1302,9 @@ does_node_support(Node, FeatureNames, Timeout) ->
13021302
false
13031303
end.
13041304

1305-
-spec check_node_compatibility(node()) -> ok | {error, any()}.
1305+
-spec check_node_compatibility(RemoteNode) -> Ret when
1306+
RemoteNode :: node(),
1307+
Ret :: ok | {error, any()}.
13061308
%% @doc
13071309
%% Checks if a node is compatible with the local node.
13081310
%%
@@ -1314,11 +1316,40 @@ does_node_support(Node, FeatureNames, Timeout) ->
13141316
%% local node</li>
13151317
%% </ol>
13161318
%%
1317-
%% @param Node the name of the remote node to test.
1319+
%% @param RemoteNode the name of the remote node to test.
1320+
%% @returns `ok' if they are compatible, `{error, Reason}' if they are not.
1321+
1322+
check_node_compatibility(RemoteNode) ->
1323+
check_node_compatibility(RemoteNode, false).
1324+
1325+
-spec check_node_compatibility(RemoteNode, LocalNodeAsVirgin) -> Ret when
1326+
RemoteNode :: node(),
1327+
LocalNodeAsVirgin :: boolean(),
1328+
Ret :: ok | {error, any()}.
1329+
%% @doc
1330+
%% Checks if a node is compatible with the local node.
1331+
%%
1332+
%% To be compatible, the following two conditions must be met:
1333+
%% <ol>
1334+
%% <li>feature flags enabled on the local node must be supported by the
1335+
%% remote node</li>
1336+
%% <li>feature flags enabled on the remote node must be supported by the
1337+
%% local node</li>
1338+
%% </ol>
1339+
%%
1340+
%% Unlike {@link check_node_compatibility/1}, the local node's feature flags
1341+
%% inventory is evaluated as if the node was virgin if `LocalNodeAsVirgin' is
1342+
%% true. This is useful if the local node will be reset as part of joining a
1343+
%% remote cluster for instance.
1344+
%%
1345+
%% @param RemoteNode the name of the remote node to test.
1346+
%% @param LocalNodeAsVirgin flag to indicate if the local node should be
1347+
%% evaluated as if it was virgin.
13181348
%% @returns `ok' if they are compatible, `{error, Reason}' if they are not.
13191349

1320-
check_node_compatibility(Node) ->
1321-
rabbit_ff_controller:check_node_compatibility(Node).
1350+
check_node_compatibility(RemoteNode, LocalNodeAsVirgin) ->
1351+
rabbit_ff_controller:check_node_compatibility(
1352+
RemoteNode, LocalNodeAsVirgin).
13221353

13231354
run_feature_flags_mod_on_remote_node(Node, Function, Args, Timeout) ->
13241355
rabbit_ff_controller:rpc_call(Node, ?MODULE, Function, Args, Timeout).

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
-export([is_supported/1, is_supported/2,
3737
enable/1,
3838
enable_default/0,
39-
check_node_compatibility/1,
39+
check_node_compatibility/2,
4040
sync_cluster/1,
4141
refresh_after_app_load/0,
4242
get_forced_feature_flag_names/0]).
@@ -134,20 +134,30 @@ enable_default() ->
134134
Ret
135135
end.
136136

137-
check_node_compatibility(RemoteNode) ->
137+
check_node_compatibility(RemoteNode, LocalNodeAsVirgin) ->
138138
ThisNode = node(),
139-
?LOG_DEBUG(
140-
"Feature flags: CHECKING COMPATIBILITY between nodes `~ts` and `~ts`",
141-
[ThisNode, RemoteNode],
142-
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
139+
case LocalNodeAsVirgin of
140+
true ->
141+
?LOG_DEBUG(
142+
"Feature flags: CHECKING COMPATIBILITY between nodes `~ts` "
143+
"and `~ts`; consider node `~ts` as virgin",
144+
[ThisNode, RemoteNode, ThisNode],
145+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS});
146+
false ->
147+
?LOG_DEBUG(
148+
"Feature flags: CHECKING COMPATIBILITY between nodes `~ts` "
149+
"and `~ts`",
150+
[ThisNode, RemoteNode],
151+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS})
152+
end,
143153
%% We don't go through the controller process to check nodes compatibility
144154
%% because this function is used while `rabbit' is stopped usually.
145155
%%
146156
%% There is no benefit in starting a controller just for this check
147157
%% because it would not guarantee that the compatibility remains true after
148158
%% this function finishes and before the node starts and synchronizes
149159
%% feature flags.
150-
check_node_compatibility_task(ThisNode, RemoteNode).
160+
check_node_compatibility_task(ThisNode, RemoteNode, LocalNodeAsVirgin).
151161

152162
sync_cluster(Nodes) ->
153163
?LOG_DEBUG(
@@ -382,12 +392,14 @@ notify_waiting_controller({ControlerPid, _} = From) ->
382392
%% Code to check compatibility between nodes.
383393
%% --------------------------------------------------------------------
384394

385-
-spec check_node_compatibility_task(Node, Node) -> Ret when
386-
Node :: node(),
395+
-spec check_node_compatibility_task(NodeA, NodeB, NodeAAsVirigin) -> Ret when
396+
NodeA :: node(),
397+
NodeB :: node(),
398+
NodeAAsVirigin :: boolean(),
387399
Ret :: ok | {error, Reason},
388400
Reason :: incompatible_feature_flags.
389401

390-
check_node_compatibility_task(NodeA, NodeB) ->
402+
check_node_compatibility_task(NodeA, NodeB, NodeAAsVirigin) ->
391403
?LOG_NOTICE(
392404
"Feature flags: checking nodes `~ts` and `~ts` compatibility...",
393405
[NodeA, NodeB],
@@ -400,7 +412,8 @@ check_node_compatibility_task(NodeA, NodeB) ->
400412
_ when is_list(NodesB) ->
401413
check_node_compatibility_task1(
402414
NodeA, NodesA,
403-
NodeB, NodesB);
415+
NodeB, NodesB,
416+
NodeAAsVirigin);
404417
Error ->
405418
?LOG_WARNING(
406419
"Feature flags: "
@@ -419,10 +432,12 @@ check_node_compatibility_task(NodeA, NodeB) ->
419432
{error, {aborted_feature_flags_compat_check, Error}}
420433
end.
421434

422-
check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB)
435+
check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB, NodeAAsVirigin)
423436
when is_list(NodesA) andalso is_list(NodesB) ->
424437
case collect_inventory_on_nodes(NodesA) of
425-
{ok, InventoryA} ->
438+
{ok, InventoryA0} ->
439+
InventoryA = virtually_reset_inventory(
440+
InventoryA0, NodeAAsVirigin),
426441
?LOG_DEBUG(
427442
"Feature flags: inventory of node `~ts`:~n~tp",
428443
[NodeA, InventoryA],
@@ -488,6 +503,42 @@ list_nodes_clustered_with(Node) ->
488503
ListOrError -> ListOrError
489504
end.
490505

506+
virtually_reset_inventory(
507+
#{feature_flags := FeatureFlags,
508+
states_per_node := StatesPerNode} = Inventory,
509+
true = _NodeAsVirgin) ->
510+
[Node | _] = maps:keys(StatesPerNode),
511+
FeatureStates0 = maps:get(Node, StatesPerNode),
512+
FeatureStates1 = maps:map(
513+
fun(FeatureName, _FeatureState) ->
514+
FeatureProps = maps:get(
515+
FeatureName, FeatureFlags),
516+
state_after_virtual_state(
517+
FeatureName, FeatureProps)
518+
end, FeatureStates0),
519+
StatesPerNode1 = maps:map(
520+
fun(_Node, _FeatureStates) ->
521+
FeatureStates1
522+
end, StatesPerNode),
523+
Inventory1 = Inventory#{states_per_node => StatesPerNode1},
524+
Inventory1;
525+
virtually_reset_inventory(
526+
Inventory,
527+
false = _NodeAsVirgin) ->
528+
Inventory.
529+
530+
state_after_virtual_state(_FeatureName, FeatureProps)
531+
when ?IS_FEATURE_FLAG(FeatureProps) ->
532+
Stability = rabbit_feature_flags:get_stability(FeatureProps),
533+
case Stability of
534+
required -> true;
535+
_ -> false
536+
end;
537+
state_after_virtual_state(FeatureName, FeatureProps)
538+
when ?IS_DEPRECATION(FeatureProps) ->
539+
not rabbit_deprecated_features:should_be_permitted(
540+
FeatureName, FeatureProps).
541+
491542
-spec are_compatible(Inventory, Inventory) -> AreCompatible when
492543
Inventory :: rabbit_feature_flags:cluster_inventory(),
493544
AreCompatible :: boolean().

deps/rabbit/src/rabbit_khepri.erl

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -871,10 +871,7 @@ check_cluster_consistency(Node, CheckNodesConsistency) ->
871871
Error
872872
end;
873873
{_OTP, _Rabbit, {ok, Status}} ->
874-
case rabbit_db_cluster:check_compatibility(Node) of
875-
ok -> {ok, Status};
876-
Error -> Error
877-
end
874+
{ok, Status}
878875
end.
879876

880877
remote_node_info(Node) ->

deps/rabbit/src/rabbit_mnesia.erl

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,24 @@ cluster_nodes(WhichNodes) -> cluster_status(WhichNodes).
407407
cluster_status_from_mnesia() ->
408408
case is_running() of
409409
false ->
410-
{error, mnesia_not_running};
410+
case rabbit_khepri:get_feature_state() of
411+
enabled ->
412+
%% To keep this API compatible with older remote nodes who
413+
%% don't know about Khepri, we take the cluster status
414+
%% from `rabbit_khepri' and reformat the return value to
415+
%% resemble the one from this module.
416+
%%
417+
%% Both nodes won't be compatible, but let's leave that
418+
%% decision to the Feature flags subsystem.
419+
case rabbit_khepri:cluster_status_from_khepri() of
420+
{ok, {All, Running}} ->
421+
{ok, {All, All, Running}};
422+
{error, _} = Error ->
423+
Error
424+
end;
425+
_ ->
426+
{error, mnesia_not_running}
427+
end;
411428
true ->
412429
%% If the tables are not present, it means that
413430
%% `init_db/3' hasn't been run yet. In other words, either
@@ -475,8 +492,23 @@ members() ->
475492
end.
476493

477494
node_info() ->
495+
%% Once Khepri is enabled, the Mnesia protocol is irrelevant obviously.
496+
%%
497+
%% That said, older remote nodes who don't know about Khepri will request
498+
%% this information anyway as part of calling `node_info/0'. Here, we
499+
%% simply return `unsupported' as the Mnesia protocol. Older versions of
500+
%% RabbitMQ will skip the protocol negotiation and use other ways.
501+
%%
502+
%% The goal is mostly to let older nodes which check Mnesia before feature
503+
%% flags reach the feature flags check. This one will correctly
504+
%% indicate that they are incompatible. That's why we return `unsupported'
505+
%% here, even if we could return the actual Mnesia protocol.
506+
MnesiaProtocol = case rabbit_khepri:get_feature_state() of
507+
enabled -> unsupported;
508+
_ -> mnesia:system_info(protocol_version)
509+
end,
478510
{rabbit_misc:otp_release(), rabbit_misc:version(),
479-
mnesia:system_info(protocol_version),
511+
MnesiaProtocol,
480512
cluster_status_from_mnesia()}.
481513

482514
-spec node_type() -> rabbit_db_cluster:node_type().
@@ -694,10 +726,7 @@ check_cluster_consistency(Node, CheckNodesConsistency) ->
694726
Error
695727
end;
696728
{_OTP, _Rabbit, _Protocol, {ok, Status}} ->
697-
case rabbit_db_cluster:check_compatibility(Node) of
698-
ok -> {ok, Status};
699-
Error -> Error
700-
end
729+
{ok, Status}
701730
end.
702731

703732
remote_node_info(Node) ->

deps/rabbit/test/feature_flags_v2_SUITE.erl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
failed_enable_feature_flag_with_post_enable/1,
5050
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1,
5151
have_required_feature_flag_in_cluster_and_add_member_without_it/1,
52+
have_unknown_feature_flag_in_cluster_and_add_member_with_it_enabled/1,
5253
error_during_migration_after_initial_success/1,
5354
controller_waits_for_own_task_to_finish_before_exiting/1,
5455
controller_waits_for_remote_task_to_finish_before_exiting/1
@@ -98,6 +99,7 @@ groups() ->
9899
failed_enable_feature_flag_with_post_enable,
99100
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled,
100101
have_required_feature_flag_in_cluster_and_add_member_without_it,
102+
have_unknown_feature_flag_in_cluster_and_add_member_with_it_enabled,
101103
error_during_migration_after_initial_success,
102104
controller_waits_for_own_task_to_finish_before_exiting,
103105
controller_waits_for_remote_task_to_finish_before_exiting
@@ -1506,6 +1508,53 @@ have_required_feature_flag_in_cluster_and_add_member_without_it(
15061508
|| Node <- AllNodes],
15071509
ok.
15081510

1511+
have_unknown_feature_flag_in_cluster_and_add_member_with_it_enabled(
1512+
Config) ->
1513+
[NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config),
1514+
connect_nodes(Nodes),
1515+
override_running_nodes([NewNode]),
1516+
override_running_nodes(Nodes),
1517+
1518+
FeatureName = ?FUNCTION_NAME,
1519+
FeatureFlags = #{FeatureName =>
1520+
#{provided_by => rabbit,
1521+
stability => stable}},
1522+
?assertEqual(ok, inject_on_nodes([NewNode], FeatureFlags)),
1523+
1524+
ct:pal(
1525+
"Checking the feature flag is unsupported on the cluster but enabled on "
1526+
"the standalone node"),
1527+
ok = run_on_node(
1528+
NewNode,
1529+
fun() ->
1530+
?assertEqual(ok, rabbit_feature_flags:enable(FeatureName)),
1531+
?assert(rabbit_feature_flags:is_enabled(FeatureName)),
1532+
ok
1533+
end,
1534+
[]),
1535+
_ = [ok =
1536+
run_on_node(
1537+
Node,
1538+
fun() ->
1539+
?assertNot(rabbit_feature_flags:is_supported(FeatureName)),
1540+
?assertNot(rabbit_feature_flags:is_enabled(FeatureName)),
1541+
ok
1542+
end,
1543+
[])
1544+
|| Node <- Nodes],
1545+
1546+
%% Check compatibility between NewNode and Nodes.
1547+
ok = run_on_node(
1548+
NewNode,
1549+
fun() ->
1550+
?assertEqual(
1551+
ok,
1552+
rabbit_feature_flags:check_node_compatibility(
1553+
FirstNode, true)),
1554+
ok
1555+
end, []),
1556+
ok.
1557+
15091558
error_during_migration_after_initial_success(Config) ->
15101559
AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config),
15111560
connect_nodes(Nodes),

0 commit comments

Comments
 (0)