Skip to content

Commit f69c082

Browse files
committed
rabbit_feature_flags: New check_node_compatibility/2 variant
... that considers the local node as if it was reset. [Why] When a node joins a cluster, we check its compatibility with the cluster, reset the node, copy the feature flag states from the remote cluster and add that node to the cluster. However, the compatibility check is performed with the current feature flag states, even though they are about to be reset. Therefore, a node with an enabled feature flag that is unsupported by the cluster will refuse to join. This is incorrect because, after the reset and the copy of the states, it could have joined the cluster just fine. [How] We introduce a new variant of `check_node_compatibility/2` that takes an argument to indicate if the local node should be considered as a virgin node (i.e. like after a reset). This way, the joining node will always be able to join, regardless of its initial feature flag states, as long as it doesn't require a feature flag that is unsupported by the cluster. This also removes the need to use the `$RABBITMQ_FEATURE_FLAGS` environment variable to force a new node to leave stable feature flags disabled to allow it to join a cluster running an older version. References #9677.
1 parent 30ab653 commit f69c082

File tree

4 files changed

+150
-19
lines changed

4 files changed

+150
-19
lines changed

deps/rabbit/src/rabbit_db_cluster.erl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ can_join(RemoteNode) ->
5757
"DB: checking if `~ts` can join cluster using remote node `~ts`",
5858
[node(), RemoteNode],
5959
#{domain => ?RMQLOG_DOMAIN_DB}),
60-
case rabbit_feature_flags:check_node_compatibility(RemoteNode) of
60+
case rabbit_feature_flags:check_node_compatibility(RemoteNode, true) of
6161
ok ->
6262
case rabbit_khepri:is_enabled(RemoteNode) of
6363
true -> can_join_using_khepri(RemoteNode);

deps/rabbit/src/rabbit_feature_flags.erl

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
init/0,
104104
get_state/1,
105105
get_stability/1,
106-
check_node_compatibility/1,
106+
check_node_compatibility/1, check_node_compatibility/2,
107107
sync_feature_flags_with_cluster/2,
108108
refresh_feature_flags_after_app_load/0,
109109
enabled_feature_flags_list_file/0
@@ -1302,7 +1302,9 @@ does_node_support(Node, FeatureNames, Timeout) ->
13021302
false
13031303
end.
13041304

1305-
-spec check_node_compatibility(node()) -> ok | {error, any()}.
1305+
-spec check_node_compatibility(RemoteNode) -> Ret when
1306+
RemoteNode :: node(),
1307+
Ret :: ok | {error, any()}.
13061308
%% @doc
13071309
%% Checks if a node is compatible with the local node.
13081310
%%
@@ -1314,11 +1316,40 @@ does_node_support(Node, FeatureNames, Timeout) ->
13141316
%% local node</li>
13151317
%% </ol>
13161318
%%
1317-
%% @param Node the name of the remote node to test.
1319+
%% @param RemoteNode the name of the remote node to test.
1320+
%% @returns `ok' if they are compatible, `{error, Reason}' if they are not.
1321+
1322+
check_node_compatibility(RemoteNode) ->
1323+
check_node_compatibility(RemoteNode, false).
1324+
1325+
-spec check_node_compatibility(RemoteNode, LocalNodeAsVirgin) -> Ret when
1326+
RemoteNode :: node(),
1327+
LocalNodeAsVirgin :: boolean(),
1328+
Ret :: ok | {error, any()}.
1329+
%% @doc
1330+
%% Checks if a node is compatible with the local node.
1331+
%%
1332+
%% To be compatible, the following two conditions must be met:
1333+
%% <ol>
1334+
%% <li>feature flags enabled on the local node must be supported by the
1335+
%% remote node</li>
1336+
%% <li>feature flags enabled on the remote node must be supported by the
1337+
%% local node</li>
1338+
%% </ol>
1339+
%%
1340+
%% Unlike {@link check_node_compatibility/1}, the local node's feature flags
1341+
%% inventory is evaluated as if the node was virgin if `LocalNodeAsVirgin' is
1342+
%% true. This is useful if the local node will be reset as part of joining a
1343+
%% remote cluster for instance.
1344+
%%
1345+
%% @param RemoteNode the name of the remote node to test.
1346+
%% @param LocalNodeAsVirgin flag to indicate if the local node should be
1347+
%% evaluated as if it was virgin.
13181348
%% @returns `ok' if they are compatible, `{error, Reason}' if they are not.
13191349

1320-
check_node_compatibility(Node) ->
1321-
rabbit_ff_controller:check_node_compatibility(Node).
1350+
check_node_compatibility(RemoteNode, LocalNodeAsVirgin) ->
1351+
rabbit_ff_controller:check_node_compatibility(
1352+
RemoteNode, LocalNodeAsVirgin).
13221353

13231354
run_feature_flags_mod_on_remote_node(Node, Function, Args, Timeout) ->
13241355
rabbit_ff_controller:rpc_call(Node, ?MODULE, Function, Args, Timeout).

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
-export([is_supported/1, is_supported/2,
3737
enable/1,
3838
enable_default/0,
39-
check_node_compatibility/1,
39+
check_node_compatibility/2,
4040
sync_cluster/1,
4141
refresh_after_app_load/0,
4242
get_forced_feature_flag_names/0]).
@@ -134,20 +134,30 @@ enable_default() ->
134134
Ret
135135
end.
136136

137-
check_node_compatibility(RemoteNode) ->
137+
check_node_compatibility(RemoteNode, LocalNodeAsVirgin) ->
138138
ThisNode = node(),
139-
?LOG_DEBUG(
140-
"Feature flags: CHECKING COMPATIBILITY between nodes `~ts` and `~ts`",
141-
[ThisNode, RemoteNode],
142-
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
139+
case LocalNodeAsVirgin of
140+
true ->
141+
?LOG_DEBUG(
142+
"Feature flags: CHECKING COMPATIBILITY between nodes `~ts` "
143+
"and `~ts`; consider node `~ts` as virgin",
144+
[ThisNode, RemoteNode, ThisNode],
145+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS});
146+
false ->
147+
?LOG_DEBUG(
148+
"Feature flags: CHECKING COMPATIBILITY between nodes `~ts` "
149+
"and `~ts`",
150+
[ThisNode, RemoteNode],
151+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS})
152+
end,
143153
%% We don't go through the controller process to check nodes compatibility
144154
%% because this function is used while `rabbit' is stopped usually.
145155
%%
146156
%% There is no benefit in starting a controller just for this check
147157
%% because it would not guarantee that the compatibility remains true after
148158
%% this function finishes and before the node starts and synchronizes
149159
%% feature flags.
150-
check_node_compatibility_task(ThisNode, RemoteNode).
160+
check_node_compatibility_task(ThisNode, RemoteNode, LocalNodeAsVirgin).
151161

152162
sync_cluster(Nodes) ->
153163
?LOG_DEBUG(
@@ -382,12 +392,14 @@ notify_waiting_controller({ControlerPid, _} = From) ->
382392
%% Code to check compatibility between nodes.
383393
%% --------------------------------------------------------------------
384394

385-
-spec check_node_compatibility_task(Node, Node) -> Ret when
386-
Node :: node(),
395+
-spec check_node_compatibility_task(NodeA, NodeB, NodeAAsVirigin) -> Ret when
396+
NodeA :: node(),
397+
NodeB :: node(),
398+
NodeAAsVirigin :: boolean(),
387399
Ret :: ok | {error, Reason},
388400
Reason :: incompatible_feature_flags.
389401

390-
check_node_compatibility_task(NodeA, NodeB) ->
402+
check_node_compatibility_task(NodeA, NodeB, NodeAAsVirigin) ->
391403
?LOG_NOTICE(
392404
"Feature flags: checking nodes `~ts` and `~ts` compatibility...",
393405
[NodeA, NodeB],
@@ -400,7 +412,8 @@ check_node_compatibility_task(NodeA, NodeB) ->
400412
_ when is_list(NodesB) ->
401413
check_node_compatibility_task1(
402414
NodeA, NodesA,
403-
NodeB, NodesB);
415+
NodeB, NodesB,
416+
NodeAAsVirigin);
404417
Error ->
405418
?LOG_WARNING(
406419
"Feature flags: "
@@ -419,10 +432,12 @@ check_node_compatibility_task(NodeA, NodeB) ->
419432
{error, {aborted_feature_flags_compat_check, Error}}
420433
end.
421434

422-
check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB)
435+
check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB, NodeAAsVirigin)
423436
when is_list(NodesA) andalso is_list(NodesB) ->
424437
case collect_inventory_on_nodes(NodesA) of
425-
{ok, InventoryA} ->
438+
{ok, InventoryA0} ->
439+
InventoryA = virtually_reset_inventory(
440+
InventoryA0, NodeAAsVirigin),
426441
?LOG_DEBUG(
427442
"Feature flags: inventory of node `~ts`:~n~tp",
428443
[NodeA, InventoryA],
@@ -488,6 +503,42 @@ list_nodes_clustered_with(Node) ->
488503
ListOrError -> ListOrError
489504
end.
490505

506+
virtually_reset_inventory(
507+
#{feature_flags := FeatureFlags,
508+
states_per_node := StatesPerNode} = Inventory,
509+
true = _NodeAsVirgin) ->
510+
[Node | _] = maps:keys(StatesPerNode),
511+
FeatureStates0 = maps:get(Node, StatesPerNode),
512+
FeatureStates1 = maps:map(
513+
fun(FeatureName, _FeatureState) ->
514+
FeatureProps = maps:get(
515+
FeatureName, FeatureFlags),
516+
state_after_virtual_state(
517+
FeatureName, FeatureProps)
518+
end, FeatureStates0),
519+
StatesPerNode1 = maps:map(
520+
fun(_Node, _FeatureStates) ->
521+
FeatureStates1
522+
end, StatesPerNode),
523+
Inventory1 = Inventory#{states_per_node => StatesPerNode1},
524+
Inventory1;
525+
virtually_reset_inventory(
526+
Inventory,
527+
false = _NodeAsVirgin) ->
528+
Inventory.
529+
530+
state_after_virtual_state(_FeatureName, FeatureProps)
531+
when ?IS_FEATURE_FLAG(FeatureProps) ->
532+
Stability = rabbit_feature_flags:get_stability(FeatureProps),
533+
case Stability of
534+
required -> true;
535+
_ -> false
536+
end;
537+
state_after_virtual_state(FeatureName, FeatureProps)
538+
when ?IS_DEPRECATION(FeatureProps) ->
539+
not rabbit_deprecated_features:should_be_permitted(
540+
FeatureName, FeatureProps).
541+
491542
-spec are_compatible(Inventory, Inventory) -> AreCompatible when
492543
Inventory :: rabbit_feature_flags:cluster_inventory(),
493544
AreCompatible :: boolean().

deps/rabbit/test/feature_flags_v2_SUITE.erl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
failed_enable_feature_flag_with_post_enable/1,
5050
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1,
5151
have_required_feature_flag_in_cluster_and_add_member_without_it/1,
52+
have_unknown_feature_flag_in_cluster_and_add_member_with_it_enabled/1,
5253
error_during_migration_after_initial_success/1,
5354
controller_waits_for_own_task_to_finish_before_exiting/1,
5455
controller_waits_for_remote_task_to_finish_before_exiting/1
@@ -98,6 +99,7 @@ groups() ->
9899
failed_enable_feature_flag_with_post_enable,
99100
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled,
100101
have_required_feature_flag_in_cluster_and_add_member_without_it,
102+
have_unknown_feature_flag_in_cluster_and_add_member_with_it_enabled,
101103
error_during_migration_after_initial_success,
102104
controller_waits_for_own_task_to_finish_before_exiting,
103105
controller_waits_for_remote_task_to_finish_before_exiting
@@ -1506,6 +1508,53 @@ have_required_feature_flag_in_cluster_and_add_member_without_it(
15061508
|| Node <- AllNodes],
15071509
ok.
15081510

1511+
have_unknown_feature_flag_in_cluster_and_add_member_with_it_enabled(
1512+
Config) ->
1513+
[NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config),
1514+
connect_nodes(Nodes),
1515+
override_running_nodes([NewNode]),
1516+
override_running_nodes(Nodes),
1517+
1518+
FeatureName = ?FUNCTION_NAME,
1519+
FeatureFlags = #{FeatureName =>
1520+
#{provided_by => rabbit,
1521+
stability => stable}},
1522+
?assertEqual(ok, inject_on_nodes([NewNode], FeatureFlags)),
1523+
1524+
ct:pal(
1525+
"Checking the feature flag is unsupported on the cluster but enabled on "
1526+
"the standalone node"),
1527+
ok = run_on_node(
1528+
NewNode,
1529+
fun() ->
1530+
?assertEqual(ok, rabbit_feature_flags:enable(FeatureName)),
1531+
?assert(rabbit_feature_flags:is_enabled(FeatureName)),
1532+
ok
1533+
end,
1534+
[]),
1535+
_ = [ok =
1536+
run_on_node(
1537+
Node,
1538+
fun() ->
1539+
?assertNot(rabbit_feature_flags:is_supported(FeatureName)),
1540+
?assertNot(rabbit_feature_flags:is_enabled(FeatureName)),
1541+
ok
1542+
end,
1543+
[])
1544+
|| Node <- Nodes],
1545+
1546+
%% Check compatibility between NewNode and Nodes.
1547+
ok = run_on_node(
1548+
NewNode,
1549+
fun() ->
1550+
?assertEqual(
1551+
ok,
1552+
rabbit_feature_flags:check_node_compatibility(
1553+
FirstNode, true)),
1554+
ok
1555+
end, []),
1556+
ok.
1557+
15091558
error_during_migration_after_initial_success(Config) ->
15101559
AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config),
15111560
connect_nodes(Nodes),

0 commit comments

Comments
 (0)