-
Notifications
You must be signed in to change notification settings - Fork 4k
Add opt in initial check run #14087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add opt in initial check run #14087
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,7 +40,7 @@ | |
| %% Boot steps. | ||
| -export([update_cluster_tags/0, maybe_insert_default_data/0, boot_delegate/0, recover/0, | ||
| pg_local_amqp_session/0, | ||
| pg_local_amqp_connection/0]). | ||
| pg_local_amqp_connection/0, check_initial_run/0]). | ||
|
|
||
| -rabbit_boot_step({pre_boot, [{description, "rabbit boot start"}]}). | ||
|
|
||
|
|
@@ -199,10 +199,16 @@ | |
| {requires, [core_initialized]}, | ||
| {enables, routing_ready}]}). | ||
|
|
||
| -rabbit_boot_step({initial_run_check, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd also rename this to |
||
| [{description, "check if this is the first time the node starts"}, | ||
| {mfa, {?MODULE, check_initial_run, []}}, | ||
| {requires, recovery}, | ||
| {enables, empty_db_check}]}). | ||
|
|
||
| -rabbit_boot_step({empty_db_check, | ||
| [{description, "empty DB check"}, | ||
| {mfa, {?MODULE, maybe_insert_default_data, []}}, | ||
| {requires, recovery}, | ||
| {requires, initial_run_check}, | ||
| {enables, routing_ready}]}). | ||
|
|
||
|
|
||
|
|
@@ -235,6 +241,7 @@ | |
| {requires, [core_initialized, recovery]}, | ||
| {enables, routing_ready}]}). | ||
|
|
||
|
|
||
| -rabbit_boot_step({pre_flight, | ||
| [{description, "ready to communicate with peers and clients"}, | ||
| {requires, [core_initialized, recovery, routing_ready]}]}). | ||
|
|
@@ -1151,6 +1158,44 @@ update_cluster_tags() -> | |
| #{domain => ?RMQLOG_DOMAIN_GLOBAL}), | ||
| rabbit_runtime_parameters:set_global(cluster_tags, Tags, <<"internal_user">>). | ||
|
|
||
|
|
||
| -spec check_initial_run() -> 'ok' | no_return(). | ||
|
|
||
| check_initial_run() -> | ||
|
||
| case application:get_env(rabbit, verify_initial_run, false) of | ||
| false -> | ||
| %% Feature is disabled, skip the check | ||
| ?LOG_DEBUG("Initial run verification is disabled", | ||
| #{domain => ?RMQLOG_DOMAIN_GLOBAL}), | ||
| ok; | ||
| true -> | ||
| %% Feature is enabled, perform the check | ||
| DataDir = data_dir(), | ||
| MarkerFile = filename:join(DataDir, "node_initialized.marker"), | ||
| case filelib:is_file(MarkerFile) of | ||
| true -> | ||
| %% Not the first run, check if tables need default data | ||
| case rabbit_table:needs_default_data() of | ||
| true -> | ||
| ?LOG_ERROR("Node has already been initialized, but database appears empty. " | ||
| "This could indicate data loss or a split-brain scenario.", | ||
| #{domain => ?RMQLOG_DOMAIN_GLOBAL}), | ||
| throw({error, cluster_already_initialized_but_tables_empty}); | ||
| false -> | ||
| ?LOG_INFO("Node has already been initialized, proceeding with normal startup", | ||
| #{domain => ?RMQLOG_DOMAIN_GLOBAL}), | ||
| ok | ||
| end; | ||
| false -> | ||
| %% First time starting, create the marker file | ||
| ?LOG_INFO("First node startup detected, creating initialization marker", | ||
| #{domain => ?RMQLOG_DOMAIN_GLOBAL}), | ||
| ok = filelib:ensure_dir(MarkerFile), | ||
| ok = file:write_file(MarkerFile, <<>>, [exclusive]), % Empty file. | ||
| ok | ||
| end | ||
| end. | ||
|
|
||
| -spec maybe_insert_default_data() -> 'ok'. | ||
|
|
||
| maybe_insert_default_data() -> | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,168 @@ | ||
| %% This Source Code Form is subject to the terms of the Mozilla Public | ||
| %% License, v. 2.0. If a copy of the MPL was not distributed with this | ||
| %% file, You can obtain one at https://mozilla.org/MPL/2.0/. | ||
| %% | ||
| %% Copyright (c) 2007-2024 Broadcom. All Rights Reserved. The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. All rights reserved. | ||
| %% | ||
|
|
||
| %% Test suite for the verify_initial_run feature. | ||
| %% This feature helps detect potential data loss scenarios by maintaining | ||
| %% a marker file to track if a node has been initialized before. | ||
|
|
||
| -module(node_initial_run_SUITE). | ||
|
|
||
| -include_lib("common_test/include/ct.hrl"). | ||
| -include_lib("eunit/include/eunit.hrl"). | ||
|
|
||
| -compile(export_all). | ||
|
|
||
| all() -> | ||
| [ | ||
| {group, single_node_mnesia}, | ||
| {group, single_node_khepri} | ||
| ]. | ||
|
|
||
| groups() -> | ||
| [ | ||
| {single_node_mnesia, [], [ | ||
| verify_initial_run_disabled, | ||
| verify_initial_run_enabled | ||
| ]}, | ||
| {single_node_khepri, [], [ | ||
| verify_initial_run_disabled, | ||
| verify_initial_run_enabled | ||
| ]} | ||
| ]. | ||
|
|
||
| %% ------------------------------------------------------------------- | ||
| %% Testsuite setup/teardown. | ||
| %% ------------------------------------------------------------------- | ||
|
|
||
| init_per_suite(Config) -> | ||
| rabbit_ct_helpers:log_environment(), | ||
| rabbit_ct_helpers:run_setup_steps(Config). | ||
|
|
||
| end_per_suite(Config) -> | ||
| rabbit_ct_helpers:run_teardown_steps(Config). | ||
|
|
||
| init_per_group(Groupname, Config) -> | ||
| Config0 = rabbit_ct_helpers:set_config(Config, [ | ||
| {metadata_store, meta_store(Groupname)}, | ||
| {rmq_nodes_clustered, false}, | ||
| {rmq_nodename_suffix, Groupname}, | ||
| {rmq_nodes_count, 1} | ||
| ]), | ||
| rabbit_ct_helpers:run_steps( | ||
| Config0, | ||
| rabbit_ct_broker_helpers:setup_steps() ++ | ||
| rabbit_ct_client_helpers:setup_steps() | ||
| ). | ||
|
|
||
| end_per_group(_, Config) -> | ||
| rabbit_ct_helpers:run_steps( | ||
| Config, | ||
| rabbit_ct_client_helpers:teardown_steps() ++ | ||
| rabbit_ct_broker_helpers:teardown_steps() | ||
| ). | ||
|
|
||
| init_per_testcase(Testcase, Config) -> | ||
| rabbit_ct_helpers:testcase_started(Config, Testcase), | ||
| Config. | ||
|
|
||
| end_per_testcase(Testcase, Config) -> | ||
| rabbit_ct_helpers:testcase_finished(Config, Testcase). | ||
|
|
||
| %% ------------------------------------------------------------------- | ||
| %% Test cases | ||
| %% ------------------------------------------------------------------- | ||
|
|
||
| verify_initial_run_disabled(Config) -> | ||
| % When feature is disabled (default), node should start normally | ||
| DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir), | ||
| MarkerFile = filename:join(DataDir, "node_initialized.marker"), | ||
| % Setting is disabled so no marker file should be present | ||
| ?assertNot(filelib:is_file(MarkerFile)), | ||
|
|
||
| % Restarting the node should work fine | ||
| ok = stop_app(Config), | ||
| set_env(Config, false), | ||
| ok = start_app(Config), | ||
| % Still no marker file | ||
| ?assertNot(filelib:is_file(MarkerFile)), | ||
| ok. | ||
|
|
||
| verify_initial_run_enabled(Config) -> | ||
| DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir), | ||
| MarkerFile = filename:join(DataDir, "node_initialized.marker"), | ||
|
|
||
| ok = stop_app(Config), | ||
| set_env(Config, true), | ||
| ok = start_app(Config), | ||
| % Setting is enabled so marker file should be present after initial startup | ||
| ?assert(filelib:is_file(MarkerFile)), | ||
|
|
||
| % Restarting the node should be fine, as there is a marker file | ||
| % and corresponding schema data (consistent state) | ||
|
|
||
| ok = stop_app(Config), | ||
| ok = start_app(Config), | ||
|
|
||
| SchemaFile = schema_file(Config), | ||
|
|
||
| ?assert(filelib:is_file(MarkerFile)), | ||
|
|
||
| % Stop the node and remove the present schema to simulate data loss | ||
| ok = stop_app(Config), | ||
| file:delete(SchemaFile), | ||
| % Node should fail to start because marker exists but schema is missing, | ||
| % indicating potential data loss or corruption | ||
| ?assertMatch( | ||
| {error, 69, _}, | ||
| start_app(Config) | ||
| ), | ||
| ok. | ||
|
|
||
| %% ------------------------------------------------------------------- | ||
| %% Internal helpers | ||
| %% ------------------------------------------------------------------- | ||
|
|
||
| stop_app(Config) -> | ||
| Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename), | ||
| case rabbit_ct_broker_helpers:rabbitmqctl(Config, Node, ["stop_app"]) of | ||
| {ok, _} -> ok; | ||
| Error -> Error | ||
| end. | ||
|
|
||
| start_app(Config) -> | ||
| Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename), | ||
| case rabbit_ct_broker_helpers:rabbitmqctl(Config, Node, ["start_app"]) of | ||
| {ok, _} -> ok; | ||
| Error -> Error | ||
| end. | ||
|
|
||
| maybe_enable_verify_initial_run(Config, verify_initial_run_enabled) -> | ||
| rabbit_ct_helpers:merge_app_env( | ||
| Config, {rabbit, [{verify_initial_run, true}]} | ||
| ); | ||
| maybe_enable_verify_initial_run(Config, _) -> | ||
| Config. | ||
|
|
||
| meta_store(single_node_mnesia) -> | ||
| mnesia; | ||
| meta_store(single_node_khepri) -> | ||
| khepri. | ||
|
|
||
| schema_file(Config) -> | ||
| DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir), | ||
| MetaStore = rabbit_ct_helpers:get_config(Config, metadata_store), | ||
| case MetaStore of | ||
| mnesia -> | ||
| filename:join(DataDir, "schema.DAT"); | ||
| khepri -> | ||
| NodeName = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename), | ||
| filename:join([DataDir, "coordination", NodeName, "names.dets"]) | ||
| end. | ||
|
|
||
| set_env(Config, Bool) -> | ||
| Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename), | ||
| ok = rpc:call(Node, application, set_env, [rabbit, verify_initial_run, Bool]). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we rename this to
prevent_startup_if_node_was_reset? I have considered many names, such astrack_initial_run_and_seedingenforce_initialized_databaseand a few others but the most specific option is the one that describes the end goal: prevent a previously reset node from booting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, will do