rabbitmq · SimonUnge · Jun 12, 2025 · Jun 24, 2025 · michaelklishin · Jun 24, 2025
diff --git a/deps/rabbit/Makefile b/deps/rabbit/Makefile
@@ -264,7 +264,7 @@ PARALLEL_CT_SET_1_D = amqqueue_backward_compatibility channel_interceptor channe
 
 PARALLEL_CT_SET_2_A = cluster confirms_rejects consumer_timeout rabbit_access_control rabbit_confirms rabbit_core_metrics_gc rabbit_cuttlefish rabbit_db_binding rabbit_db_exchange
 PARALLEL_CT_SET_2_B = clustering_recovery crashing_queues deprecated_features direct_exchange_routing_v2 disconnect_detected_during_alarm exchanges unit_gen_server2
-PARALLEL_CT_SET_2_C = disk_monitor dynamic_qq unit_disk_monitor unit_file_handle_cache unit_log_management unit_operator_policy
+PARALLEL_CT_SET_2_C = disk_monitor dynamic_qq unit_disk_monitor unit_file_handle_cache unit_log_management unit_operator_policy node_initial_run
 PARALLEL_CT_SET_2_D = queue_length_limits queue_parallel quorum_queue_member_reconciliation rabbit_fifo rabbit_fifo_dlx rabbit_stream_coordinator
 
 PARALLEL_CT_SET_3_A = definition_import per_user_connection_channel_limit_partitions per_vhost_connection_limit_partitions policy priority_queue_recovery rabbit_fifo_v0 rabbit_stream_sac_coordinator unit_credit_flow unit_queue_consumers unit_queue_location unit_quorum_queue

diff --git a/deps/rabbit/ct.test.spec b/deps/rabbit/ct.test.spec
@@ -43,6 +43,7 @@
 ,   disk_monitor_SUITE
 ,   dynamic_qq_SUITE
 ,   exchanges_SUITE
+,   node_initial_run_SUITE
 ,   rabbit_stream_queue_SUITE
 ]}.
 

diff --git a/deps/rabbit/priv/schema/rabbit.schema b/deps/rabbit/priv/schema/rabbit.schema
@@ -1646,6 +1646,16 @@ end}.
     {datatype, string}
 ]}.
 
+
+%% Whether to verify if this is the first time a node starts.
+%% When enabled, nodes will create a marker file on first startup
+%% and refuse to start if the marker exists but tables are empty.
+%%
+
+{mapping, "verify_initial_run", "rabbit.verify_initial_run", [
+    {datatype, {enum, [true, false]}}
+]}.
+
 % ==========================
 % Logging section
 % ==========================

diff --git a/deps/rabbit/src/rabbit.erl b/deps/rabbit/src/rabbit.erl
@@ -40,7 +40,7 @@
 %% Boot steps.
 -export([update_cluster_tags/0, maybe_insert_default_data/0, boot_delegate/0, recover/0,
          pg_local_amqp_session/0,
-         pg_local_amqp_connection/0]).
+         pg_local_amqp_connection/0, check_initial_run/0]).
 
 -rabbit_boot_step({pre_boot, [{description, "rabbit boot start"}]}).
 
@@ -199,10 +199,16 @@
                     {requires,    [core_initialized]},
                     {enables,     routing_ready}]}).
 
+-rabbit_boot_step({initial_run_check,
+                   [{description, "check if this is the first time the node starts"},
+                    {mfa,         {?MODULE, check_initial_run, []}},
+                    {requires,    recovery},
+                    {enables,     empty_db_check}]}).
+
 -rabbit_boot_step({empty_db_check,
                    [{description, "empty DB check"},
                     {mfa,         {?MODULE, maybe_insert_default_data, []}},
-                    {requires,    recovery},
+                    {requires,    initial_run_check},
                     {enables,     routing_ready}]}).
 
 
@@ -235,6 +241,7 @@
                     {requires,    [core_initialized, recovery]},
                     {enables,     routing_ready}]}).
 
+
 -rabbit_boot_step({pre_flight,
                    [{description, "ready to communicate with peers and clients"},
                     {requires,    [core_initialized, recovery, routing_ready]}]}).
@@ -1151,6 +1158,44 @@ update_cluster_tags() ->
                        #{domain => ?RMQLOG_DOMAIN_GLOBAL}),
     rabbit_runtime_parameters:set_global(cluster_tags, Tags, <<"internal_user">>).
 
+
+-spec check_initial_run() -> 'ok' | no_return().
+
+check_initial_run() ->
+    case application:get_env(rabbit, verify_initial_run, false) of
+        false ->
+            %% Feature is disabled, skip the check
+            ?LOG_DEBUG("Initial run verification is disabled",
+                       #{domain => ?RMQLOG_DOMAIN_GLOBAL}),
+            ok;
+        true ->
+            %% Feature is enabled, perform the check
+            DataDir = data_dir(),
+            MarkerFile = filename:join(DataDir, "node_initialized.marker"),
+            case filelib:is_file(MarkerFile) of
+                true ->
+                    %% Not the first run, check if tables need default data
+                    case rabbit_table:needs_default_data() of
+                        true ->
+                            ?LOG_ERROR("Node has already been initialized, but database appears empty. "
+                                       "This could indicate data loss or a split-brain scenario.",
+                                      #{domain => ?RMQLOG_DOMAIN_GLOBAL}),
+                            throw({error, cluster_already_initialized_but_tables_empty});
+                        false ->
+                            ?LOG_INFO("Node has already been initialized, proceeding with normal startup",
+                                      #{domain => ?RMQLOG_DOMAIN_GLOBAL}),
+                            ok
+                    end;
+                false ->
+                    %% First time starting, create the marker file
+                    ?LOG_INFO("First node startup detected, creating initialization marker",
+                             #{domain => ?RMQLOG_DOMAIN_GLOBAL}),
+                    ok = filelib:ensure_dir(MarkerFile),
+                    ok = file:write_file(MarkerFile, <<>>, [exclusive]), % Empty file.
+                    ok
+            end
+    end.
+
 -spec maybe_insert_default_data() -> 'ok'.
 
 maybe_insert_default_data() ->

diff --git a/deps/rabbit/test/node_initial_run_SUITE.erl b/deps/rabbit/test/node_initial_run_SUITE.erl
@@ -0,0 +1,168 @@
+%% This Source Code Form is subject to the terms of the Mozilla Public
+%% License, v. 2.0. If a copy of the MPL was not distributed with this
+%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
+%%
+%% Copyright (c) 2007-2024 Broadcom. All Rights Reserved. The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
+%%
+
+%% Test suite for the verify_initial_run feature.
+%% This feature helps detect potential data loss scenarios by maintaining
+%% a marker file to track if a node has been initialized before.
+
+-module(node_initial_run_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+-include_lib("eunit/include/eunit.hrl").
+
+-compile(export_all).
+
+all() ->
+    [
+        {group, single_node_mnesia},
+        {group, single_node_khepri}
+    ].
+
+groups() ->
+    [
+        {single_node_mnesia, [], [
+            verify_initial_run_disabled,
+            verify_initial_run_enabled
+        ]},
+        {single_node_khepri, [], [
+            verify_initial_run_disabled,
+            verify_initial_run_enabled
+        ]}
+    ].
+
+%% -------------------------------------------------------------------
+%% Testsuite setup/teardown.
+%% -------------------------------------------------------------------
+
+init_per_suite(Config) ->
+    rabbit_ct_helpers:log_environment(),
+    rabbit_ct_helpers:run_setup_steps(Config).
+
+end_per_suite(Config) ->
+    rabbit_ct_helpers:run_teardown_steps(Config).
+
+init_per_group(Groupname, Config) ->
+    Config0 = rabbit_ct_helpers:set_config(Config, [
+        {metadata_store, meta_store(Groupname)},
+        {rmq_nodes_clustered, false},
+        {rmq_nodename_suffix, Groupname},
+        {rmq_nodes_count, 1}
+    ]),
+    rabbit_ct_helpers:run_steps(
+        Config0,
+        rabbit_ct_broker_helpers:setup_steps() ++
+            rabbit_ct_client_helpers:setup_steps()
+    ).
+
+end_per_group(_, Config) ->
+    rabbit_ct_helpers:run_steps(
+        Config,
+        rabbit_ct_client_helpers:teardown_steps() ++
+            rabbit_ct_broker_helpers:teardown_steps()
+    ).
+
+init_per_testcase(Testcase, Config) ->
+    rabbit_ct_helpers:testcase_started(Config, Testcase),
+    Config.
+
+end_per_testcase(Testcase, Config) ->
+    rabbit_ct_helpers:testcase_finished(Config, Testcase).
+
+%% -------------------------------------------------------------------
+%% Test cases
+%% -------------------------------------------------------------------
+
+verify_initial_run_disabled(Config) ->
+    % When feature is disabled (default), node should start normally
+    DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir),
+    MarkerFile = filename:join(DataDir, "node_initialized.marker"),
+    % Setting is disabled so no marker file should be present
+    ?assertNot(filelib:is_file(MarkerFile)),
+
+    % Restarting the node should work fine
+    ok = stop_app(Config),
+    set_env(Config, false),
+    ok = start_app(Config),
+    % Still no marker file
+    ?assertNot(filelib:is_file(MarkerFile)),
+    ok.
+
+verify_initial_run_enabled(Config) ->
+    DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir),
+    MarkerFile = filename:join(DataDir, "node_initialized.marker"),
+
+    ok = stop_app(Config),
+    set_env(Config, true),
+    ok = start_app(Config),
+    % Setting is enabled so marker file should be present after initial startup
+    ?assert(filelib:is_file(MarkerFile)),
+
+    % Restarting the node should be fine, as there is a marker file
+    % and corresponding schema data (consistent state)
+
+    ok = stop_app(Config),
+    ok = start_app(Config),
+
+    SchemaFile = schema_file(Config),
+
+    ?assert(filelib:is_file(MarkerFile)),
+
+    % Stop the node and remove the present schema to simulate data loss
+    ok = stop_app(Config),
+    file:delete(SchemaFile),
+    % Node should fail to start because marker exists but schema is missing,
+    % indicating potential data loss or corruption
+    ?assertMatch(
+        {error, 69, _},
+        start_app(Config)
+    ),
+    ok.
+
+%% -------------------------------------------------------------------
+%% Internal helpers
+%% -------------------------------------------------------------------
+
+stop_app(Config) ->
+    Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
+    case rabbit_ct_broker_helpers:rabbitmqctl(Config, Node, ["stop_app"]) of
+        {ok, _} -> ok;
+        Error -> Error
+    end.
+
+start_app(Config) ->
+    Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
+    case rabbit_ct_broker_helpers:rabbitmqctl(Config, Node, ["start_app"]) of
+        {ok, _} -> ok;
+        Error -> Error
+    end.
+
+maybe_enable_verify_initial_run(Config, verify_initial_run_enabled) ->
+    rabbit_ct_helpers:merge_app_env(
+        Config, {rabbit, [{verify_initial_run, true}]}
+    );
+maybe_enable_verify_initial_run(Config, _) ->
+    Config.
+
+meta_store(single_node_mnesia) ->
+    mnesia;
+meta_store(single_node_khepri) ->
+    khepri.
+
+schema_file(Config) ->
+    DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir),
+    MetaStore = rabbit_ct_helpers:get_config(Config, metadata_store),
+    case MetaStore of
+        mnesia ->
+            filename:join(DataDir, "schema.DAT");
+        khepri ->
+            NodeName = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
+            filename:join([DataDir, "coordination", NodeName, "names.dets"])
+    end.
+
+set_env(Config, Bool) ->
+    Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
+    ok = rpc:call(Node, application, set_env, [rabbit, verify_initial_run, Bool]).