Skip to content

Commit 0e04a6f

Browse files
committed
main: Refuse to start node when RF-rack-invalid keyspace exists
When a node is started with the option `rf_rack_valid_keyspaces` enabled, the initialization will fail if there is an RF-rack-invalid keyspace. We want to force the user to adjust their existing keyspaces when upgrading to 2025.* so that the invariant that every keyspace is RF-rack-valid is always satisfied. Fixes scylladb#23300
1 parent 41f862d commit 0e04a6f

File tree

5 files changed

+112
-0
lines changed

5 files changed

+112
-0
lines changed

main.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#include <seastar/util/closeable.hh>
1515
#include <seastar/core/abort_source.hh>
16+
#include "exceptions/exceptions.hh"
1617
#include "gms/inet_address.hh"
1718
#include "auth/allow_all_authenticator.hh"
1819
#include "auth/allow_all_authorizer.hh"
@@ -2176,6 +2177,14 @@ sharded<locator::shared_token_metadata> token_metadata;
21762177
return ss.local().join_cluster(proxy, service::start_hint_manager::yes, generation_number);
21772178
}).get();
21782179

2180+
// At this point, `locator::topology` should be stable, i.e. we should have complete information
2181+
// about the layout of the cluster (= list of nodes along with the racks/DCs).
2182+
if (cfg->rf_rack_valid_keyspaces()) {
2183+
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
2184+
db.local().check_rf_rack_validity(token_metadata.local().get());
2185+
startlog.info("All keyspaces are RF-rack-valid");
2186+
}
2187+
21792188
dictionary_service dict_service(
21802189
dict_sampler,
21812190
sys_ks.local(),

replica/database.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
#include <fmt/ranges.h>
1212
#include <fmt/std.h>
13+
#include "locator/network_topology_strategy.hh"
14+
#include "locator/tablets.hh"
15+
#include "locator/token_metadata_fwd.hh"
1316
#include "utils/log.hh"
1417
#include "replica/database_fwd.hh"
1518
#include "utils/assert.hh"
@@ -3184,4 +3187,12 @@ database::on_effective_service_levels_cache_reloaded() {
31843187
co_return;
31853188
}
31863189

3190+
void database::check_rf_rack_validity(const locator::token_metadata_ptr tmptr) const {
3191+
SCYLLA_ASSERT(get_config().rf_rack_valid_keyspaces());
3192+
3193+
for (const auto& [name, info] : get_keyspaces()) {
3194+
locator::assert_rf_rack_valid_keyspace(name, tmptr, info.get_replication_strategy());
3195+
}
3196+
}
3197+
31873198
}

replica/database.hh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1951,6 +1951,14 @@ public:
19511951
/** This callback is going to be called just before the service level is changed **/
19521952
virtual future<> on_before_service_level_change(qos::service_level_options slo_before, qos::service_level_options slo_after, qos::service_level_info sl_info) override;
19531953
virtual future<> on_effective_service_levels_cache_reloaded() override;
1954+
1955+
// Verify that the existing keyspaces are all RF-rack-valid.
1956+
//
1957+
// Preconditions:
1958+
// * the option `rf_rack_valid_keyspaces` is enabled,
1959+
// * the `locator::topology` instance corresponding to the passed `locator::token_metadata_ptr`
1960+
// must contain a complete list of racks and data centers in the cluster.
1961+
void check_rf_rack_validity(const locator::token_metadata_ptr) const;
19541962
};
19551963

19561964
// A helper function to parse the directory name back

test/cluster/test_multidc.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,3 +384,81 @@ async def create_fail(rfs: Union[List[int], int], failed_dc: int, rf: int, rack_
384384
async with asyncio.TaskGroup() as tg:
385385
for task in [*valid_keyspaces, *invalid_keyspaces]:
386386
_ = tg.create_task(task)
387+
388+
async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: ManagerClient):
389+
"""
390+
This test verifies that starting a Scylla node fails when there's an RF-rack-invalid keyspace.
391+
We aim to simulate the behavior of a node when upgrading to 2025.*.
392+
393+
For more context, see: scylladb/scylladb#23300.
394+
"""
395+
396+
cfg_false = {"rf_rack_valid_keyspaces": "false"}
397+
398+
s1 = await manager.server_add(config=cfg_false, property_file={"dc": "dc1", "rack": "r1"})
399+
_ = await manager.server_add(config=cfg_false, property_file={"dc": "dc1", "rack": "r2"})
400+
_ = await manager.server_add(config=cfg_false, property_file={"dc": "dc1", "rack": "r3"})
401+
_ = await manager.server_add(config=cfg_false, property_file={"dc": "dc2", "rack": "r4"})
402+
# Note: This rack should behave as if it never existed.
403+
_ = await manager.server_add(config={"join_ring": "false", "rf_rack_valid_keyspaces": "false"}, property_file={"dc": "dc1", "rack": "rzerotoken"})
404+
405+
# Current situation:
406+
# DC1: {r1, r2, r3}, DC2: {r4}
407+
408+
cql = manager.get_cql()
409+
410+
async def create_keyspace(rfs: List[int], tablets: bool) -> str:
411+
dcs = ", ".join([f"'dc{i + 1}': {rf}" for i, rf in enumerate(rfs)])
412+
name = unique_name()
413+
tablets = str(tablets).lower()
414+
await cql.run_async(f"CREATE KEYSPACE {name} WITH REPLICATION = {{'class': 'NetworkTopologyStrategy', {dcs}}} "
415+
f"AND tablets = {{'enabled': {tablets}}}")
416+
logger.info(f"Created keyspace {name} with {rfs} and tablets={tablets}")
417+
return name
418+
419+
valid_keyspaces = [
420+
# For each DC: RF \in {0, 1, #racks}.
421+
([0, 0], True),
422+
([1, 0], True),
423+
([3, 0], True),
424+
([1, 1], True),
425+
([3, 1], True),
426+
# Reminder: Keyspaces not using tablets are all valid.
427+
([0, 0], False),
428+
([1, 0], False),
429+
([2, 0], False),
430+
([3, 0], False),
431+
([4, 0], False),
432+
([0, 1], False),
433+
([1, 1], False),
434+
([2, 1], False),
435+
([3, 1], False),
436+
([4, 1], False),
437+
([0, 2], False),
438+
([1, 2], False),
439+
([2, 2], False),
440+
([3, 2], False),
441+
([4, 2], False)
442+
]
443+
444+
# Populate RF-rack-valid keyspaces.
445+
async with asyncio.TaskGroup() as tg:
446+
for rfs, tablets in valid_keyspaces:
447+
_ = tg.create_task(create_keyspace(rfs, tablets))
448+
449+
await manager.server_stop_gracefully(s1.server_id)
450+
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
451+
452+
async def try_fail(rfs: List[int], dc: str, rf: int, rack_count: int):
453+
ks = await create_keyspace(rfs, True)
454+
err = r"The option `rf_rack_valid_keyspaces` is enabled. It requires that all keyspaces are RF-rack-valid. " \
455+
f"That condition is violated: keyspace '{ks}' doesn't satisfy it for DC '{dc}': RF={rf} vs. rack count={rack_count}."
456+
_ = await manager.server_start(s1.server_id, expected_error=err)
457+
await cql.run_async(f"DROP KEYSPACE {ks}")
458+
459+
# Test RF-rack-invalid keyspaces.
460+
await try_fail([2, 0], "dc1", 2, 3)
461+
await try_fail([3, 2], "dc2", 2, 1)
462+
await try_fail([4, 1], "dc1", 4, 3)
463+
464+
_ = await manager.server_start(s1.server_id)

test/lib/cql_test_env.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,6 +1050,12 @@ class single_node_cql_env : public cql_test_env {
10501050
throw;
10511051
}
10521052

1053+
if (cfg->rf_rack_valid_keyspaces()) {
1054+
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
1055+
_db.local().check_rf_rack_validity(_token_metadata.local().get());
1056+
startlog.info("All keyspaces are RF-rack-valid");
1057+
}
1058+
10531059
utils::loading_cache_config perm_cache_config;
10541060
perm_cache_config.max_size = cfg->permissions_cache_max_entries();
10551061
perm_cache_config.expiry = std::chrono::milliseconds(cfg->permissions_validity_in_ms());

0 commit comments

Comments (0)