Skip to content

Commit b81a5d9

Browse files
authored
test: preserve lease primitives across failover (#94)
1 parent 6f04a5a commit b81a5d9

File tree

2 files changed

+180
-15
lines changed

2 files changed

+180
-15
lines changed

crates/allocdb-node/src/replicated_simulation_tests.rs

Lines changed: 167 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ fn core_config() -> Config {
2929
}
3030
}
3131

32+
/// Returns the core `Config` used by the bundle tests: everything from
/// `core_config()` with `max_bundle_size` overridden to 4.
fn bundle_core_config() -> Config {
33+
Config {
34+
max_bundle_size: 4,
35+
..core_config()
36+
}
37+
}
38+
3239
fn engine_config() -> EngineConfig {
3340
EngineConfig {
3441
max_submission_queue: 4,
@@ -63,6 +70,18 @@ fn reserve_payload(resource_id: u128, holder_id: u128, operation_id: u128) -> Ve
6370
})
6471
}
6572

73+
/// Encodes a `Command::ReserveBundle` client request as bytes.
///
/// Each entry of `resource_ids` is wrapped in `ResourceId`, `holder_id` in
/// `HolderId`, and `operation_id` in `OperationId`. The client id is fixed at
/// `ClientId(7)` and `ttl_slots` is fixed at 4.
fn reserve_bundle_payload(resource_ids: &[u128], holder_id: u128, operation_id: u128) -> Vec<u8> {
74+
encode_client_request(ClientRequest {
75+
operation_id: OperationId(operation_id),
76+
client_id: ClientId(7),
77+
command: Command::ReserveBundle {
78+
resource_ids: resource_ids.iter().copied().map(ResourceId).collect(),
79+
holder_id: HolderId(holder_id),
80+
ttl_slots: 4,
81+
},
82+
})
83+
}
84+
6685
fn confirm_payload(
6786
reservation_id: u128,
6887
holder_id: u128,
@@ -80,6 +99,23 @@ fn confirm_payload(
8099
})
81100
}
82101

102+
/// Encodes a `Command::Release` client request as bytes.
///
/// `reservation_id` and `holder_id` are wrapped in their newtypes,
/// `lease_epoch` is passed through unchanged, and the client id is fixed at
/// `ClientId(7)`.
fn release_payload(
103+
reservation_id: u128,
104+
holder_id: u128,
105+
lease_epoch: u64,
106+
operation_id: u128,
107+
) -> Vec<u8> {
108+
encode_client_request(ClientRequest {
109+
operation_id: OperationId(operation_id),
110+
client_id: ClientId(7),
111+
command: Command::Release {
112+
reservation_id: ReservationId(reservation_id),
113+
holder_id: HolderId(holder_id),
114+
lease_epoch,
115+
},
116+
})
117+
}
118+
83119
fn revoke_payload(reservation_id: u128, operation_id: u128) -> Vec<u8> {
84120
encode_client_request(ClientRequest {
85121
operation_id: OperationId(operation_id),
@@ -237,6 +273,14 @@ fn primary_harness(name: &str, seed: u64) -> ReplicatedSimulationHarness {
237273
harness
238274
}
239275

276+
/// Builds a `ReplicatedSimulationHarness` from `bundle_core_config()` and
/// `engine_config()`, then configures `replica(1)` as primary.
///
/// Panics if harness construction or `configure_primary` returns an error.
// NOTE(review): the literal `1` passed to `configure_primary` is presumably the
// initial view number — confirm against the harness API.
fn bundle_primary_harness(name: &str, seed: u64) -> ReplicatedSimulationHarness {
277+
let mut harness =
278+
ReplicatedSimulationHarness::new(name, seed, bundle_core_config(), engine_config())
279+
.unwrap();
280+
harness.configure_primary(replica(1), 1).unwrap();
281+
harness
282+
}
283+
240284
fn replica_last_applied_lsn(harness: &ReplicatedSimulationHarness, replica_id: u64) -> Option<Lsn> {
241285
harness
242286
.replica(replica(replica_id))
@@ -298,6 +342,25 @@ fn pending_labels(harness: &ReplicatedSimulationHarness) -> Vec<&str> {
298342
.collect()
299343
}
300344

345+
/// Returns the member resource ids of `reservation_id` as seen by the given
/// replica's database.
///
/// The reservation is looked up with `db().reservation(..)` at
/// `Slot(current_slot)` and then passed to
/// `db().reservation_member_resource_ids(..)`.
///
/// Panics if the replica lookup, the engine access, or the reservation lookup
/// fails, since each step is unwrapped.
fn replica_reservation_member_ids(
346+
harness: &ReplicatedSimulationHarness,
347+
replica_id: u64,
348+
reservation_id: u128,
349+
current_slot: u64,
350+
) -> Vec<ResourceId> {
351+
let node = harness.replica(replica(replica_id)).unwrap().unwrap();
352+
let reservation = node
353+
.engine()
354+
.unwrap()
355+
.db()
356+
.reservation(ReservationId(reservation_id), Slot(current_slot))
357+
.unwrap();
358+
node.engine()
359+
.unwrap()
360+
.db()
361+
.reservation_member_resource_ids(reservation)
362+
}
363+
301364
fn set_replica_link(
302365
harness: &mut ReplicatedSimulationHarness,
303366
left: u64,
@@ -1357,6 +1420,91 @@ fn primary_crash_after_reply_preserves_read_and_retry_on_new_primary() {
13571420
}
13581421

13591422
/// Checks that a bundle reservation over resources 81 and 82 is still visible
/// on replica 2 after replica 1 crashes and a view change completes, that
/// replica 3 reports the same membership after a suffix-only rejoin, and that a
/// later single reserve of member 82 is rejected with `ResourceBusy`.
#[test]
1423+
fn committed_bundle_membership_survives_failover_and_suffix_rejoin() {
1424+
let mut harness = bundle_primary_harness("replicated-bundle-failover", 0x5a46_0001);
1425+
1426+
// Submit the two resources (81, 82) on replica 1 and commit each to all backups.
let create_first = harness
1427+
.client_submit(
1428+
replica(1),
1429+
Slot(1),
1430+
&create_payload(81, 801),
1431+
"bundle-create-first",
1432+
)
1433+
.unwrap();
1434+
commit_to_all_backups(&mut harness, "bundle-create-first", create_first.lsn);
1435+
1436+
let create_second = harness
1437+
.client_submit(
1438+
replica(1),
1439+
Slot(2),
1440+
&create_payload(82, 802),
1441+
"bundle-create-second",
1442+
)
1443+
.unwrap();
1444+
commit_to_all_backups(&mut harness, "bundle-create-second", create_second.lsn);
1445+
1446+
// Reserve both resources as one bundle for holder 91.
// NOTE(review): the trailing `2` in `commit_to_backup` presumably selects
// replica 2 only, leaving replica 3 behind so the later rejoin has a suffix to
// fetch — confirm against the helper.
let bundle = harness
1447+
.client_submit(
1448+
replica(1),
1449+
Slot(3),
1450+
&reserve_bundle_payload(&[81, 82], 91, 803),
1451+
"bundle-reserve",
1452+
)
1453+
.unwrap();
1454+
commit_to_backup(&mut harness, "bundle-reserve", bundle.lsn, 2);
1455+
1456+
// Fail over: crash replica 1, then complete the view change on replica 2.
harness.crash_replica(replica(1)).unwrap();
1457+
harness.complete_view_change(replica(2), 2).unwrap();
1458+
1459+
// Both members must read as Reserved under reservation id 3 on replica 2.
let first_member = harness
1460+
.read_resource(replica(2), ResourceId(81), Some(bundle.lsn))
1461+
.unwrap()
1462+
.expect("new primary should preserve first bundle member");
1463+
assert_eq!(
1464+
first_member.current_state,
1465+
allocdb_core::ResourceState::Reserved
1466+
);
1467+
assert_eq!(first_member.current_reservation_id, Some(ReservationId(3)));
1468+
1469+
let second_member = harness
1470+
.read_resource(replica(2), ResourceId(82), Some(bundle.lsn))
1471+
.unwrap()
1472+
.expect("new primary should preserve second bundle member");
1473+
assert_eq!(
1474+
second_member.current_state,
1475+
allocdb_core::ResourceState::Reserved
1476+
);
1477+
assert_eq!(second_member.current_reservation_id, Some(ReservationId(3)));
1478+
1479+
// Membership of reservation 3 on replica 2 must be exactly [81, 82].
assert_eq!(
1480+
replica_reservation_member_ids(&harness, 2, 3, 3),
1481+
vec![ResourceId(81), ResourceId(82)]
1482+
);
1483+
1484+
// Rejoin replica 3 from replica 2; it must use the suffix-only method and
// report the same membership afterwards.
let method = harness.rejoin_replica(replica(3), replica(2)).unwrap();
1485+
assert_eq!(method, ReplicaRejoinMethod::SuffixOnly);
1486+
assert_eq!(
1487+
replica_reservation_member_ids(&harness, 3, 3, 3),
1488+
vec![ResourceId(81), ResourceId(82)]
1489+
);
1490+
1491+
// A single reserve of member 82 by holder 92 must be rejected as busy.
let busy_member = harness
1492+
.client_submit(
1493+
replica(2),
1494+
Slot(4),
1495+
&reserve_payload(82, 92, 804),
1496+
"bundle-member-busy",
1497+
)
1498+
.unwrap();
1499+
commit_to_backup(&mut harness, "bundle-member-busy", busy_member.lsn, 3);
1500+
let busy_result = harness
1501+
.published_result(busy_member.lsn)
1502+
.expect("conflicting member reserve should publish deterministically");
1503+
assert_eq!(busy_result.outcome.result_code, ResultCode::ResourceBusy);
1504+
}
1505+
1506+
#[test]
1507+
#[allow(clippy::too_many_lines)]
13601508
fn committed_revoke_stays_non_reusable_across_failover_until_reclaim() {
13611509
let mut harness = primary_harness("replicated-revoke-failover", 0x5a46);
13621510

@@ -1421,11 +1569,28 @@ fn committed_revoke_stays_non_reusable_across_failover_until_reclaim() {
14211569
allocdb_core::ResourceState::Revoking
14221570
);
14231571

1572+
let stale_release = harness
1573+
.client_submit(
1574+
replica(2),
1575+
Slot(3),
1576+
&release_payload(2, 91, 1, 705),
1577+
"revoke-stale-release",
1578+
)
1579+
.unwrap();
1580+
commit_to_backup(&mut harness, "revoke-stale-release", stale_release.lsn, 3);
1581+
let stale_release_result = harness
1582+
.published_result(stale_release.lsn)
1583+
.expect("stale holder release should publish deterministically");
1584+
assert_eq!(
1585+
stale_release_result.outcome.result_code,
1586+
ResultCode::StaleEpoch
1587+
);
1588+
14241589
let early_reuse = harness
14251590
.client_submit(
14261591
replica(2),
14271592
Slot(3),
1428-
&reserve_payload(71, 99, 705),
1593+
&reserve_payload(71, 99, 706),
14291594
"revoke-early-reuse",
14301595
)
14311596
.unwrap();
@@ -1442,7 +1607,7 @@ fn committed_revoke_stays_non_reusable_across_failover_until_reclaim() {
14421607
.client_submit(
14431608
replica(2),
14441609
Slot(4),
1445-
&reclaim_payload(2, 706),
1610+
&reclaim_payload(2, 707),
14461611
"revoke-reclaim",
14471612
)
14481613
.unwrap();

docs/status.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,9 @@
195195
- local cluster, qemu assets, Jepsen harness, and benchmarks: `cargo test -p allocdb-node local_cluster -- --nocapture`, `cargo test -p allocdb-node qemu_testbed -- --nocapture`, `cargo test -p allocdb-node jepsen -- --nocapture`, `cargo test -p allocdb-node --bin allocdb-jepsen -- --nocapture`, `cargo run -p allocdb-node --bin allocdb-jepsen -- plan`, `cargo run -p allocdb-bench -- --scenario all`
196196
- repo gate: `scripts/preflight.sh`
197197
## Current Focus
198-
- PR `#82` merged the `#70` maintainability follow-up, and the closing evidence included the live
199-
KubeVirt `reservation_contention-control` and full `1800s`
200-
`reservation_contention-crash-restart` reruns on `allocdb-a` with `blockers=0`
198+
- PR `#82` merged the `#70` maintainability follow-up; its closing evidence included live KubeVirt
199+
`reservation_contention-control` and full `1800s` `reservation_contention-crash-restart`
200+
reruns on `allocdb-a` with `blockers=0`
201201
- `M9-T01` through `M9-T05` are merged on `main` via PR `#81`, and the planning issues are closed
202202
on the `AllocDB` project
203203
- PR `#89` merged `M9-T06` on `main`: the trusted core now supports atomic bundle reservation,
@@ -208,13 +208,13 @@
208208
surfaces expose the current authority token for active reservations
209209
- PR `#92` merged `M9-T08` on `main`: the trusted core now has explicit `revoke` / `reclaim`
210210
commands, `revoking` and `revoked` states, and deterministic duplicate/recovery handling
211-
- issue `#86` / `M9-T09` is the active implementation slice on the current branch: the node API
212-
and wire codec now expose the approved lease-centric surface with `get_lease`, flattened
213-
committed results, `current_lease_id`, and ordered `member_resource_ids`, while keeping the
214-
trusted-core naming and apply path intact
215-
- targeted validation on the active `#86` branch currently includes
216-
`cargo test -p allocdb-node api -- --nocapture`
217-
- the active `#86` branch keeps the `T09` / `T10` boundary explicit: it finishes the lease
218-
transport/read/recovery surface without adding new replication or view-change behavior
219-
- the next planned code-bearing slices after `#86` remain `M9-T10` replication preservation and
220-
`M9-T11` broader regression coverage
211+
- PR `#93` merged `M9-T09` on `main`: the node API and wire codec now expose the approved
212+
lease-centric surface with `get_lease`, flattened committed results, `current_lease_id`, and
213+
ordered `member_resource_ids`, while keeping the trusted-core naming and apply path intact
214+
- issue `#87` / `M9-T10` is the active implementation slice on the current branch: preserve
215+
bundle ownership, fencing outcomes, and revoke safety across replication, failover, and replica
216+
rejoin without introducing a second apply path
217+
- targeted validation on the active `#87` branch currently centers on `cargo test -p allocdb-node replicated_simulation -- --nocapture`,
218+
`cargo test -p allocdb-node replica -- --nocapture`, and `./scripts/preflight.sh`
219+
- the active `#87` branch keeps the `T10` / `T11` boundary explicit: it proves replicated-path preservation for the approved lease primitives, while broader scenario expansion stays in `M9-T11`
220+
- the next planned code-bearing slice after `#87` remains `M9-T11` broader regression coverage

0 commit comments

Comments
 (0)