Skip to content

Commit 8c5b6bd

Browse files
committed
Remove bogus alarm
It's not actually an error to receive a `CommitAdvance` while coordinating for the same epoch. The `GetShare` from the coordinator could have been delayed in the network` and the node that received it already committed before the coordinator new it was done preparing. In essence, the following would happen: 1. The coordinator would send GetShare requests for the prior epoch 2. Enough nodes would reply so that the coordinator would start sending prepares. 3. Enough nodes would ack prepares to commit 4. Nexus would poll and send commits. Other nodes would get those commits, but not the coordinator 5. A node that hadn't yet received the `GetShare` would get a `CommitAdvance` or see the `Commit` from nexus and get it's configuration and recompute it's own share and commit. It may have been a prior coordinator with delayed deliveries to other nodes of `GetShare` messages. 6. The node that just committed finally receives the `GetShare` and sends back a `CommitAdvance` to the coordinator This is all valid, and was similar to a proptest counterexample
1 parent 681e851 commit 8c5b6bd

File tree

3 files changed

+15
-20
lines changed

3 files changed

+15
-20
lines changed

trust-quorum/src/alarm.rs

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
77
use serde::{Deserialize, Serialize};
88

9-
use crate::{Configuration, Epoch};
9+
use crate::{Configuration, Epoch, PlatformId};
1010

1111
#[derive(
1212
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
@@ -20,18 +20,11 @@ pub enum Alarm {
2020
/// coordinators will generate different key shares. However, since Nexus
2121
/// will not tell different nodes to coordinate the same configuration, this
2222
/// state should be impossible to reach.
23-
MismatchedConfigurations { config1: Configuration, config2: Configuration },
24-
25-
/// We received a `CommitAdvance` while coordinating for the same epoch
26-
///
27-
/// Reason: `CommitAdvance` is a reply for a key share request in an
28-
/// old epoch that we don't have the latest committed coordination.
29-
/// However we are actually the coordinator for the configuration in the
30-
/// `CommitAdvance`. While it's possible that another node could learn
31-
/// of the commit from Nexus before the coordinator, the coordinator will
32-
/// never ask for a key share for that epoch. Therefore this state should be
33-
/// impossible to reach.
34-
CommitAdvanceForCoordinatingEpoch { config: Configuration },
23+
MismatchedConfigurations {
24+
config1: Configuration,
25+
config2: Configuration,
26+
from: PlatformId,
27+
},
3528

3629
/// The `keyShareComputer` could not compute this node's share
3730
///

trust-quorum/src/node.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ impl Node {
325325
ctx.raise_alarm(Alarm::MismatchedConfigurations {
326326
config1: (*existing).clone(),
327327
config2: config.clone(),
328+
from: from.clone(),
328329
});
329330
}
330331
} else {
@@ -353,9 +354,6 @@ impl Node {
353354
"from" => %from,
354355
"epoch" => %config.epoch
355356
);
356-
ctx.raise_alarm(Alarm::CommitAdvanceForCoordinatingEpoch {
357-
config,
358-
});
359357
return;
360358
} else {
361359
info!(

trust-quorum/tests/cluster.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use proptest::prelude::*;
1515
use proptest::sample::Selector;
1616
use slog::{Logger, info, o};
1717
use std::collections::{BTreeMap, BTreeSet};
18+
use std::panic;
1819
use test_strategy::{Arbitrary, proptest};
1920
use trust_quorum::{
2021
Configuration, CoordinatorOperation, Envelope, Epoch, Node, NodeCallerCtx,
@@ -961,7 +962,7 @@ impl TestState {
961962
let alarms = ctx.alarms();
962963
prop_assert!(
963964
alarms.is_empty(),
964-
"Alarms found for {}: {:#?}",
965+
"Alarms found for {}: {:?}",
965966
id,
966967
alarms
967968
);
@@ -1096,27 +1097,30 @@ pub struct TestInput {
10961097
fn test_trust_quorum_protocol(input: TestInput) {
10971098
let logctx = test_setup_log("test_trust_quorum_protocol");
10981099

1100+
// let result = panic::catch_unwind(|| {
10991101
let mut state = TestState::new(logctx.log.clone());
1100-
11011102
// Perform the initial setup
11021103
state.create_nexus_initial_config(input.initial_config);
11031104
state.setup_initial_connections(input.initial_down_nodes);
11041105
state.send_reconfigure_msg();
11051106

11061107
// Check the results of the initial setup
1107-
state.postcondition_initial_configuration()?;
1108+
state.postcondition_initial_configuration().unwrap();
11081109

11091110
// Put the coordinator's outgoing messages on the wire if there are any
11101111
state.send_envelopes_from_coordinator();
11111112

11121113
// Start executing the actions
1113-
state.run_actions(input.actions)?;
1114+
state.run_actions(input.actions).unwrap();
11141115

11151116
info!(
11161117
state.log,
11171118
"Test complete";
11181119
"skipped_actions" => state.skipped_actions
11191120
);
1121+
// });
1122+
1123+
//prop_assert!(result.is_ok());
11201124

11211125
logctx.cleanup_successful();
11221126
}

0 commit comments

Comments
 (0)