Skip to content

Commit d4df3f7

Browse files
authored
TQ: Add support for alarms in the protocol (#8753)
This builds on #8741 An alarm represents a protocol invariant violation. It's unclear exactly what should be done about these other than recording them and allowing them to be reported upstack, which is what is done in this PR. An argument could be made for "freezing" the state machine such that trust quorum nodes stop working and the only thing they can do is report alarm status. However, that would block the trust quorum from operating at all, and it's unclear if this should cause an outage on that node. I'm also somewhat hesitant to put the alarms into the persistent state as that would prevent unlock in the case of a sled/rack reboot. On the flip side of just recording is the possible danger resulting from operating with an invariant violation. This could potentially be risky, and since we shouldn't ever see these maybe pausing for a support call is the right thing. TBD, once more work is done on the protocol.
1 parent 347bf8d commit d4df3f7

File tree

9 files changed

+171
-37
lines changed

9 files changed

+171
-37
lines changed

trust-quorum/gfss/src/shamir.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,17 @@ pub enum SplitError {
2323
TooFewTotalShares { n: u8, k: u8 },
2424
}
2525

26-
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
26+
#[derive(
27+
Debug,
28+
Clone,
29+
thiserror::Error,
30+
PartialEq,
31+
Eq,
32+
PartialOrd,
33+
Ord,
34+
Serialize,
35+
Deserialize,
36+
)]
2737
pub enum CombineError {
2838
#[error("must be at least 2 shares to combine")]
2939
TooFewShares,

trust-quorum/src/alarm.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Mechanism for reporting protocol invariant violations
6+
7+
use serde::{Deserialize, Serialize};
8+
9+
use crate::{Configuration, Epoch, PlatformId};
10+
11+
#[allow(clippy::large_enum_variant)]
12+
#[derive(
13+
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
14+
)]
15+
pub enum Alarm {
16+
/// Different configurations found for the same epoch
17+
///
18+
/// Reason: Nexus creates configurations and stores them in CRDB before
19+
/// sending them to a coordinator of its choosing. Nexus will not send the
20+
/// same reconfiguration request to different coordinators. If it does those
21+
/// coordinators will generate different key shares. However, since Nexus
22+
/// will not tell different nodes to coordinate the same configuration, this
23+
/// state should be impossible to reach.
24+
MismatchedConfigurations {
25+
config1: Configuration,
26+
config2: Configuration,
27+
from: PlatformId,
28+
},
29+
30+
/// The `KeyShareComputer` could not compute this node's share
31+
///
32+
/// Reason: A threshold of valid key shares were received based on the
33+
/// share digests in the Configuration. However, computation of the share
34+
/// still failed. This should be impossible.
35+
ShareComputationFailed { epoch: Epoch, err: gfss::shamir::CombineError },
36+
}

trust-quorum/src/compute_key_share.rs

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
//! other nodes so that it can compute its own key share.
1010
1111
use crate::crypto::Sha3_256Digest;
12-
use crate::{Configuration, Epoch, NodeHandlerCtx, PeerMsgKind, PlatformId};
12+
use crate::{
13+
Alarm, Configuration, Epoch, NodeHandlerCtx, PeerMsgKind, PlatformId,
14+
};
1315
use gfss::gf256::Gf256;
1416
use gfss::shamir::{self, Share};
1517
use slog::{Logger, error, o, warn};
@@ -101,6 +103,7 @@ impl KeyShareComputer {
101103
"epoch" => %epoch,
102104
"from" => %from
103105
);
106+
return false;
104107
}
105108

106109
// A valid share was received. Is it new?
@@ -116,12 +119,23 @@ impl KeyShareComputer {
116119
// What index are we in the configuration? This is our "x-coordinate"
117120
// for our key share calculation. We always start indexing from 1, since
118121
// 0 is the rack secret.
119-
let index = self
120-
.config
121-
.members
122-
.keys()
123-
.position(|id| id == ctx.platform_id())
124-
.expect("node exists");
122+
let index =
123+
self.config.members.keys().position(|id| id == ctx.platform_id());
124+
125+
let Some(index) = index else {
126+
let msg = concat!(
127+
"Failed to get index for ourselves in current configuration. ",
128+
"We are not a member, and must have been expunged."
129+
);
130+
error!(
131+
self.log,
132+
"{msg}";
133+
"platform_id" => %ctx.platform_id(),
134+
"config" => ?self.config
135+
);
136+
return false;
137+
};
138+
125139
let x_coordinate =
126140
Gf256::new(u8::try_from(index + 1).expect("index fits in u8"));
127141

@@ -137,11 +151,9 @@ impl KeyShareComputer {
137151
});
138152
true
139153
}
140-
Err(e) => {
141-
// TODO: put the node into into an `Alarm` state similar to
142-
// https://github.com/oxidecomputer/omicron/pull/8062 once we
143-
// have alarms?
144-
error!(self.log, "Failed to compute share: {}", e);
154+
Err(err) => {
155+
error!(self.log, "Failed to compute share: {}", err);
156+
ctx.raise_alarm(Alarm::ShareComputationFailed { epoch, err });
145157
false
146158
}
147159
}

trust-quorum/src/configuration.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ pub enum ConfigurationError {
3030
/// The configuration for a given epoch.
3131
///
3232
/// Only valid for non-lrtq configurations
33-
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33+
#[derive(
34+
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
35+
)]
3436
pub struct Configuration {
3537
/// Unique Id of the rack
3638
pub rack_id: RackUuid,

trust-quorum/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ mod persistent_state;
2323
mod validators;
2424
pub use configuration::Configuration;
2525
pub use coordinator_state::{CoordinatorOperation, CoordinatorState};
26+
mod alarm;
2627

28+
pub use alarm::Alarm;
2729
pub use crypto::RackSecret;
2830
pub use messages::*;
2931
pub use node::Node;

trust-quorum/src/node.rs

Lines changed: 54 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use crate::validators::{
2020
MismatchedRackIdError, ReconfigurationError, ValidatedReconfigureMsg,
2121
};
2222
use crate::{
23-
Configuration, CoordinatorState, Epoch, NodeHandlerCtx, PlatformId,
23+
Alarm, Configuration, CoordinatorState, Epoch, NodeHandlerCtx, PlatformId,
2424
messages::*,
2525
};
2626
use gfss::shamir::Share;
@@ -308,28 +308,33 @@ impl Node {
308308
"epoch" => %config.epoch
309309
);
310310
ctx.update_persistent_state(|ps| ps.commits.insert(config.epoch));
311+
return;
311312
}
312313

313314
// Do we have the configuration in our persistent state? If not save it.
314-
ctx.update_persistent_state(|ps| {
315-
if let Err(e) = ps.configs.insert_unique(config.clone()) {
316-
let existing =
317-
e.duplicates().first().expect("duplicate exists");
318-
if *existing != &config {
319-
error!(
320-
self.log,
321-
"Received a configuration mismatch";
322-
"from" => %from,
323-
"existing_config" => #?existing,
324-
"received_config" => #?config
325-
);
326-
// TODO: Alarm
327-
}
328-
false
329-
} else {
330-
true
315+
if let Some(existing) =
316+
ctx.persistent_state().configuration(config.epoch)
317+
{
318+
if existing != &config {
319+
error!(
320+
self.log,
321+
"Received a configuration mismatch";
322+
"from" => %from,
323+
"existing_config" => #?existing,
324+
"received_config" => #?config
325+
);
326+
ctx.raise_alarm(Alarm::MismatchedConfigurations {
327+
config1: (*existing).clone(),
328+
config2: config.clone(),
329+
from: from.clone(),
330+
});
331331
}
332-
});
332+
} else {
333+
ctx.update_persistent_state(|ps| {
334+
ps.configs.insert_unique(config.clone()).expect("new config");
335+
true
336+
});
337+
}
333338

334339
// Are we coordinating for an older epoch? If so, cancel.
335340
if let Some(cs) = &self.coordinator_state {
@@ -343,14 +348,14 @@ impl Node {
343348
"received_epoch" => %config.epoch
344349
);
345350
self.coordinator_state = None;
351+
// Intentionally fall through
346352
} else if coordinating_epoch == config.epoch {
347-
error!(
353+
info!(
348354
self.log,
349355
"Received CommitAdvance while coordinating for same epoch!";
350356
"from" => %from,
351357
"epoch" => %config.epoch
352358
);
353-
// TODO: Alarm
354359
return;
355360
} else {
356361
info!(
@@ -399,7 +404,8 @@ impl Node {
399404
}
400405
}
401406

402-
// We either were collectiong shares for an old epoch or haven't started yet.
407+
// We either were collecting shares for an old epoch or haven't started
408+
// yet.
403409
self.key_share_computer =
404410
Some(KeyShareComputer::new(&self.log, ctx, config));
405411
}
@@ -414,6 +420,18 @@ impl Node {
414420
ctx.persistent_state().latest_committed_configuration()
415421
{
416422
if latest_committed_config.epoch > epoch {
423+
if !latest_committed_config.members.contains_key(&from) {
424+
info!(
425+
self.log,
426+
"Received a GetShare message from expunged node";
427+
"from" => %from,
428+
"latest_committed_epoch" =>
429+
%latest_committed_config.epoch,
430+
"requested_epoch" => %epoch
431+
);
432+
// TODO: Send an expunged message
433+
return;
434+
}
417435
info!(
418436
self.log,
419437
concat!(
@@ -432,6 +450,20 @@ impl Node {
432450
}
433451
}
434452

453+
// Do we have the configuration? Is the requesting peer a member?
454+
if let Some(config) = ctx.persistent_state().configuration(epoch) {
455+
if !config.members.contains_key(&from) {
456+
info!(
457+
self.log,
458+
"Received a GetShare message from expunged node";
459+
"from" => %from,
460+
"epoch" => %epoch
461+
);
462+
// TODO: Send an expunged message
463+
return;
464+
}
465+
}
466+
435467
// If we have the share for the requested epoch, we always return it. We
436468
// know that it is at least as new as the last committed epoch. We might
437469
// not have learned about the configuration being committed yet, but

trust-quorum/src/node_ctx.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,17 @@
44

55
//! Parameter to Node API calls that allows interaction with the system at large
66
7-
use crate::{Envelope, PeerMsg, PeerMsgKind, PersistentState, PlatformId};
7+
use crate::{
8+
Alarm, Envelope, PeerMsg, PeerMsgKind, PersistentState, PlatformId,
9+
};
810
use std::collections::BTreeSet;
911

1012
/// An API shared by [`NodeCallerCtx`] and [`NodeHandlerCtx`]
1113
pub trait NodeCommonCtx {
1214
fn platform_id(&self) -> &PlatformId;
1315
fn persistent_state(&self) -> &PersistentState;
1416
fn connected(&self) -> &BTreeSet<PlatformId>;
17+
fn alarms(&self) -> &BTreeSet<Alarm>;
1518
}
1619

1720
/// An API for an [`NodeCtx`] usable from a [`crate::Node`]
@@ -54,6 +57,9 @@ pub trait NodeHandlerCtx: NodeCommonCtx {
5457

5558
/// Remove a peer from the connected set
5659
fn remove_connection(&mut self, id: &PlatformId);
60+
61+
/// Record (in-memory) that an alarm has occurred
62+
fn raise_alarm(&mut self, alarm: Alarm);
5763
}
5864

5965
/// Common parameter to [`crate::Node`] methods
@@ -79,6 +85,9 @@ pub struct NodeCtx {
7985

8086
/// Connected peer nodes
8187
connected: BTreeSet<PlatformId>,
88+
89+
/// Any alarms that have occurred
90+
alarms: BTreeSet<Alarm>,
8291
}
8392

8493
impl NodeCtx {
@@ -89,6 +98,7 @@ impl NodeCtx {
8998
persistent_state_changed: false,
9099
outgoing: Vec::new(),
91100
connected: BTreeSet::new(),
101+
alarms: BTreeSet::new(),
92102
}
93103
}
94104
}
@@ -105,6 +115,10 @@ impl NodeCommonCtx for NodeCtx {
105115
fn connected(&self) -> &BTreeSet<PlatformId> {
106116
&self.connected
107117
}
118+
119+
fn alarms(&self) -> &BTreeSet<Alarm> {
120+
&self.alarms
121+
}
108122
}
109123

110124
impl NodeHandlerCtx for NodeCtx {
@@ -138,6 +152,10 @@ impl NodeHandlerCtx for NodeCtx {
138152
fn remove_connection(&mut self, id: &PlatformId) {
139153
self.connected.remove(id);
140154
}
155+
156+
fn raise_alarm(&mut self, alarm: Alarm) {
157+
self.alarms.insert(alarm);
158+
}
141159
}
142160

143161
impl NodeCallerCtx for NodeCtx {

trust-quorum/tests/cluster.proptest-regressions

Lines changed: 7 additions & 0 deletions
Large diffs are not rendered by default.

trust-quorum/tests/cluster.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,7 @@ impl TestState {
834834
self.invariant_nodes_have_prepared_if_coordinator_has_acks()?;
835835
self.invariant_nodes_have_committed_if_nexus_has_acks()?;
836836
self.invariant_nodes_not_coordinating_and_computing_key_share_simultaneously()?;
837+
self.invariant_no_alarms()?;
837838
Ok(())
838839
}
839840

@@ -953,6 +954,20 @@ impl TestState {
953954

954955
Ok(())
955956
}
957+
958+
// Ensure there has been no alarm at any node
959+
fn invariant_no_alarms(&self) -> Result<(), TestCaseError> {
960+
for (id, (_, ctx)) in &self.sut.nodes {
961+
let alarms = ctx.alarms();
962+
prop_assert!(
963+
alarms.is_empty(),
964+
"Alarms found for {}: {:#?}",
965+
id,
966+
alarms
967+
);
968+
}
969+
Ok(())
970+
}
956971
}
957972

958973
/// Broken out of `TestState` to alleviate borrow checker woes

0 commit comments

Comments
 (0)