TQ: Add support for alarms in the protocol #8753

Open

wants to merge 3 commits into base: tq-reconfigure

Changes from all commits
12 changes: 11 additions & 1 deletion trust-quorum/gfss/src/shamir.rs
@@ -23,7 +23,17 @@ pub enum SplitError {
TooFewTotalShares { n: u8, k: u8 },
}

#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
#[derive(
Debug,
Clone,
thiserror::Error,
PartialEq,
Eq,
PartialOrd,
Ord,
Serialize,
Deserialize,
)]
pub enum CombineError {
#[error("must be at least 2 shares to combine")]
TooFewShares,
36 changes: 36 additions & 0 deletions trust-quorum/src/alarm.rs
@@ -0,0 +1,36 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Mechanism for reporting protocol invariant violations

use serde::{Deserialize, Serialize};

use crate::{Configuration, Epoch, PlatformId};

#[allow(clippy::large_enum_variant)]
#[derive(
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
)]
pub enum Alarm {
/// Different configurations found for the same epoch
///
/// Reason: Nexus creates configurations and stores them in CRDB before
/// sending them to a coordinator of its choosing. Nexus will not send the
/// same reconfiguration request to different coordinators. If it did, those
/// coordinators would generate different key shares. However, since Nexus
/// will not tell different nodes to coordinate the same configuration, this
/// state should be impossible to reach.
MismatchedConfigurations {
config1: Configuration,
config2: Configuration,
from: PlatformId,
},

/// The `KeyShareComputer` could not compute this node's share
///
/// Reason: A threshold of valid key shares was received based on the
/// share digests in the Configuration. However, computation of the share
/// still failed. This should be impossible.
ShareComputationFailed { epoch: Epoch, err: gfss::shamir::CombineError },
}
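
Note (editorial, not part of the diff): a minimal sketch of how a caller might observe alarms once they are raised, via the `alarms()` accessor added to `NodeCommonCtx` later in this change. The import path and trait visibility are assumptions.

```rust
// Hedged sketch: collect any recorded invariant violations after handling
// messages. Assumes `Alarm` and `NodeCommonCtx` are exported from the crate
// root, as `Alarm` is in this change.
use trust_quorum::{Alarm, NodeCommonCtx};

fn report_alarms<C: NodeCommonCtx>(ctx: &C) -> Vec<Alarm> {
    // `alarms()` returns the in-memory set of alarms; an empty set means no
    // protocol invariant violations have been observed so far.
    ctx.alarms().iter().cloned().collect()
}
```
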
36 changes: 24 additions & 12 deletions trust-quorum/src/compute_key_share.rs
@@ -9,7 +9,9 @@
//! other nodes so that it can compute its own key share.

use crate::crypto::Sha3_256Digest;
use crate::{Configuration, Epoch, NodeHandlerCtx, PeerMsgKind, PlatformId};
use crate::{
Alarm, Configuration, Epoch, NodeHandlerCtx, PeerMsgKind, PlatformId,
};
use gfss::gf256::Gf256;
use gfss::shamir::{self, Share};
use slog::{Logger, error, o, warn};
@@ -101,6 +103,7 @@ impl KeyShareComputer {
"epoch" => %epoch,
"from" => %from
);
return false;
}

// A valid share was received. Is it new?
@@ -116,12 +119,23 @@
// What index are we in the configuration? This is our "x-coordinate"
// for our key share calculation. We always start indexing from 1, since
// 0 is the rack secret.
let index = self
.config
.members
.keys()
.position(|id| id == ctx.platform_id())
.expect("node exists");
let index =
self.config.members.keys().position(|id| id == ctx.platform_id());

let Some(index) = index else {
let msg = concat!(
"Failed to get index for ourselves in current configuration. ",
"We are not a member, and must have been expunged."
);
error!(
self.log,
"{msg}";
"platform_id" => %ctx.platform_id(),
"config" => ?self.config
);
return false;
};

let x_coordinate =
Gf256::new(u8::try_from(index + 1).expect("index fits in u8"));

@@ -137,11 +151,9 @@
});
true
}
Err(e) => {
// TODO: put the node into into an `Alarm` state similar to
// https://github.com/oxidecomputer/omicron/pull/8062 once we
// have alarms?
error!(self.log, "Failed to compute share: {}", e);
Err(err) => {
error!(self.log, "Failed to compute share: {}", err);
ctx.raise_alarm(Alarm::ShareComputationFailed { epoch, err });
false
}
}
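
Note (editorial, not part of the diff): the comment above explains why member indices are shifted by one before becoming Shamir x-coordinates. A small sketch of that mapping, using only `Gf256::new` as it appears in this change; the helper name is hypothetical.

```rust
use gfss::gf256::Gf256;

// Hedged sketch: a node's 0-based position in the configuration's member map
// becomes the x-coordinate index + 1, because x = 0 is reserved for the rack
// secret itself.
fn x_coordinate_for(member_index: usize) -> Option<Gf256> {
    u8::try_from(member_index + 1).ok().map(Gf256::new)
}
```
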
4 changes: 3 additions & 1 deletion trust-quorum/src/configuration.rs
@@ -30,7 +30,9 @@ pub enum ConfigurationError {
/// The configuration for a given epoch.
///
/// Only valid for non-lrtq configurations
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
)]
pub struct Configuration {
/// Unique Id of the rack
pub rack_id: RackUuid,
2 changes: 2 additions & 0 deletions trust-quorum/src/lib.rs
@@ -23,7 +23,9 @@ mod persistent_state;
mod validators;
pub use configuration::Configuration;
pub use coordinator_state::{CoordinatorOperation, CoordinatorState};
mod alarm;

pub use alarm::Alarm;
pub use crypto::RackSecret;
pub use messages::*;
pub use node::Node;
76 changes: 54 additions & 22 deletions trust-quorum/src/node.rs
@@ -20,7 +20,7 @@ use crate::validators::{
MismatchedRackIdError, ReconfigurationError, ValidatedReconfigureMsg,
};
use crate::{
Configuration, CoordinatorState, Epoch, NodeHandlerCtx, PlatformId,
Alarm, Configuration, CoordinatorState, Epoch, NodeHandlerCtx, PlatformId,
messages::*,
};
use gfss::shamir::Share;
@@ -308,28 +308,33 @@ impl Node {
"epoch" => %config.epoch
);
ctx.update_persistent_state(|ps| ps.commits.insert(config.epoch));
return;
}

// Do we have the configuration in our persistent state? If not save it.
ctx.update_persistent_state(|ps| {
if let Err(e) = ps.configs.insert_unique(config.clone()) {
let existing =
e.duplicates().first().expect("duplicate exists");
if *existing != &config {
error!(
self.log,
"Received a configuration mismatch";
"from" => %from,
"existing_config" => #?existing,
"received_config" => #?config
);
// TODO: Alarm
}
false
} else {
true
if let Some(existing) =
ctx.persistent_state().configuration(config.epoch)
{
if existing != &config {
error!(
self.log,
"Received a configuration mismatch";
"from" => %from,
"existing_config" => #?existing,
"received_config" => #?config
);
ctx.raise_alarm(Alarm::MismatchedConfigurations {
config1: (*existing).clone(),
config2: config.clone(),
from: from.clone(),
});
}
});
} else {
ctx.update_persistent_state(|ps| {
ps.configs.insert_unique(config.clone()).expect("new config");
true
});
}

// Are we coordinating for an older epoch? If so, cancel.
if let Some(cs) = &self.coordinator_state {
@@ -343,14 +348,14 @@
"received_epoch" => %config.epoch
);
self.coordinator_state = None;
// Intentionally fall through
} else if coordinating_epoch == config.epoch {
error!(
info!(
self.log,
"Received CommitAdvance while coordinating for same epoch!";
"from" => %from,
"epoch" => %config.epoch
);
// TODO: Alarm
return;
} else {
info!(
@@ -399,7 +404,8 @@
}
}

// We either were collectiong shares for an old epoch or haven't started yet.
// We either were collecting shares for an old epoch or haven't started
// yet.
self.key_share_computer =
Some(KeyShareComputer::new(&self.log, ctx, config));
}
@@ -414,6 +420,18 @@
ctx.persistent_state().latest_committed_configuration()
{
if latest_committed_config.epoch > epoch {
if !latest_committed_config.members.contains_key(&from) {
info!(
self.log,
"Received a GetShare message from expunged node";
"from" => %from,
"latest_committed_epoch" =>
%latest_committed_config.epoch,
"requested_epoch" => %epoch
);
// TODO: Send an expunged message
return;
}
info!(
self.log,
concat!(
@@ -432,6 +450,20 @@
}
}

// Do we have the configuration? Is the requesting peer a member?
if let Some(config) = ctx.persistent_state().configuration(epoch) {
if !config.members.contains_key(&from) {
info!(
self.log,
"Received a GetShare message from expunged node";
"from" => %from,
"epoch" => %epoch
);
// TODO: Send an expunged message
return;
}
}

// If we have the share for the requested epoch, we always return it. We
// know that it is at least as new as the last committed epoch. We might
// not have learned about the configuration being committed yet, but
20 changes: 19 additions & 1 deletion trust-quorum/src/node_ctx.rs
@@ -4,14 +4,17 @@

//! Parameter to Node API calls that allows interaction with the system at large

use crate::{Envelope, PeerMsg, PeerMsgKind, PersistentState, PlatformId};
use crate::{
Alarm, Envelope, PeerMsg, PeerMsgKind, PersistentState, PlatformId,
};
use std::collections::BTreeSet;

/// An API shared by [`NodeCallerCtx`] and [`NodeHandlerCtx`]
pub trait NodeCommonCtx {
fn platform_id(&self) -> &PlatformId;
fn persistent_state(&self) -> &PersistentState;
fn connected(&self) -> &BTreeSet<PlatformId>;
fn alarms(&self) -> &BTreeSet<Alarm>;
}

/// An API for an [`NodeCtx`] usable from a [`crate::Node`]
@@ -54,6 +57,9 @@ pub trait NodeHandlerCtx: NodeCommonCtx {

/// Remove a peer from the connected set
fn remove_connection(&mut self, id: &PlatformId);

/// Record (in-memory) that an alarm has occurred
fn raise_alarm(&mut self, alarm: Alarm);
}

/// Common parameter to [`crate::Node`] methods
@@ -79,6 +85,9 @@ pub struct NodeCtx {

/// Connected peer nodes
connected: BTreeSet<PlatformId>,

/// Any alarms that have occurred
alarms: BTreeSet<Alarm>,
}

impl NodeCtx {
@@ -89,6 +98,7 @@
persistent_state_changed: false,
outgoing: Vec::new(),
connected: BTreeSet::new(),
alarms: BTreeSet::new(),
}
}
}
@@ -105,6 +115,10 @@
fn connected(&self) -> &BTreeSet<PlatformId> {
&self.connected
}

fn alarms(&self) -> &BTreeSet<Alarm> {
&self.alarms
}
}

impl NodeHandlerCtx for NodeCtx {
@@ -138,6 +152,10 @@ impl NodeHandlerCtx for NodeCtx {
fn remove_connection(&mut self, id: &PlatformId) {
self.connected.remove(id);
}

fn raise_alarm(&mut self, alarm: Alarm) {
self.alarms.insert(alarm);
}
}

impl NodeCallerCtx for NodeCtx {
7 changes: 7 additions & 0 deletions trust-quorum/tests/cluster.proptest-regressions

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions trust-quorum/tests/cluster.rs
@@ -834,6 +834,7 @@ impl TestState {
self.invariant_nodes_have_prepared_if_coordinator_has_acks()?;
self.invariant_nodes_have_committed_if_nexus_has_acks()?;
self.invariant_nodes_not_coordinating_and_computing_key_share_simultaneously()?;
self.invariant_no_alarms()?;
Ok(())
}

@@ -953,6 +954,20 @@

Ok(())
}

// Ensure there has been no alarm at any node
fn invariant_no_alarms(&self) -> Result<(), TestCaseError> {
for (id, (_, ctx)) in &self.sut.nodes {
let alarms = ctx.alarms();
prop_assert!(
alarms.is_empty(),
"Alarms found for {}: {:#?}",
id,
alarms
);
}
Ok(())
}
}

/// Broken out of `TestState` to alleviate borrow checker woes