From 8ad78763b65191f12a2a08f82e332041c84bd7c1 Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone"
Date: Fri, 8 Aug 2025 05:47:15 +0000
Subject: [PATCH 1/7] TQ: Introduce tqdb

This PR introduces a new test tool for the trust-quorum protocol: tqdb.
tqdb is a REPL that takes event traces produced by the `cluster` proptest
and uses them for deterministic replay of actions against test state. The
test state includes a "universe" of real protocol nodes, a fake nexus, and
fake networks. The state shared between the proptest and the debugger
lives in the new `trust-quorum-test-utils` crate. The debugger supports
stepping through individual events, setting breakpoints, snapshotting and
diffing states, and viewing the event log itself.

The purpose of tqdb is twofold:

1. Allow debugging of failed proptests. This is non-trivial in some cases,
even with shrunken tests, because the generated actions are high-level and
are all generated up front. The actual operations, such as
reconfigurations, are derived from these high-level random generations in
conjunction with the current state of the system. The set of failing
generated actions therefore doesn't tell you much on its own: you have to
look at the logs and the assertion that fired, and reason about the
failure with incomplete information. Now, for each concrete action taken,
we record an event in a log. When a test fails, the event log can be
loaded into tqdb with a breakpoint set just before the failure. A snapshot
of the state can be taken and the failing event applied; the diff shows
exactly what changed, and the actual state of the system remains available
for inspection. This gives full visibility into the failure.

2. The trust quorum protocol is non-trivial. tqdb allows developers to see
in detail how the protocol behaves and to understand what is happening in
specific situations. Event logs can be created by hand (or by script) for
particularly interesting scenarios and then run through tqdb.

In order to get the diff functionality to work as I wanted, I had to
implement `Eq` for types that implement `subtle::ConstantTimeEq` in both
the `gfss` (our secret sharing library) and `trust-quorum` crates.
However, it is unknown whether these impls give the compiler room to break
the constant-time guarantees. Therefore, a feature flag was added such
that only the `test-utils` and `tqdb` crates are able to use these
implementations; they are not used in the production codebase. Feature
unification is not at play here because neither `test-utils` nor `tqdb` is
part of the product.
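As a sketch, the feature-gated wrapper pattern (see the `Gf256` impls in
this patch) simply delegates the equality operators to the constant-time
comparison:

    #[cfg(feature = "danger_partial_eq_ct_wrapper")]
    impl PartialEq for Gf256 {
        fn eq(&self, other: &Self) -> bool {
            // Delegate to the constant-time comparison; this impl is only
            // compiled into test tooling, never into production builds.
            self.ct_eq(other).into()
        }
    }

    #[cfg(feature = "danger_partial_eq_ct_wrapper")]
    impl Eq for Gf256 {}

Crates that don't enable the feature still see only `ConstantTimeEq`, so
production code cannot accidentally use the variable-time operators.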
--- Cargo.lock | 42 ++ Cargo.toml | 6 + dev-tools/reconfigurator-cli/src/lib.rs | 2 +- dev-tools/repl-utils/src/lib.rs | 16 +- trust-quorum/Cargo.toml | 13 + trust-quorum/gfss/Cargo.toml | 11 + trust-quorum/gfss/src/gf256.rs | 11 +- trust-quorum/gfss/src/shamir.rs | 10 + trust-quorum/src/compute_key_share.rs | 12 + trust-quorum/src/configuration.rs | 11 +- trust-quorum/src/coordinator_state.rs | 36 +- trust-quorum/src/crypto.rs | 21 +- trust-quorum/src/lib.rs | 29 +- trust-quorum/src/messages.rs | 2 + trust-quorum/src/node.rs | 26 + trust-quorum/src/node_ctx.rs | 32 + trust-quorum/src/persistent_state.rs | 4 +- trust-quorum/src/validators.rs | 31 +- trust-quorum/test-utils/Cargo.toml | 18 + trust-quorum/test-utils/src/event.rs | 33 + trust-quorum/test-utils/src/event_log.rs | 40 + trust-quorum/test-utils/src/lib.rs | 23 + trust-quorum/test-utils/src/nexus.rs | 169 +++++ trust-quorum/test-utils/src/state.rs | 891 +++++++++++++++++++++++ trust-quorum/tests/cluster.rs | 814 +++++---------------- trust-quorum/tqdb/Cargo.toml | 30 + trust-quorum/tqdb/src/bin/tqdb/main.rs | 719 ++++++++++++++++++ 27 files changed, 2394 insertions(+), 658 deletions(-) create mode 100644 trust-quorum/test-utils/Cargo.toml create mode 100644 trust-quorum/test-utils/src/event.rs create mode 100644 trust-quorum/test-utils/src/event_log.rs create mode 100644 trust-quorum/test-utils/src/lib.rs create mode 100644 trust-quorum/test-utils/src/nexus.rs create mode 100644 trust-quorum/test-utils/src/state.rs create mode 100644 trust-quorum/tqdb/Cargo.toml create mode 100644 trust-quorum/tqdb/src/bin/tqdb/main.rs diff --git a/Cargo.lock b/Cargo.lock index a64e0070901..9c6883e8aa4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14001,6 +14001,27 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" +[[package]] +name = "tqdb" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "clap", + "colored", + "daft", + "iddqd", + "omicron-repl-utils", + "omicron-workspace-hack", + "reconfigurator-cli", + "reedline", + "serde_json", + "slog", + "tabled 0.15.0", + "trust-quorum", + "trust-quorum-test-utils", +] + [[package]] name = "tracing" version = "0.1.40" @@ -14165,6 +14186,7 @@ dependencies = [ name = "trust-quorum" version = "0.1.0" dependencies = [ + "anyhow", "assert_matches", "bcs", "bootstore", @@ -14172,6 +14194,7 @@ dependencies = [ "chacha20poly1305", "daft", "derive_more 0.99.20", + "dropshot", "gfss", "hex", "hkdf", @@ -14183,6 +14206,7 @@ dependencies = [ "rand 0.9.2", "secrecy 0.10.3", "serde", + "serde_json", "serde_with", "sha3", "slog", @@ -14192,10 +14216,28 @@ dependencies = [ "test-strategy", "thiserror 2.0.12", "tokio", + "trust-quorum-test-utils", "uuid", "zeroize", ] +[[package]] +name = "trust-quorum-test-utils" +version = "0.1.0" +dependencies = [ + "camino", + "daft", + "dropshot", + "gfss", + "iddqd", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "serde", + "serde_json", + "slog", + "trust-quorum", +] + [[package]] name = "try-lock" version = "0.2.5" diff --git a/Cargo.toml b/Cargo.toml index e526f23b118..0af37939fb2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -141,6 +141,8 @@ members = [ "test-utils", "trust-quorum", "trust-quorum/gfss", + "trust-quorum/test-utils", + "trust-quorum/tqdb", "typed-rng", "update-common", "update-engine", @@ -298,6 +300,8 @@ default-members = [ "sp-sim", "trust-quorum", "trust-quorum/gfss", + "trust-quorum/test-utils", + "trust-quorum/tqdb", 
"test-utils", "typed-rng", "update-common", @@ -460,6 +464,8 @@ gateway-test-utils = { path = "gateway-test-utils" } gateway-types = { path = "gateway-types" } gethostname = "0.5.0" gfss = { path = "trust-quorum/gfss" } +trust-quorum = { path = "trust-quorum" } +trust-quorum-test-utils = { path = "trust-quorum/test-utils" } glob = "0.3.2" guppy = "0.17.20" headers = "0.4.1" diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 43be0311aee..4156b979b59 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -14,7 +14,7 @@ use iddqd::IdOrdMap; use indent_write::fmt::IndentWriter; use internal_dns_types::diff::DnsDiff; use itertools::Itertools; -use log_capture::LogCapture; +pub use log_capture::LogCapture; use nexus_inventory::CollectionBuilder; use nexus_reconfigurator_blippy::Blippy; use nexus_reconfigurator_blippy::BlippyReportSortKey; diff --git a/dev-tools/repl-utils/src/lib.rs b/dev-tools/repl-utils/src/lib.rs index 3a4a0c5547e..f14f7a606e2 100644 --- a/dev-tools/repl-utils/src/lib.rs +++ b/dev-tools/repl-utils/src/lib.rs @@ -9,6 +9,7 @@ use anyhow::anyhow; use anyhow::bail; use camino::Utf8Path; use clap::Parser; +use reedline::Prompt; use reedline::Reedline; use reedline::Signal; use std::fs::File; @@ -110,13 +111,24 @@ pub fn run_repl_from_file( pub fn run_repl_on_stdin( run_one: &mut dyn FnMut(C) -> anyhow::Result>, ) -> anyhow::Result<()> { - let mut ed = Reedline::create(); + let ed = Reedline::create(); let prompt = reedline::DefaultPrompt::new( reedline::DefaultPromptSegment::Empty, reedline::DefaultPromptSegment::Empty, ); + run_repl_on_stdin_customized(ed, &prompt, run_one) +} + +/// Runs a REPL using stdin/stdout with a customized `Reedline` and `Prompt` +/// +/// See docs for [`run_repl_on_stdin`] +pub fn run_repl_on_stdin_customized( + mut ed: Reedline, + prompt: &dyn Prompt, + run_one: &mut dyn FnMut(C) -> anyhow::Result>, +) -> anyhow::Result<()> { loop { - match ed.read_line(&prompt) { + match ed.read_line(prompt) { Ok(Signal::Success(buffer)) => { // Strip everything after '#' as a comment. let entry = match buffer.split_once('#') { diff --git a/trust-quorum/Cargo.toml b/trust-quorum/Cargo.toml index eaf141ddf2d..0d6ac6863c0 100644 --- a/trust-quorum/Cargo.toml +++ b/trust-quorum/Cargo.toml @@ -8,6 +8,7 @@ license = "MPL-2.0" workspace = true [dependencies] +anyhow.workspace = true bcs.workspace = true bootstore.workspace = true camino.workspace = true @@ -36,6 +37,18 @@ omicron-workspace-hack.workspace = true [dev-dependencies] assert_matches.workspace = true +dropshot.workspace = true omicron-test-utils.workspace = true proptest.workspace = true +serde_json.workspace = true test-strategy.workspace = true +trust-quorum-test-utils.workspace = true + +[features] +# Impl `PartialEq` and `Eq` for types implementing `subtle::ConstantTimeEq` when +# this feature is enabled. +# +# This is of unknown risk. The rust compiler may obviate the security of using +# subtle when we do this. On the other hand its very useful for testing and +# debugging outside of production. 
+danger_partial_eq_ct_wrapper = ["gfss/danger_partial_eq_ct_wrapper"]
diff --git a/trust-quorum/gfss/Cargo.toml b/trust-quorum/gfss/Cargo.toml
index 5802654f80e..3b6ad9fdf1b 100644
--- a/trust-quorum/gfss/Cargo.toml
+++ b/trust-quorum/gfss/Cargo.toml
@@ -21,3 +21,14 @@ omicron-workspace-hack.workspace = true
 [dev-dependencies]
 proptest.workspace = true
 test-strategy.workspace = true
+
+[features]
+# Impl `PartialEq` and `Eq` for types implementing `subtle::ConstantTimeEq` when
+# this feature is enabled.
+#
+# This is of unknown risk. The Rust compiler may defeat the constant-time
+# guarantees of `subtle` when we do this. On the other hand, it's very useful
+# for testing and debugging outside of production.
+danger_partial_eq_ct_wrapper = []
diff --git a/trust-quorum/gfss/src/gf256.rs b/trust-quorum/gfss/src/gf256.rs
index 235cf37265c..78fc2bc1f28 100644
--- a/trust-quorum/gfss/src/gf256.rs
+++ b/trust-quorum/gfss/src/gf256.rs
@@ -32,7 +32,7 @@ use zeroize::Zeroize;

 /// An element in a finite field of prime power 2^8
 ///
-/// We explicitly don't enable the equality operators to prevent ourselves from
+/// We explicitly don't derive the equality operators to prevent ourselves from
 /// accidentally using those instead of the constant time ones.
 #[repr(transparent)]
 #[derive(Debug, Clone, Copy, Zeroize, Serialize, Deserialize)]
@@ -120,6 +120,15 @@ impl ConstantTimeEq for Gf256 {
     }
 }

+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl PartialEq for Gf256 {
+    fn eq(&self, other: &Self) -> bool {
+        self.ct_eq(&other).into()
+    }
+}
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl Eq for Gf256 {}
+
 impl Add for Gf256 {
     type Output = Self;

diff --git a/trust-quorum/gfss/src/shamir.rs b/trust-quorum/gfss/src/shamir.rs
index 2da11b83bad..49ea0a90a48 100644
--- a/trust-quorum/gfss/src/shamir.rs
+++ b/trust-quorum/gfss/src/shamir.rs
@@ -137,6 +137,16 @@ impl Share {
     }
 }

+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl PartialEq for Share {
+    fn eq(&self, other: &Self) -> bool {
+        self.x_coordinate == other.x_coordinate
+            && self.y_coordinates == other.y_coordinates
+    }
+}
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl Eq for Share {}
+
 impl std::fmt::Debug for Share {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("KeyShareGf256").finish()
diff --git a/trust-quorum/src/compute_key_share.rs b/trust-quorum/src/compute_key_share.rs
index 8cc780f752e..7754edac401 100644
--- a/trust-quorum/src/compute_key_share.rs
+++ b/trust-quorum/src/compute_key_share.rs
@@ -19,6 +19,7 @@ use std::collections::BTreeMap;

 /// In memory state that tracks retrieval of key shares in order to compute
 /// this node's key share for a given configuration.
+#[derive(Debug, Clone)]
 pub struct KeyShareComputer {
     log: Logger,
@@ -28,6 +29,17 @@ pub struct KeyShareComputer {
     collected_shares: BTreeMap<PlatformId, Share>,
 }

+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl PartialEq for KeyShareComputer {
+    fn eq(&self, other: &Self) -> bool {
+        self.config == other.config
+            && self.collected_shares == other.collected_shares
+    }
+}
+
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl Eq for KeyShareComputer {}
+
 impl KeyShareComputer {
     pub fn new(
         log: &Logger,
diff --git a/trust-quorum/src/configuration.rs b/trust-quorum/src/configuration.rs
index a6057c62ed1..8b116e6f4a8 100644
--- a/trust-quorum/src/configuration.rs
+++ b/trust-quorum/src/configuration.rs
@@ -7,6 +7,7 @@
 use crate::crypto::{EncryptedRackSecrets, RackSecret, Sha3_256Digest};
 use crate::validators::ValidatedReconfigureMsg;
 use crate::{Epoch, PlatformId, Threshold};
+use daft::Diffable;
 use gfss::shamir::{Share, SplitError};
 use iddqd::{IdOrdItem, id_upcast};
 use omicron_uuid_kinds::RackUuid;
@@ -31,7 +32,15 @@ pub enum ConfigurationError {
 ///
 /// Only valid for non-lrtq configurations
 #[derive(
-    Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
+    Debug,
+    Clone,
+    PartialEq,
+    Eq,
+    PartialOrd,
+    Ord,
+    Serialize,
+    Deserialize,
+    Diffable,
 )]
 pub struct Configuration {
     /// Unique Id of the rack
diff --git a/trust-quorum/src/coordinator_state.rs b/trust-quorum/src/coordinator_state.rs
index 78e8c8b1254..1440cdcc68b 100644
--- a/trust-quorum/src/coordinator_state.rs
+++ b/trust-quorum/src/coordinator_state.rs
@@ -4,12 +4,14 @@

 //! State of a reconfiguration coordinator inside a [`crate::Node`]

-use crate::NodeHandlerCtx;
+use crate::configuration::ConfigurationDiff;
 use crate::crypto::{
     LrtqShare, PlaintextRackSecrets, Sha3_256Digest, ShareDigestLrtq,
 };
 use crate::validators::{ReconfigurationError, ValidatedReconfigureMsg};
 use crate::{Configuration, Epoch, PeerMsgKind, PlatformId, RackSecret};
+use crate::{NodeHandlerCtx, ValidatedReconfigureMsgDiff};
+use daft::{Diffable, Leaf};
 use gfss::shamir::Share;
 use slog::{Logger, error, info, o, warn};
 use std::collections::{BTreeMap, BTreeSet};
@@ -27,7 +29,9 @@ use std::mem;
 /// allows progress to always be made with a full linearization of epochs.
 ///
 /// We allow some unused fields before we complete the coordination code
+#[derive(Clone, Debug, Diffable)]
 pub struct CoordinatorState {
+    #[daft(ignore)]
     log: Logger,

     /// A copy of the message used to start this reconfiguration
@@ -41,6 +45,34 @@ pub struct CoordinatorState {
     op: CoordinatorOperation,
 }

+// For diffs we want to allow access to all fields, but not make them public in
+// the `CoordinatorState` type itself.
+impl<'daft> CoordinatorStateDiff<'daft> {
+    pub fn reconfigure_msg(&self) -> &ValidatedReconfigureMsgDiff<'daft> {
+        &self.reconfigure_msg
+    }
+
+    pub fn configuration(&self) -> &ConfigurationDiff<'daft> {
+        &self.configuration
+    }
+
+    pub fn op(&self) -> Leaf<&CoordinatorOperation> {
+        self.op
+    }
+}
+
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl PartialEq for CoordinatorState {
+    fn eq(&self, other: &Self) -> bool {
+        self.reconfigure_msg == other.reconfigure_msg
+            && self.configuration == other.configuration
+            && self.op == other.op
+    }
+}
+
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl Eq for CoordinatorState {}
+
 impl CoordinatorState {
     /// Start coordinating a reconfiguration for a brand new trust quorum
     ///
@@ -467,6 +499,8 @@ impl CoordinatorState {
     }
 }

 /// What should the coordinator be doing?
+#[derive(Clone, Debug, Diffable)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
 pub enum CoordinatorOperation {
     CollectShares {
         old_epoch: Epoch,
diff --git a/trust-quorum/src/crypto.rs b/trust-quorum/src/crypto.rs
index 69d33c6cd66..cdb99677339 100644
--- a/trust-quorum/src/crypto.rs
+++ b/trust-quorum/src/crypto.rs
@@ -45,6 +45,7 @@ const CHACHA20POLY1305_NONCE_LEN: usize = 12;

 // The key share format used for LRTQ
 #[derive(Clone, Serialize, Deserialize, Zeroize, ZeroizeOnDrop, From)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
 pub struct LrtqShare(Vec<u8>);

 // We don't want to risk debug-logging the actual share contents, so implement
@@ -74,18 +75,20 @@ impl LrtqShare {
 pub struct ShareDigestLrtq(Sha3_256Digest);

 #[derive(
-    Default,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    PartialOrd,
-    Ord,
-    Serialize,
-    Deserialize,
+    Default, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
 )]
 pub struct Sha3_256Digest(pub [u8; 32]);

+impl std::fmt::Debug for Sha3_256Digest {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "sha3 digest: ")?;
+        for v in self.0.as_slice() {
+            // Zero-pad each byte so the digest renders unambiguously
+            write!(f, "{:02x}", v)?;
+        }
+        Ok(())
+    }
+}
+
 /// A boxed array containing rack secret data
 ///
 /// We explicitly choose to box the data so that it is not littered around
diff --git a/trust-quorum/src/lib.rs b/trust-quorum/src/lib.rs
index 8bb8d8de5d3..9d326983826 100644
--- a/trust-quorum/src/lib.rs
+++ b/trust-quorum/src/lib.rs
@@ -9,6 +9,7 @@
 //! All persistent state and all networking is managed outside of this
 //! implementation.

+use daft::Diffable;
 use derive_more::Display;
 use serde::{Deserialize, Serialize};

@@ -22,16 +23,19 @@ mod node_ctx;
 mod persistent_state;
 mod validators;
 pub use configuration::Configuration;
-pub use coordinator_state::{CoordinatorOperation, CoordinatorState};
+pub use coordinator_state::{
+    CoordinatorOperation, CoordinatorState, CoordinatorStateDiff,
+};
+pub use validators::ValidatedReconfigureMsgDiff;

 mod alarm;
 pub use alarm::Alarm;
 pub use crypto::RackSecret;
 pub use messages::*;
-pub use node::Node;
+pub use node::{Node, NodeDiff};
 // public only for docs.
 pub use node_ctx::NodeHandlerCtx;
-pub use node_ctx::{NodeCallerCtx, NodeCommonCtx, NodeCtx};
+pub use node_ctx::{NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff};
 pub use persistent_state::{PersistentState, PersistentStateSummary};

 #[derive(
@@ -46,7 +50,9 @@ pub use persistent_state::{PersistentState, PersistentStateSummary};
     Serialize,
     Deserialize,
     Display,
+    Diffable,
 )]
+#[daft(leaf)]
 pub struct Epoch(pub u64);

 impl Epoch {
@@ -69,7 +75,9 @@ impl Epoch {
     Serialize,
     Deserialize,
     Display,
+    Diffable,
 )]
+#[daft(leaf)]
 pub struct Threshold(pub u8);

 /// A unique identifier for a given trust quorum member.
@@ -80,8 +88,17 @@ pub struct Threshold(pub u8);
 ///
 /// See RFDs 303 and 308 for more details.
 #[derive(
-    Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
+    Debug,
+    Clone,
+    PartialEq,
+    Eq,
+    PartialOrd,
+    Ord,
+    Serialize,
+    Deserialize,
+    Diffable,
 )]
+#[daft(leaf)]
 pub struct PlatformId {
     part_number: String,
     serial_number: String,
@@ -108,7 +125,9 @@ impl PlatformId {
 }

 /// A container to make messages between trust quorum nodes routable
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, Diffable)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
+#[daft(leaf)]
 pub struct Envelope {
     pub to: PlatformId,
     pub from: PlatformId,
diff --git a/trust-quorum/src/messages.rs b/trust-quorum/src/messages.rs
index 052a8d04a40..3167cba5002 100644
--- a/trust-quorum/src/messages.rs
+++ b/trust-quorum/src/messages.rs
@@ -24,12 +24,14 @@ pub struct ReconfigureMsg {

 /// Messages sent between trust quorum members over a sprockets channel
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
 pub struct PeerMsg {
     pub rack_id: RackUuid,
     pub kind: PeerMsgKind,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
 pub enum PeerMsgKind {
     /// Sent from a coordinator node to inform a peer about a new configuration
     Prepare {
diff --git a/trust-quorum/src/node.rs b/trust-quorum/src/node.rs
index a6613f9062f..4f7240a4ba7 100644
--- a/trust-quorum/src/node.rs
+++ b/trust-quorum/src/node.rs
@@ -23,6 +23,7 @@ use crate::{
     Alarm, Configuration, CoordinatorState, Epoch, NodeHandlerCtx, PlatformId,
     messages::*,
 };
+use daft::{Diffable, Leaf};
 use gfss::shamir::Share;
 use omicron_uuid_kinds::RackUuid;
 use slog::{Logger, error, info, o, warn};
@@ -32,7 +33,9 @@ use slog::{Logger, error, info, o, warn};
 /// This is a `sans-io` implementation that is deterministic (except for
 /// `RackSecretGeneration`, which currently hardcodes use of an OsRng). This
 /// style is primarily for testing purposes.
+#[derive(Debug, Clone, Diffable)]
 pub struct Node {
+    #[daft(ignore)]
     log: Logger,

     /// In memory state for when this node is coordinating a reconfiguration
@@ -43,6 +46,29 @@ pub struct Node {
     key_share_computer: Option<KeyShareComputer>,
 }

+// For diffs we want to allow access to all fields, but not make them public in
+// the `Node` type itself.
+impl NodeDiff<'_> {
+    pub fn coordinator_state(&self) -> Leaf<Option<&CoordinatorState>> {
+        self.coordinator_state
+    }
+
+    pub fn key_share_computer(&self) -> Leaf<Option<&KeyShareComputer>> {
+        self.key_share_computer
+    }
+}
+
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl PartialEq for Node {
+    fn eq(&self, other: &Self) -> bool {
+        self.coordinator_state == other.coordinator_state
+            && self.key_share_computer == other.key_share_computer
+    }
+}
+
+#[cfg(feature = "danger_partial_eq_ct_wrapper")]
+impl Eq for Node {}
+
 impl Node {
     pub fn new(log: &Logger, ctx: &mut impl NodeHandlerCtx) -> Node {
         let id_str = format!("{:?}", ctx.platform_id());
diff --git a/trust-quorum/src/node_ctx.rs b/trust-quorum/src/node_ctx.rs
index e3a4f7fed32..e7d36da7bd7 100644
--- a/trust-quorum/src/node_ctx.rs
+++ b/trust-quorum/src/node_ctx.rs
@@ -6,7 +6,9 @@

 use crate::{
     Alarm, Envelope, PeerMsg, PeerMsgKind, PersistentState, PlatformId,
+    persistent_state::PersistentStateDiff,
 };
+use daft::{BTreeSetDiff, Diffable, Leaf};
 use std::collections::BTreeSet;

 /// An API shared by [`NodeCallerCtx`] and [`NodeHandlerCtx`]
@@ -67,6 +69,8 @@ pub trait NodeHandlerCtx: NodeCommonCtx {
 /// We separate access to this context via different APIs; namely [`NodeCallerCtx`]
 /// and [`NodeHandlerCtx`]. This statically prevents both the caller and
 /// [`crate::Node`] internals from performing improper mutations.
+#[derive(Debug, Clone, Diffable)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
 pub struct NodeCtx {
     /// The unique hardware ID of a sled
     platform_id: PlatformId,
@@ -90,6 +94,34 @@ pub struct NodeCtx {
     alarms: BTreeSet<Alarm>,
 }

+// For diffs we want to allow access to all fields, but not make them public in
+// the `NodeCtx` type itself.
+impl<'daft> NodeCtxDiff<'daft> {
+    pub fn platform_id(&self) -> Leaf<&PlatformId> {
+        self.platform_id
+    }
+
+    pub fn persistent_state(&self) -> &PersistentStateDiff<'daft> {
+        &self.persistent_state
+    }
+
+    pub fn persistent_state_changed(&self) -> Leaf<&bool> {
+        self.persistent_state_changed
+    }
+
+    pub fn outgoing(&self) -> Leaf<&[Envelope]> {
+        self.outgoing
+    }
+
+    pub fn connected(&self) -> &BTreeSetDiff<'daft, PlatformId> {
+        &self.connected
+    }
+
+    pub fn alarms(&self) -> &BTreeSetDiff<'daft, Alarm> {
+        &self.alarms
+    }
+}
+
 impl NodeCtx {
     pub fn new(platform_id: PlatformId) -> NodeCtx {
         NodeCtx {
diff --git a/trust-quorum/src/persistent_state.rs b/trust-quorum/src/persistent_state.rs
index ba6d1306272..d2a9a090396 100644
--- a/trust-quorum/src/persistent_state.rs
+++ b/trust-quorum/src/persistent_state.rs
@@ -9,6 +9,7 @@
 use crate::crypto::LrtqShare;
 use crate::{Configuration, Epoch, PlatformId};
 use bootstore::schemes::v0::SharePkgCommon as LrtqShareData;
+use daft::Diffable;
 use gfss::shamir::Share;
 use iddqd::IdOrdMap;
 use omicron_uuid_kinds::{GenericUuid, RackUuid};
@@ -16,7 +17,8 @@ use serde::{Deserialize, Serialize};
 use std::collections::{BTreeMap, BTreeSet};

 /// All the persistent state for this protocol
-#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+#[derive(Debug, Clone, Serialize, Deserialize, Default, Diffable)]
+#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))]
 pub struct PersistentState {
     // If this node was an LRTQ node, sled-agent will start it with the ledger
     // data it read from disk. This allows us to upgrade from LRTQ.
diff --git a/trust-quorum/src/validators.rs b/trust-quorum/src/validators.rs
index aaf045d3aa6..ffa361dc1f2 100644
--- a/trust-quorum/src/validators.rs
+++ b/trust-quorum/src/validators.rs
@@ -7,6 +7,7 @@
 use crate::configuration::ConfigurationError;
 use crate::messages::ReconfigureMsg;
 use crate::{Epoch, PersistentStateSummary, PlatformId, Threshold};
+use daft::{BTreeSetDiff, Diffable, Leaf};
 use omicron_uuid_kinds::RackUuid;
 use slog::{Logger, error, info, warn};
 use std::collections::BTreeSet;
@@ -124,7 +125,7 @@ pub enum ReconfigurationError {
 /// A `ReconfigureMsg` that has been determined to be valid for the remainder
 /// of code paths. We encode this check into a type in a "parse, don't validate"
 /// manner.
-#[derive(Debug)]
+#[derive(Debug, Clone, PartialEq, Eq, Diffable)]
 pub struct ValidatedReconfigureMsg {
     rack_id: RackUuid,
     epoch: Epoch,
@@ -137,6 +138,34 @@ pub struct ValidatedReconfigureMsg {
     coordinator_id: PlatformId,
 }

+// For diffs we want to allow access to all fields, but not make them public in
+// the `ValidatedReconfigureMsg` type itself.
+impl<'daft> ValidatedReconfigureMsgDiff<'daft> {
+    pub fn rack_id(&self) -> Leaf<&RackUuid> {
+        self.rack_id
+    }
+
+    pub fn epoch(&self) -> Leaf<&Epoch> {
+        self.epoch
+    }
+
+    pub fn last_committed_epoch(&self) -> Leaf<Option<&Epoch>> {
+        self.last_committed_epoch
+    }
+
+    pub fn members(&self) -> &BTreeSetDiff<'daft, PlatformId> {
+        &self.members
+    }
+
+    pub fn threshold(&self) -> Leaf<&Threshold> {
+        self.threshold
+    }
+
+    pub fn coordinator_id(&self) -> Leaf<&PlatformId> {
+        self.coordinator_id
+    }
+}
+
 impl PartialEq<ValidatedReconfigureMsg> for ReconfigureMsg {
     fn eq(&self, other: &ValidatedReconfigureMsg) -> bool {
         let ReconfigureMsg {
diff --git a/trust-quorum/test-utils/Cargo.toml b/trust-quorum/test-utils/Cargo.toml
new file mode 100644
index 00000000000..984d2bb8d7c
--- /dev/null
+++ b/trust-quorum/test-utils/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "trust-quorum-test-utils"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+camino.workspace = true
+daft.workspace = true
+dropshot.workspace = true
+gfss = { workspace = true, features = ["danger_partial_eq_ct_wrapper"] }
+iddqd.workspace = true
+omicron-uuid-kinds.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+slog.workspace = true
+trust-quorum = { workspace = true, features = ["danger_partial_eq_ct_wrapper"] }
+
+omicron-workspace-hack.workspace = true
diff --git a/trust-quorum/test-utils/src/event.rs b/trust-quorum/test-utils/src/event.rs
new file mode 100644
index 00000000000..6a573d95852
--- /dev/null
+++ b/trust-quorum/test-utils/src/event.rs
@@ -0,0 +1,33 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Events passed to our SUT/Nexus sim in both proptests and tqdb
+
+use crate::nexus::{NexusConfig, NexusReply};
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeSet;
+use trust_quorum::{Epoch, PlatformId};
+
+/// An event that can be fed into our system under test (SUT)
+///
+/// Proptest generated `Action`s get translated into events at test execution
+/// time and recorded for replay.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum Event {
+    InitialSetup {
+        member_universe_size: usize,
+        config: NexusConfig,
+        crashed_nodes: BTreeSet<PlatformId>,
+    },
+    AbortConfiguration(Epoch),
+    SendNexusReplyOnUnderlay(NexusReply),
+    /// Pull an envelope off the bootstrap network and call `Node::handle`
+    DeliverEnvelope {
+        destination: PlatformId,
+    },
+    /// Pull a `NexusReply` off the underlay network and update the `NexusState`
+    DeliverNexusReply,
+    CommitConfiguration(PlatformId),
+    Reconfigure(NexusConfig),
+}
diff --git a/trust-quorum/test-utils/src/event_log.rs b/trust-quorum/test-utils/src/event_log.rs
new file mode 100644
index 00000000000..4319513a1ea
--- /dev/null
+++ b/trust-quorum/test-utils/src/event_log.rs
@@ -0,0 +1,40 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! A mechanism for recording [`crate::Event`]s
+
+use super::Event;
+use camino::Utf8Path;
+use std::fs::File;
+use std::io::{Seek, Write};
+
+pub struct EventLog {
+    file: File,
+}
+
+impl EventLog {
+    pub fn new(path: &Utf8Path) -> EventLog {
+        let mut file = File::create(path).unwrap();
+        // We want to incrementally write an array of `Event`s.
+        // Start the array.
+        file.write_all(b"[\n").expect("opening brace written");
+        EventLog { file }
+    }
+
+    pub fn record(&mut self, event: &Event) {
+        serde_json::to_writer_pretty(&mut self.file, event)
+            .expect("writing event succeeded");
+        self.file.write_all(b",\n").expect("write succeeded");
+    }
+}
+
+impl Drop for EventLog {
+    fn drop(&mut self) {
+        // Back up over the trailing comma and newline
+        let _ = self.file.seek_relative(-2);
+        // Finish writing the array of events
+        let _ = self.file.write_all(b"\n]\n");
+        let _ = self.file.sync_data();
+    }
+}
diff --git a/trust-quorum/test-utils/src/lib.rs b/trust-quorum/test-utils/src/lib.rs
new file mode 100644
index 00000000000..7eccc64f5a9
--- /dev/null
+++ b/trust-quorum/test-utils/src/lib.rs
@@ -0,0 +1,23 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Helpers for use by our proptests and tqdb
+
+mod event;
+mod event_log;
+pub mod nexus;
+mod state;
+
+pub use event::Event;
+pub use event_log::EventLog;
+pub use state::TqState;
+
+use trust_quorum::PlatformId;
+
+/// All possible members used in a test
+pub fn member_universe(size: usize) -> Vec<PlatformId> {
+    (0..size)
+        .map(|serial| PlatformId::new("test".into(), serial.to_string()))
+        .collect()
+}
diff --git a/trust-quorum/test-utils/src/nexus.rs b/trust-quorum/test-utils/src/nexus.rs
new file mode 100644
index 00000000000..53715e81fa8
--- /dev/null
+++ b/trust-quorum/test-utils/src/nexus.rs
@@ -0,0 +1,169 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Nexus related types for trust-quorum testing
+
+use daft::Diffable;
+use iddqd::id_ord_map::RefMut;
+use iddqd::{IdOrdItem, IdOrdMap, id_upcast};
+use omicron_uuid_kinds::RackUuid;
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeSet;
+use trust_quorum::{Epoch, PlatformId, ReconfigureMsg, Threshold};
+
+// The operational state of nexus for a given configuration
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Diffable)]
+pub enum NexusOp {
+    Committed,
+    Aborted,
+    Preparing,
+}
+
+/// A single nexus configuration
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Diffable)]
+pub struct NexusConfig {
+    pub op: NexusOp,
+    pub epoch: Epoch,
+    pub last_committed_epoch: Option<Epoch>,
+    pub coordinator: PlatformId,
+    pub members: BTreeSet<PlatformId>,
+    // This is our `K` parameter
+    pub threshold: Threshold,
+
+    // This is our `Z` parameter.
+    //
+    // Nexus can commit when it has seen K+Z prepare acknowledgements
+    //
+    // Only nexus needs to know this value since it alone determines when a
+    // commit may occur.
+    pub commit_crash_tolerance: u8,
+
+    pub prepared_members: BTreeSet<PlatformId>,
+    pub committed_members: BTreeSet<PlatformId>,
+}
+
+impl NexusConfig {
+    pub fn new(
+        epoch: Epoch,
+        last_committed_epoch: Option<Epoch>,
+        coordinator: PlatformId,
+        members: BTreeSet<PlatformId>,
+        threshold: Threshold,
+    ) -> NexusConfig {
+        // We want a few extra nodes beyond `threshold` to ack before we commit.
+        // This is the number of nodes that can go offline while still allowing
+        // an unlock to occur.
+        let commit_crash_tolerance = match members.len() - threshold.0 as usize
+        {
+            0..=1 => 0,
+            2..=4 => 1,
+            5..=7 => 2,
+            _ => 3,
+        };
+        NexusConfig {
+            op: NexusOp::Preparing,
+            epoch,
+            last_committed_epoch,
+            coordinator,
+            members,
+            threshold,
+            commit_crash_tolerance,
+            prepared_members: BTreeSet::new(),
+            committed_members: BTreeSet::new(),
+        }
+    }
+
+    pub fn to_reconfigure_msg(&self, rack_id: RackUuid) -> ReconfigureMsg {
+        ReconfigureMsg {
+            rack_id,
+            epoch: self.epoch,
+            last_committed_epoch: self.last_committed_epoch,
+            members: self.members.clone(),
+            threshold: self.threshold,
+        }
+    }
+
+    // Are there enough prepared members to commit?
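+    //
+    // For example, with 5 members and threshold K = 3, `members.len() - K`
+    // is 2, which falls into the `2..=4` arm above, so Z = 1 and nexus
+    // will only commit once it has seen 3 + 1 = 4 prepare acks.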
+    pub fn can_commit(&self) -> bool {
+        self.prepared_members.len()
+            >= (self.threshold.0 + self.commit_crash_tolerance) as usize
+    }
+}
+
+impl IdOrdItem for NexusConfig {
+    type Key<'a> = Epoch;
+
+    fn key(&self) -> Self::Key<'_> {
+        self.epoch
+    }
+
+    id_upcast!();
+}
+
+/// A model of Nexus's view of the world during the test
+#[derive(Debug, Clone, Diffable)]
+pub struct NexusState {
+    // No reason to change the rack_id
+    pub rack_id: RackUuid,
+
+    pub configs: IdOrdMap<NexusConfig>,
+}
+
+impl NexusState {
+    pub fn new() -> NexusState {
+        NexusState { rack_id: RackUuid::new_v4(), configs: IdOrdMap::new() }
+    }
+
+    // Create a `ReconfigureMsg` for the latest nexus config
+    pub fn reconfigure_msg_for_latest_config(
+        &self,
+    ) -> (&PlatformId, ReconfigureMsg) {
+        let config = self.configs.iter().last().expect("at least one config");
+        (&config.coordinator, config.to_reconfigure_msg(self.rack_id))
+    }
+
+    /// Abort the latest reconfiguration attempt
+    pub fn abort_reconfiguration(&mut self) {
+        let config = self.configs.iter().last().expect("at least one config");
+        // Can only abort while preparing
+        assert_eq!(config.op, NexusOp::Preparing);
+    }
+
+    pub fn latest_config(&self) -> &NexusConfig {
+        self.configs.iter().last().expect("at least one config")
+    }
+
+    pub fn latest_config_mut(&mut self) -> RefMut<'_, NexusConfig> {
+        self.configs.iter_mut().last().expect("at least one config")
+    }
+
+    pub fn last_committed_config(&self) -> Option<&NexusConfig> {
+        // IdOrdMap doesn't allow reverse iteration.
+        // We therefore iterate through all configs to find the latest
+        // committed one. We could track this out of band but that leaves
+        // more room for error.
+        let mut found: Option<&NexusConfig> = None;
+        for c in &self.configs {
+            if c.op == NexusOp::Committed {
+                found = Some(c)
+            }
+        }
+        found
+    }
+}
+
+#[derive(
+    Debug,
+    Clone,
+    PartialOrd,
+    Ord,
+    PartialEq,
+    Eq,
+    Serialize,
+    Deserialize,
+    Diffable,
+)]
+pub enum NexusReply {
+    AckedPreparesFromCoordinator { epoch: Epoch, acks: BTreeSet<PlatformId> },
+    CommitAck { from: PlatformId, epoch: Epoch },
+}
diff --git a/trust-quorum/test-utils/src/state.rs b/trust-quorum/test-utils/src/state.rs
new file mode 100644
index 00000000000..9508e18c7e4
--- /dev/null
+++ b/trust-quorum/test-utils/src/state.rs
@@ -0,0 +1,891 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! The entire state of our test system
+
+use crate::nexus::{
+    NexusConfig, NexusOp, NexusReply, NexusState, NexusStateDiff,
+};
+use crate::{Event, member_universe};
+use daft::{BTreeMapDiff, BTreeSetDiff, Diffable, Leaf};
+use iddqd::IdOrdMap;
+use slog::Logger;
+use std::collections::{BTreeMap, BTreeSet};
+use std::fmt::Display;
+use trust_quorum::{
+    Configuration, CoordinatorOperation, CoordinatorStateDiff, Envelope, Epoch,
+    Node, NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff, NodeDiff,
+    PeerMsgKind, PlatformId, ValidatedReconfigureMsgDiff,
+};
+
+// The state of our entire system including the system under test and
+// test specific infrastructure.
+#[derive(Debug, Clone, Diffable)]
+pub struct TqState {
+    /// A logger for our test
+    #[daft(ignore)]
+    pub log: Logger,
+
+    /// Our system under test
+    pub sut: Sut,
+
+    /// All in flight messages between nodes
+    pub bootstrap_network: BTreeMap<PlatformId, Vec<Envelope>>,
+
+    /// All in flight responses to nexus. We don't model the requests, as
+    /// those are `Node` public method calls. But we don't want to
+    /// synchronously update nexus state as a result of those calls, because
+    /// that ruins any possible interleaving with other actions.
+    ///
+    /// This is a way to allow interleaving of nexus replies without changing
+    /// the Node API to accept a separate set of Nexus messages and return
+    /// messages. We may decide that we want to do that, but for now we'll
+    /// stick with a concrete `Node` method based API that is "triggered" by
+    /// nexus messages.
+    pub underlay_network: Vec<NexusReply>,
+
+    /// A model of Nexus's view of the world during the test
+    pub nexus: NexusState,
+
+    /// A cache of our member universe, so we only have to generate it once
+    pub member_universe: Vec<PlatformId>,
+
+    /// All possible system faults in our test
+    pub faults: Faults,
+
+    /// All configurations ever generated by a coordinator.
+    ///
+    /// If an epoch got skipped due to a crashed coordinator then there will
+    /// not be a configuration for that epoch.
+    pub all_coordinated_configs: IdOrdMap<Configuration>,
+
+    /// Expunged nodes cannot be added to a cluster. We never reuse nodes in
+    /// this test. We include nodes here that may not know yet that they have
+    /// been expunged in the `Sut`.
+    pub expunged: BTreeSet<PlatformId>,
+}
+
+impl TqState {
+    pub fn new(log: Logger) -> TqState {
+        // We'll fill this in when applying the initial_config
+        let sut = Sut::empty();
+        let member_universe = vec![];
+        TqState {
+            log,
+            sut,
+            bootstrap_network: BTreeMap::new(),
+            underlay_network: Vec::new(),
+            nexus: NexusState::new(),
+            member_universe,
+            faults: Faults::default(),
+            all_coordinated_configs: IdOrdMap::new(),
+            expunged: BTreeSet::new(),
+        }
+    }
+
+    /// Send the latest `ReconfigureMsg` from `Nexus` to the coordinator node
+    ///
+    /// If the node is not available, then abort the configuration at nexus
+    pub fn send_reconfigure_msg(&mut self) {
+        let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
+        let epoch_to_config = msg.epoch;
+        if self.faults.crashed_nodes.contains(coordinator) {
+            // We must abort the configuration. This mimics a timeout.
+            self.nexus.abort_reconfiguration();
+        } else {
+            let (node, ctx) = self
+                .sut
+                .nodes
+                .get_mut(coordinator)
+                .expect("coordinator exists");
+
+            node.coordinate_reconfiguration(ctx, msg)
+                .expect("valid configuration");
+
+            // Do we have a `Configuration` for this epoch yet?
+            //
+            // For most reconfigurations, shares for the last committed
+            // configuration must be retrieved before the configuration is
+            // generated and saved in the persistent state.
+            let latest_persisted_config =
+                ctx.persistent_state().latest_config().expect("config exists");
+            if latest_persisted_config.epoch == epoch_to_config {
+                // Save the configuration for later
+                self.all_coordinated_configs
+                    .insert_unique(latest_persisted_config.clone())
+                    .expect("unique");
+            }
+        }
+    }
+
+    /// Check postcondition assertions after initial configuration
+    pub fn postcondition_initial_configuration(&mut self) {
+        let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
+
+        // The coordinator should have received the `ReconfigureMsg` from Nexus
+        if !self.faults.crashed_nodes.contains(coordinator) {
+            let (node, ctx) = self
+                .sut
+                .nodes
+                .get_mut(coordinator)
+                .expect("coordinator exists");
+            let mut connected_members = 0;
+            // The coordinator should start preparing by sending a `PrepareMsg`
+            // to all connected nodes in the membership set.
+ for member in + msg.members.iter().filter(|&id| id != coordinator).cloned() + { + if self.faults.is_connected(coordinator.clone(), member.clone()) + { + connected_members += 1; + let msg_found = ctx.envelopes().any(|envelope| { + envelope.to == member + && envelope.from == *coordinator + && matches!( + envelope.msg.kind, + PeerMsgKind::Prepare { .. } + ) + }); + assert!(msg_found); + } + } + assert_eq!(connected_members, ctx.envelopes().count()); + + // The coordinator should be in the prepare phase + let cs = node.get_coordinator_state().expect("is coordinating"); + assert!(matches!(cs.op(), CoordinatorOperation::Prepare { .. })); + + // The persistent state should have changed + assert!(ctx.persistent_state_change_check_and_reset()); + assert!(ctx.persistent_state().has_prepared(msg.epoch)); + assert!(ctx.persistent_state().latest_committed_epoch().is_none()); + } + } + + /// Put any outgoing coordinator messages from the latest configuration on the wire + pub fn send_envelopes_from_coordinator(&mut self) { + let coordinator = { + let (coordinator, _) = + self.nexus.reconfigure_msg_for_latest_config(); + coordinator.clone() + }; + self.send_envelopes_from(&coordinator); + } + + pub fn send_envelopes_from(&mut self, id: &PlatformId) { + let (_, ctx) = self.sut.nodes.get_mut(id).expect("node exists"); + for envelope in ctx.drain_envelopes() { + let msgs = + self.bootstrap_network.entry(envelope.to.clone()).or_default(); + msgs.push(envelope); + } + } + + pub fn apply_event(&mut self, event: Event) { + match event { + Event::InitialSetup { + member_universe_size, + config, + crashed_nodes, + } => { + self.apply_event_initial_config( + member_universe_size, + config, + crashed_nodes, + ); + } + Event::AbortConfiguration(epoch) => { + self.apply_event_abort_configuration(epoch) + } + Event::SendNexusReplyOnUnderlay(reply) => { + self.apply_event_send_nexus_reply_on_underlay(reply) + } + Event::DeliverEnvelope { destination } => { + self.apply_event_deliver_envelope(destination); + } + Event::DeliverNexusReply => { + self.apply_event_deliver_nexus_reply(); + } + Event::CommitConfiguration(dest) => { + self.apply_event_commit(dest); + } + Event::Reconfigure(nexus_config) => { + self.apply_event_reconfigure(nexus_config) + } + } + } + + fn apply_event_initial_config( + &mut self, + member_universe_size: usize, + config: NexusConfig, + crashed_nodes: BTreeSet, + ) { + // Generate the member universe + self.member_universe = member_universe(member_universe_size); + // Create the SUT nodes + self.sut = Sut::new(&self.log, self.member_universe.clone()); + + self.faults.crashed_nodes = crashed_nodes; + + // Inform nexus about the initial configuration + self.nexus.configs.insert_unique(config).expect("new config"); + + // Establish bootstrap network connections between live nodes + for (from, (node, ctx)) in self + .sut + .nodes + .iter_mut() + .filter(|(id, _)| !self.faults.crashed_nodes.contains(id)) + { + for to in self.member_universe.iter().filter(|id| { + !self.faults.crashed_nodes.contains(id) && from != *id + }) { + node.on_connect(ctx, to.clone()); + } + } + + self.send_reconfigure_msg(); + + // Check the results of the initial setup + self.postcondition_initial_configuration(); + + // Put the coordinator's outgoing messages on the wire if there are any + self.send_envelopes_from_coordinator(); + } + + fn apply_event_commit(&mut self, id: PlatformId) { + let rack_id = self.nexus.rack_id; + let latest_config = self.nexus.latest_config(); + let (node, ctx) = + 
            self.sut.nodes.get_mut(&id).expect("destination exists");
+        node.commit_configuration(ctx, rack_id, latest_config.epoch)
+            .expect("commit succeeded");
+
+        self.underlay_network.push(NexusReply::CommitAck {
+            from: id,
+            epoch: latest_config.epoch,
+        });
+    }
+
+    fn apply_event_send_nexus_reply_on_underlay(&mut self, reply: NexusReply) {
+        self.underlay_network.push(reply);
+    }
+
+    fn apply_event_deliver_nexus_reply(&mut self) {
+        let mut latest_config = self.nexus.latest_config_mut();
+        let reply = self.underlay_network.pop().expect("reply exists");
+        match reply {
+            NexusReply::AckedPreparesFromCoordinator { epoch, acks } => {
+                if epoch == latest_config.epoch {
+                    latest_config.prepared_members.extend(acks);
+                }
+            }
+            NexusReply::CommitAck { from, epoch } => {
+                if latest_config.epoch == epoch {
+                    latest_config.committed_members.insert(from);
+                }
+            }
+        }
+    }
+
+    fn apply_event_abort_configuration(&mut self, epoch: Epoch) {
+        let mut latest_config = self.nexus.latest_config_mut();
+        assert_eq!(epoch, latest_config.epoch);
+        latest_config.op = NexusOp::Aborted;
+    }
+
+    fn apply_event_deliver_envelope(&mut self, destination: PlatformId) {
+        let envelope = self
+            .bootstrap_network
+            .get_mut(&destination)
+            .unwrap()
+            .pop()
+            .expect("envelope in bootstrap network");
+        let (node, ctx) =
+            self.sut.nodes.get_mut(&envelope.to).expect("destination exists");
+        node.handle(ctx, envelope.from, envelope.msg);
+
+        // If this is the first time we've seen a configuration, track it
+        //
+        // We have to do this here because for reconfigurations, shares
+        // for the last committed reconfiguration are gathered before
+        // the config is created. We don't know exactly when config
+        // generation occurs, but know that it happens after envelopes
+        // are delivered, except for configurations that don't have
+        // a last committed config. This is normally the initial
+        // configuration, but can be later ones if the initial config
+        // is aborted.
+        if ctx.persistent_state_change_check_and_reset() {
+            if let Some(latest_config) = ctx.persistent_state().latest_config()
+            {
+                if !self
+                    .all_coordinated_configs
+                    .contains_key(&latest_config.epoch)
+                {
+                    // The coordinator must be the first node to create
+                    // the configuration.
+                    assert_eq!(&latest_config.coordinator, ctx.platform_id());
+
+                    self.all_coordinated_configs
+                        .insert_unique(latest_config.clone())
+                        .expect("unique config");
+                }
+            }
+        }
+
+        // Send any messages as a result of handling this message
+        send_envelopes(ctx, &mut self.bootstrap_network);
+
+        // Remove any destinations with zero messages in-flight
+        self.bootstrap_network.retain(|_, msgs| !msgs.is_empty());
+    }
+
+    fn apply_event_reconfigure(&mut self, nexus_config: NexusConfig) {
+        self.nexus.configs.insert_unique(nexus_config).expect("new config");
+        self.send_reconfigure_msg();
+        self.send_envelopes_from_coordinator();
+    }
+}
+
+/// Broken out of `TqState` to alleviate borrow checker woes
+fn send_envelopes(
+    ctx: &mut NodeCtx,
+    bootstrap_network: &mut BTreeMap<PlatformId, Vec<Envelope>>,
+) {
+    for envelope in ctx.drain_envelopes() {
+        let envelopes =
+            bootstrap_network.entry(envelope.to.clone()).or_default();
+        envelopes.push(envelope);
+    }
+}
+
+/// The system under test
+///
+/// This is our real code.
+#[derive(Debug, Clone, Diffable)]
+pub struct Sut {
+    /// All nodes in the member universe
+    pub nodes: BTreeMap<PlatformId, (Node, NodeCtx)>,
+}
+
+impl Sut {
+    pub fn empty() -> Sut {
+        Sut { nodes: BTreeMap::new() }
+    }
+
+    pub fn new(log: &Logger, universe: Vec<PlatformId>) -> Sut {
+        let nodes = universe
+            .into_iter()
+            .map(|id| {
+                let mut ctx = NodeCtx::new(id.clone());
+                let node = Node::new(log, &mut ctx);
+                (id, (node, ctx))
+            })
+            .collect();
+        Sut { nodes }
+    }
+}
+
+/// Faults in our system. It's useful to keep these self contained and not
+/// in separate fields in `TestState` so that we can access them all at once
+/// independently of other `TestState` fields.
+#[derive(Default, Debug, Clone, Diffable)]
+pub struct Faults {
+    // We allow nodes to crash and restart and therefore track crashed nodes
+    // here.
+    //
+    // A crashed node is implicitly disconnected from every other node. We
+    // don't bother storing the pairs in `disconnected_nodes`, but instead
+    // check both fields when necessary.
+    pub crashed_nodes: BTreeSet<PlatformId>,
+
+    /// The set of disconnected nodes
+    pub disconnected_nodes: DisconnectedNodes,
+}
+
+impl Faults {
+    pub fn is_connected(&self, node1: PlatformId, node2: PlatformId) -> bool {
+        !self.crashed_nodes.contains(&node1)
+            && !self.crashed_nodes.contains(&node2)
+            && !self.disconnected_nodes.contains(node1, node2)
+    }
+}
+
+/// For cardinality purposes, we assume all nodes are connected and explicitly
+/// disconnect some of them. This allows us to track and compare much less
+/// data.
+#[derive(Default, Debug, Clone, Diffable)]
+pub struct DisconnectedNodes {
+    // We sort each pair on insert for quick lookups
+    pairs: BTreeSet<(PlatformId, PlatformId)>,
+}
+
+impl DisconnectedNodes {
+    // Return true if the pair is newly inserted
+    pub fn insert(&mut self, node1: PlatformId, node2: PlatformId) -> bool {
+        assert_ne!(node1, node2);
+
+        let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
+        self.pairs.insert(pair)
+    }
+
+    // Return true if the pair of nodes is disconnected, false otherwise
+    pub fn contains(&self, node1: PlatformId, node2: PlatformId) -> bool {
+        assert_ne!(node1, node2);
+        let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
+        self.pairs.contains(&pair)
+    }
+}
+
+/*****************************************************************************
+ *
+ * Diff related display code
+ *
+ *****************************************************************************/
+
+/// Diff Display functionality for `TqState`
+///
+/// All diff related code lives in `test-utils`, because we enable the
+/// trust-quorum feature `danger_partial_eq_ct_wrapper` in this crate. We
+/// don't enable it for all uses of the `trust_quorum` crate, especially in
+/// production.
+///
+/// Since we only use it for human readable output in test tools, at least
+/// for now, we put it behind a feature flag and implement all display
+/// functionality here.
+impl Display for TqStateDiff<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // The set of SUT nodes never changes
+        for (&id, &leaf) in self.sut.nodes.common.iter() {
+            if leaf.is_modified() {
+                writeln!(f, "Node changed: {id}")?;
+                let (node_diff, ctx_diff) = leaf.diff_pair();
+                display_node_diff(node_diff, f)?;
+                display_node_ctx_diff(ctx_diff, f)?;
+
+                // Add a blank line between modified nodes
+                writeln!(f)?;
+            }
+        }
+
+        display_bootstrap_network_diff(&self.bootstrap_network, f)?;
+        display_underlay_network_diff(&self.underlay_network, f)?;
+        display_nexus_state_diff(&self.nexus, f)?;
+        display_faults_diff(&self.faults, f)?;
+        display_expunged_diff(&self.expunged, f)?;
+
+        Ok(())
+    }
+}
+
+fn display_expunged_diff(
+    diff: &BTreeSetDiff<'_, PlatformId>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    if !diff.added.is_empty() {
+        writeln!(f, "expunged nodes:")?;
+        for id in &diff.added {
+            writeln!(f, "    {id}")?;
+        }
+    }
+    Ok(())
+}
+
+fn display_faults_diff(
+    diff: &FaultsDiff<'_>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    if !diff.crashed_nodes.added.is_empty() {
+        writeln!(f, "    nodes crashed:")?;
+        for id in &diff.crashed_nodes.added {
+            writeln!(f, "        {id}")?;
+        }
+    }
+    if !diff.crashed_nodes.removed.is_empty() {
+        writeln!(f, "    nodes started:")?;
+        for id in &diff.crashed_nodes.removed {
+            writeln!(f, "        {id}")?;
+        }
+    }
+
+    if !diff.disconnected_nodes.pairs.added.is_empty() {
+        writeln!(f, "    nodes disconnected from each other:")?;
+        for pair in &diff.disconnected_nodes.pairs.added {
+            writeln!(f, "        {}, {}", pair.0, pair.1)?;
+        }
+    }
+    if !diff.disconnected_nodes.pairs.removed.is_empty() {
+        writeln!(f, "    nodes connected to each other:")?;
+        for pair in &diff.disconnected_nodes.pairs.removed {
+            writeln!(f, "        {}, {}", pair.0, pair.1)?;
+        }
+    }
+    Ok(())
+}
+
+fn display_nexus_state_diff(
+    diff: &NexusStateDiff<'_>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    if diff.configs.modified().count() != 0 {
+        writeln!(f, "    nexus state changed:")?;
+    }
+
+    // Nexus configs can only be added or modified
+    for c in &diff.configs.added {
+        writeln!(f, "    config added at epoch {}, op: {:?}", c.epoch, c.op)?;
+    }
+    for c in diff.configs.modified_diff() {
+        writeln!(f, "    config modified at epoch {}", c.epoch.before)?;
+        if c.op.is_modified() {
+            let op = c.op.diff_pair();
+            writeln!(f, "        op changed: {:?} -> {:?}", op.before, op.after)?;
+        }
+        for id in c.prepared_members.added {
+            writeln!(f, "        new prepare ack received: {id}")?;
+        }
+        for id in c.committed_members.added {
+            writeln!(f, "        new commit ack received: {id}")?;
+        }
+    }
+
+    Ok(())
+}
+
+fn display_underlay_network_diff(
+    diff: &Leaf<&[NexusReply]>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    if diff.is_unchanged() {
+        return Ok(());
+    }
+
+    let before: BTreeSet<_> = diff.before.iter().collect();
+    let after: BTreeSet<_> = diff.after.iter().collect();
+
+    let added = after.difference(&before).count();
+    let removed = before.difference(&after).count();
+
+    writeln!(f, "    {} new nexus replies in flight on underlay network", added)?;
+    writeln!(
+        f,
+        "    {} nexus replies delivered to nexus from underlay network",
+        removed
+    )?;
+
+    Ok(())
+}
+
+fn display_bootstrap_network_diff(
+    diff: &BTreeMapDiff<'_, PlatformId, Vec<Envelope>>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    if !diff.added.is_empty() {
+        writeln!(f, "    messages newly in flight on bootstrap network:")?;
+        for (id, _) in &diff.added {
+            writeln!(f, "        destination: {id}")?;
+        }
+    }
+
+    if !diff.removed.is_empty() {
+        writeln!(f, "    all messages delivered from bootstrap network:")?;
+        for (id, _) in &diff.removed {
+            writeln!(f, "        destination: {id}")?;
+        }
+    }
+
+    if diff.unchanged_keys().count() != 0 {
+        writeln!(f, "    messages remain in flight from bootstrap network:")?;
+        for id in diff.unchanged_keys() {
+            writeln!(f, "        destination: {id}")?;
+        }
+    }
+    Ok(())
+}
+
+// Walk a `NodeCtxDiff` and format it for display
+fn display_node_ctx_diff(
+    diff: NodeCtxDiff<'_>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    if !diff.persistent_state().configs.added.is_empty() {
+        writeln!(f, "    config added to persistent state: ")?;
+        for c in &diff.persistent_state().configs.added {
+            writeln!(f, "        epoch: {}", c.epoch)?;
+        }
+    }
+    if !diff.persistent_state().configs.removed.is_empty() {
+        writeln!(f, "    config removed from persistent state: ")?;
+        for c in &diff.persistent_state().configs.removed {
+            writeln!(f, "        epoch: {}", c.epoch)?;
+        }
+    }
+
+    if !diff.persistent_state().shares.added.is_empty() {
+        writeln!(f, "    our share added to persistent state: ")?;
+        for (e, _) in &diff.persistent_state().shares.added {
+            writeln!(f, "        epoch: {}", e)?;
+        }
+    }
+    if !diff.persistent_state().shares.removed.is_empty() {
+        writeln!(f, "    our share removed from persistent state: ")?;
+        for (e, _) in &diff.persistent_state().shares.removed {
+            writeln!(f, "        epoch: {}", e)?;
+        }
+    }
+
+    if !diff.persistent_state().commits.added.is_empty() {
+        writeln!(f, "    commit added to persistent state: ")?;
+        for e in &diff.persistent_state().commits.added {
+            writeln!(f, "        epoch: {}", e)?;
+        }
+    }
+    if !diff.persistent_state().commits.removed.is_empty() {
+        writeln!(f, "    commit removed from persistent state: ")?;
+        for e in &diff.persistent_state().commits.removed {
+            writeln!(f, "        epoch: {}", e)?;
+        }
+    }
+
+    if diff.outgoing().is_modified() {
+        writeln!(f, "    messages sent to or delivered from bootstrap network")?;
+    }
+
+    if !diff.connected().added.is_empty() {
+        writeln!(f, "    nodes connected:")?;
+        for id in &diff.connected().added {
+            writeln!(f, "        {id}")?;
+        }
+    }
+
+    if !diff.connected().removed.is_empty() {
+        writeln!(f, "    nodes disconnected:")?;
+        for id in &diff.connected().removed {
+            writeln!(f, "        {id}")?;
+        }
+    }
+
+    if !diff.alarms().added.is_empty() {
+        writeln!(f, "    alarms triggered:")?;
+        for alarm in &diff.alarms().added {
+            writeln!(f, "        {alarm:?}")?;
+        }
+    }
+
+    if !diff.alarms().removed.is_empty() {
+        writeln!(f, "    alarms cleared:")?;
+        for alarm in &diff.alarms().removed {
+            writeln!(f, "        {alarm:?}")?;
+        }
+    }
+
+    Ok(())
+}
+
+// Walk a `NodeDiff` and format it for display
+fn display_node_diff(
+    node_diff: NodeDiff<'_>,
+    f: &mut std::fmt::Formatter<'_>,
+) -> std::fmt::Result {
+    // Show changes in `Node::coordinator_state`
+    if node_diff.coordinator_state().is_modified() {
+        writeln!(f, "    coordinator state changed: ")?;
+        if node_diff.coordinator_state().before.is_none() {
+            writeln!(
+                f,
+                "        started coordinating at epoch {}",
+                node_diff
+                    .coordinator_state()
+                    .after
+                    .unwrap()
+                    .reconfigure_msg()
+                    .epoch()
+            )?;
+        } else if node_diff.coordinator_state().after.is_none() {
+            writeln!(
+                f,
+                "        stopped coordinating at epoch {}",
+                node_diff
+                    .coordinator_state()
+                    .before
+                    .unwrap()
+                    .reconfigure_msg()
+                    .epoch()
+            )?;
+        } else {
+            let before = node_diff.coordinator_state().before.unwrap();
+            let after = node_diff.coordinator_state().after.unwrap();
+
+            // They are both `Some`, so figure out what changed
+            // by recursing
+            let diff =
before.diff(&after); + display_coordinator_state_diff(diff, f)?; + } + } + + // Show changes in `Node::key_share_computer` + if node_diff.key_share_computer().is_modified() { + writeln!(f, " key share computer changed: ")?; + if node_diff.key_share_computer().before.is_none() { + writeln!( + f, + " started computing key share at epoch {}", + node_diff.key_share_computer().after.unwrap().config().epoch + )?; + } else if node_diff.key_share_computer().after.is_none() { + writeln!( + f, + " stopped computing key share at epoch {}", + node_diff.key_share_computer().before.unwrap().config().epoch + )?; + } else { + writeln!( + f, + " computing key share at epochs: {} -> {}", + node_diff.key_share_computer().before.unwrap().config().epoch, + node_diff.key_share_computer().after.unwrap().config().epoch + )?; + } + } + + Ok(()) +} + +pub fn display_coordinator_state_diff( + diff: CoordinatorStateDiff<'_>, + f: &mut std::fmt::Formatter<'_>, +) -> std::fmt::Result { + display_validated_reconfigure_msg_diff(diff.reconfigure_msg(), f)?; + + // Configuration contains roughly the same information as a + // `ValidatedReconfigureMsg`. Let's report the only relevant change. + if diff.configuration().encrypted_rack_secrets.is_modified() { + writeln!(f, " encrypted rack secrets changed")?; + } + + display_coordinator_operation_diff(diff.op().diff_pair(), f)?; + + Ok(()) +} + +pub fn display_validated_reconfigure_msg_diff( + diff: &ValidatedReconfigureMsgDiff<'_>, + f: &mut std::fmt::Formatter<'_>, +) -> std::fmt::Result { + // diff.rack_id changes when tqdb `rewind` command is used, which makes it + // confusing. It never changes inside tests, so no need to diff it. + if diff.epoch().is_modified() { + writeln!( + f, + " epoch: {} -> {}", + diff.epoch().before, + diff.epoch().after + )?; + } + if diff.last_committed_epoch().is_modified() { + writeln!( + f, + " last committed epoch: {:?} -> {:?}", + diff.last_committed_epoch().before, + diff.last_committed_epoch().after + )?; + } + if !diff.members().added.is_empty() { + writeln!(f, " added members:")?; + for member in &diff.members().added { + writeln!(f, " {}", member)?; + } + } + if !diff.members().removed.is_empty() { + writeln!(f, " removed members:")?; + for member in &diff.members().removed { + writeln!(f, " {}", member)?; + } + } + if diff.threshold().is_modified() { + writeln!( + f, + " threshold: {} -> {}", + diff.threshold().before, + diff.threshold().after + )?; + } + // Always write out the coordinator id. It's useful for digging. + writeln!( + f, + " coordinator: {} -> {}", + diff.coordinator_id().before, + diff.coordinator_id().after, + )?; + + Ok(()) +} + +pub fn display_coordinator_operation_diff( + diff: Leaf<&CoordinatorOperation>, + f: &mut std::fmt::Formatter<'_>, +) -> std::fmt::Result { + if diff.is_unchanged() { + return Ok(()); + } + + // If the same variant changed contents, compare them. Otherwise report the + // change in variants. + match (diff.before, diff.after) { + ( + CoordinatorOperation::CollectShares { + old_epoch, + old_collected_shares, + .. + }, + CoordinatorOperation::CollectShares { + old_epoch: after_old_epoch, + old_collected_shares: after_old_collected_shares, + .. 
+            },
+        ) => {
+            // If the collection epoch changed, then only report that
+            if old_epoch != after_old_epoch {
+                writeln!(
+                    f,
+                    "      collecting shares: epoch changed: {} -> {}",
+                    old_epoch, after_old_epoch
+                )?;
+            } else if old_collected_shares != after_old_collected_shares {
+                writeln!(
+                    f,
+                    "      collected shares changed at epoch: {}",
+                    old_epoch
+                )?;
+            }
+        }
+        (
+            CoordinatorOperation::CollectLrtqShares { shares: before, .. },
+            CoordinatorOperation::CollectLrtqShares { shares: after, .. },
+        ) => {
+            if before != after {
+                writeln!(f, "      collected lrtq shares differ")?;
+            }
+        }
+        (
+            CoordinatorOperation::Prepare { prepare_acks: before, .. },
+            CoordinatorOperation::Prepare { prepare_acks: after, .. },
+        ) => {
+            if before != after {
+                writeln!(f, "      received prepare acks differ")?;
+            }
+        }
+        (before, after) => {
+            writeln!(
+                f,
+                "      coordinator operation changed: {} -> {}",
+                before.name(),
+                after.name()
+            )?;
+        }
+    }
+
+    Ok(())
+}
diff --git a/trust-quorum/tests/cluster.rs b/trust-quorum/tests/cluster.rs
index 9bc7da94c65..e09092925c1 100644
--- a/trust-quorum/tests/cluster.rs
+++ b/trust-quorum/tests/cluster.rs
@@ -5,10 +5,8 @@
 //! Property based test driving multiple trust quorum nodes
 
 use daft::Diffable;
-use iddqd::id_ord_map::RefMut;
-use iddqd::{IdOrdItem, IdOrdMap, id_upcast};
+use dropshot::test_util::log_prefix_for_test;
 use omicron_test_utils::dev::test_setup_log;
-use omicron_uuid_kinds::RackUuid;
 use prop::sample::Index;
 use proptest::collection::{btree_set, size_range};
 use proptest::prelude::*;
@@ -17,265 +15,18 @@ use slog::{Logger, info, o};
 use std::collections::{BTreeMap, BTreeSet};
 use test_strategy::{Arbitrary, proptest};
 use trust_quorum::{
-    Configuration, CoordinatorOperation, Envelope, Epoch, Node, NodeCallerCtx,
-    NodeCommonCtx, NodeCtx, PeerMsgKind, PlatformId, ReconfigureMsg, Threshold,
+    CoordinatorOperation, Epoch, NodeCommonCtx, PlatformId, Threshold,
+};
+use trust_quorum_test_utils::TqState;
+use trust_quorum_test_utils::{
+    Event, EventLog,
+    nexus::{NexusConfig, NexusOp, NexusReply},
 };
-
-/// The system under test
-///
-/// This is our real code.
-pub struct Sut {
-    /// All nodes in the member universe
-    pub nodes: BTreeMap<PlatformId, (Node, NodeCtx)>,
-}
-
-impl Sut {
-    pub fn new(log: &Logger, universe: Vec<PlatformId>) -> Sut {
-        let nodes = universe
-            .into_iter()
-            .map(|id| {
-                let mut ctx = NodeCtx::new(id.clone());
-                let node = Node::new(log, &mut ctx);
-                (id, (node, ctx))
-            })
-            .collect();
-        Sut { nodes }
-    }
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub enum NexusOp {
-    Committed,
-    Aborted,
-    Preparing,
-}
-
-/// A single nexus configuration
-#[derive(Debug)]
-pub struct NexusConfig {
-    op: NexusOp,
-    epoch: Epoch,
-    last_committed_epoch: Option<Epoch>,
-    coordinator: PlatformId,
-    members: BTreeSet<PlatformId>,
-    // This is our `K` parameter
-    threshold: Threshold,
-
-    // This is our `Z` parameter.
-    //
-    // Nexus can commit when it has seen K+Z prepare acknowledgements
-    //
-    // Only nexus needs to know this value since it alone determines when a
-    // commit may occur.
-    commit_crash_tolerance: u8,
-
-    prepared_members: BTreeSet<PlatformId>,
-    committed_members: BTreeSet<PlatformId>,
-}
-
-impl NexusConfig {
-    pub fn new(
-        epoch: Epoch,
-        last_committed_epoch: Option<Epoch>,
-        coordinator: PlatformId,
-        members: BTreeSet<PlatformId>,
-        threshold: Threshold,
-    ) -> NexusConfig {
-        // We want a few extra nodes beyond `threshold` to ack before we commit.
-        // This is the number of nodes that can go offline while still allowing
-        // an unlock to occur.
-        let commit_crash_tolerance = match members.len() - threshold.0 as usize
-        {
-            0..=1 => 0,
-            2..=4 => 1,
-            5..=7 => 2,
-            _ => 3,
-        };
-        NexusConfig {
-            op: NexusOp::Preparing,
-            epoch,
-            last_committed_epoch,
-            coordinator,
-            members,
-            threshold,
-            commit_crash_tolerance,
-            prepared_members: BTreeSet::new(),
-            committed_members: BTreeSet::new(),
-        }
-    }
-
-    pub fn to_reconfigure_msg(&self, rack_id: RackUuid) -> ReconfigureMsg {
-        ReconfigureMsg {
-            rack_id,
-            epoch: self.epoch,
-            last_committed_epoch: self.last_committed_epoch,
-            members: self.members.clone(),
-            threshold: self.threshold,
-        }
-    }
-
-    // Are there enough prepared members to commit?
-    pub fn can_commit(&self) -> bool {
-        self.prepared_members.len()
-            >= (self.threshold.0 + self.commit_crash_tolerance) as usize
-    }
-}
-
-impl IdOrdItem for NexusConfig {
-    type Key<'a> = Epoch;
-
-    fn key(&self) -> Self::Key<'_> {
-        self.epoch
-    }
-
-    id_upcast!();
-}
-
-/// A model of Nexus's view of the world during the test
-pub struct NexusState {
-    // No reason to change the rack_id
-    pub rack_id: RackUuid,
-
-    pub configs: IdOrdMap<NexusConfig>,
-}
-
-impl NexusState {
-    pub fn new() -> NexusState {
-        NexusState { rack_id: RackUuid::new_v4(), configs: IdOrdMap::new() }
-    }
-
-    // Create a `ReconfigureMsg` for the latest nexus config
-    pub fn reconfigure_msg_for_latest_config(
-        &self,
-    ) -> (&PlatformId, ReconfigureMsg) {
-        let config = self.configs.iter().last().expect("at least one config");
-        (&config.coordinator, config.to_reconfigure_msg(self.rack_id))
-    }
-
-    /// Abort the latest reconfiguration attempt
-    pub fn abort_reconfiguration(&mut self) {
-        let config = self.configs.iter().last().expect("at least one config");
-        // Can only abort while preparing
-        assert_eq!(config.op, NexusOp::Preparing);
-    }
-
-    pub fn latest_config(&self) -> &NexusConfig {
-        self.configs.iter().last().expect("at least one config")
-    }
-
-    pub fn latest_config_mut(&mut self) -> RefMut<'_, NexusConfig> {
-        self.configs.iter_mut().last().expect("at least one config")
-    }
-
-    pub fn last_committed_config(&self) -> Option<&NexusConfig> {
-        // IdOrdMap doesn't allow reverse iteration.
-        // We therefore iterate through all configs to find the latest committed one.
-        // We could track this out of band but that leaves more room for error.
-        let mut found: Option<&NexusConfig> = None;
-        for c in &self.configs {
-            if c.op == NexusOp::Committed {
-                found = Some(c)
-            }
-        }
-        found
-    }
-}
-
-/// Faults in our system. It's useful to keep these self contained and not
-/// in separate fields in `TestState` so that we can access them all at once
-/// independently of other `TestState` fields.
-#[derive(Default)]
-pub struct Faults {
-    // We allow nodes to crash and restart and therefore track crashed nodes here.
-    //
-    // A crashed node is implicitly disconnected from every other node. We don't
-    // bother storing the pairs in `disconnected_nodes`, but instead check both
-    // fields when necessary.
-    pub crashed_nodes: BTreeSet<PlatformId>,
-
-    /// The set of disconnected nodes
-    pub disconnected_nodes: DisconnectedNodes,
-}
-
-impl Faults {
-    pub fn is_connected(&self, node1: PlatformId, node2: PlatformId) -> bool {
-        !self.crashed_nodes.contains(&node1)
-            && !self.crashed_nodes.contains(&node2)
-            && !self.disconnected_nodes.contains(node1, node2)
-    }
-}
-
-/// For cardinality purposes, we assume all nodes are connected and explicitly
-/// disconnect some of them. This allows us to track and compare much less data.
-#[derive(Default)]
-pub struct DisconnectedNodes {
-    // We sort each pair on insert for quick lookups
-    pairs: BTreeSet<(PlatformId, PlatformId)>,
-}
-
-impl DisconnectedNodes {
-    // Return true if the pair is newly inserted
-    pub fn insert(&mut self, node1: PlatformId, node2: PlatformId) -> bool {
-        assert_ne!(node1, node2);
-
-        let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
-        self.pairs.insert(pair)
-    }
-
-    // Return true if the pair of nodes is disconnected, false otherwise
-    pub fn contains(&self, node1: PlatformId, node2: PlatformId) -> bool {
-        assert_ne!(node1, node2);
-        let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
-        self.pairs.contains(&pair)
-    }
-}
-
-pub enum NexusReply {
-    CommitAck { from: PlatformId, epoch: Epoch },
-}
 
 /// The state of our test
+#[derive(Clone, Diffable)]
 struct TestState {
-    /// A logger for our test
-    pub log: Logger,
-
-    /// Our system under test
-    pub sut: Sut,
-
-    /// All in flight messages between nodes
-    pub bootstrap_network: BTreeMap<PlatformId, Vec<Envelope>>,
-
-    /// All in flight responses to nexus. We don't model the requests, as those
-    /// are `Node` public method calls. But we don't want to synchronously
-    /// update nexus state as a result of those calls, because that ruins any
-    /// possible interleaving with other actions.
-    ///
-    /// This is a way to allow interleaving of nexus replies without changing
-    /// the Node API to accept a separate set of Nexus messages and return
-    /// messages. We may decide that we want to do that, but for now we'll stick
-    /// with a concrete `Node` method based API that is "triggered" by nexus
-    /// messages.
-    pub underlay_network: Vec<NexusReply>,
-
-    /// A model of Nexus's view of the world during the test
-    pub nexus: NexusState,
-
-    /// A cache of our member universe, so we only have to generate it once
-    pub member_universe: Vec<PlatformId>,
-
-    /// All possible system faults in our test
-    pub faults: Faults,
-
-    /// All configurations ever generated by a coordinator.
-    ///
-    /// If an epoch got skipped due to a crashed coordinator then there will not
-    /// be a configuration for that epoch.
-    pub all_coordinated_configs: IdOrdMap<Configuration>,
-
-    /// Expunged nodes cannot be added to a cluster. We never reuse nodes in
-    /// this test. We include nodes here that may not know yet that they have
-    /// been expunged in the `Sut`.
-    pub expunged: BTreeSet<PlatformId>,
+    pub tq_state: TqState,
 
     /// Keep track of the number of generated `Action`s that get skipped
     ///
@@ -287,29 +38,23 @@ struct TestState {
 
 impl TestState {
     pub fn new(log: Logger) -> TestState {
-        let sut = Sut::new(&log, member_universe());
-        TestState {
-            log: log.new(o!("component" => "tq-proptest")),
-            sut,
-            bootstrap_network: BTreeMap::new(),
-            underlay_network: Vec::new(),
-            nexus: NexusState::new(),
-            member_universe: member_universe(),
-            faults: Faults::default(),
-            all_coordinated_configs: IdOrdMap::new(),
-            expunged: BTreeSet::new(),
-            skipped_actions: 0,
-        }
+        TestState { tq_state: TqState::new(log), skipped_actions: 0 }
    }
 
-    pub fn create_nexus_initial_config(
-        &mut self,
+    fn initial_config_event(
+        &self,
         config: GeneratedConfiguration,
-    ) {
+        down_nodes: BTreeSet<usize>,
+    ) -> Event {
+        // `tq_state` doesn't create the member universe until the first event is
+        // applied. We duplicate it here so we can create that initial config
+        // event.
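+        //
+        // (Illustrative note, not part of the test flow: `member_universe(n)`
+        // builds `PlatformId`s with part number "test" and serial numbers
+        // "0" through "n-1", so the generated indexes below select members
+        // deterministically from that fixed list.)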
+        let member_universe =
+            trust_quorum_test_utils::member_universe(MEMBER_UNIVERSE_SIZE);
         let members: BTreeSet<PlatformId> = config
             .members
             .iter()
-            .map(|index| self.member_universe[*index].clone())
+            .map(|index| member_universe[*index].clone())
             .collect();
         let threshold = Threshold(usize::max(
             2,
@@ -319,135 +64,21 @@ impl TestState {
         let coordinator = members.first().cloned().expect("at least one member");
         let last_committed_epoch = None;
-        let nexus_config = NexusConfig::new(
+        let config = NexusConfig::new(
             epoch,
             last_committed_epoch,
             coordinator,
             members,
             threshold,
         );
-        self.nexus.configs.insert_unique(nexus_config).expect("new config");
-    }
-
-    pub fn setup_initial_connections(&mut self, down_nodes: BTreeSet<usize>) {
-        self.faults.crashed_nodes = down_nodes
+        let crashed_nodes = down_nodes
             .into_iter()
-            .map(|index| self.member_universe[index].clone())
+            .map(|index| member_universe[index].clone())
             .collect();
-
-        for (from, (node, ctx)) in self
-            .sut
-            .nodes
-            .iter_mut()
-            .filter(|(id, _)| !self.faults.crashed_nodes.contains(id))
-        {
-            for to in self.member_universe.iter().filter(|id| {
-                !self.faults.crashed_nodes.contains(id) && from != *id
-            }) {
-                node.on_connect(ctx, to.clone());
-            }
-        }
-    }
-
-    /// Send the latest `ReconfigureMsg` from `Nexus` to the coordinator node
-    ///
-    /// If the node is not available, then abort the configuration at nexus
-    pub fn send_reconfigure_msg(&mut self) {
-        let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
-        let epoch_to_config = msg.epoch;
-        if self.faults.crashed_nodes.contains(coordinator) {
-            // We must abort the configuration. This mimics a timeout.
-            self.nexus.abort_reconfiguration();
-        } else {
-            let (node, ctx) = self
-                .sut
-                .nodes
-                .get_mut(coordinator)
-                .expect("coordinator exists");
-
-            node.coordinate_reconfiguration(ctx, msg)
-                .expect("valid configuration");
-
-            // Do we have a `Configuration` for this epoch yet?
-            //
-            // For most reconfigurations, shares for the last committed
-            // configuration must be retrieved before the configuration is
-            // generated and saved in the persistent state.
-            let latest_persisted_config =
-                ctx.persistent_state().latest_config().expect("config exists");
-            if latest_persisted_config.epoch == epoch_to_config {
-                // Save the configuration for later
-                self.all_coordinated_configs
-                    .insert_unique(latest_persisted_config.clone())
-                    .expect("unique");
-            }
-        }
-    }
-
-    /// Check postcondition assertions after initial configuration
-    pub fn postcondition_initial_configuration(
-        &mut self,
-    ) -> Result<(), TestCaseError> {
-        let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
-
-        // The coordinator should have received the `ReconfigureMsg` from Nexus
-        if !self.faults.crashed_nodes.contains(coordinator) {
-            let (node, ctx) = self
-                .sut
-                .nodes
-                .get_mut(coordinator)
-                .expect("coordinator exists");
-            let mut connected_members = 0;
-            // The coordinator should start preparing by sending a `PrepareMsg` to all
-            // connected nodes in the membership set.
-            for member in
-                msg.members.iter().filter(|&id| id != coordinator).cloned()
-            {
-                if self.faults.is_connected(coordinator.clone(), member.clone())
-                {
-                    connected_members += 1;
-                    let msg_found = ctx.envelopes().any(|envelope| {
-                        envelope.to == member
-                            && envelope.from == *coordinator
-                            && matches!(
-                                envelope.msg.kind,
-                                PeerMsgKind::Prepare { .. }
-                            )
-                    });
-                    prop_assert!(msg_found);
-                }
-            }
-            assert_eq!(connected_members, ctx.envelopes().count());
-
-            // The coordinator should be in the prepare phase
-            let cs = node.get_coordinator_state().expect("is coordinating");
-            assert!(matches!(cs.op(), CoordinatorOperation::Prepare { .. }));
-
-            // The persistent state should have changed
-            assert!(ctx.persistent_state_change_check_and_reset());
-            assert!(ctx.persistent_state().has_prepared(msg.epoch));
-            assert!(ctx.persistent_state().latest_committed_epoch().is_none());
-        }
-
-        Ok(())
-    }
-
-    /// Put any outgoing coordinator messages from the latest configuration on the wire
-    pub fn send_envelopes_from_coordinator(&mut self) {
-        let coordinator = {
-            let (coordinator, _) =
-                self.nexus.reconfigure_msg_for_latest_config();
-            coordinator.clone()
-        };
-        self.send_envelopes_from(&coordinator);
-    }
-
-    pub fn send_envelopes_from(&mut self, id: &PlatformId) {
-        let (_, ctx) = self.sut.nodes.get_mut(id).expect("node exists");
-        for envelope in ctx.drain_envelopes() {
-            let msgs =
-                self.bootstrap_network.entry(envelope.to.clone()).or_default();
-            msgs.push(envelope);
+        Event::InitialSetup {
+            member_universe_size: MEMBER_UNIVERSE_SIZE,
+            config,
+            crashed_nodes,
         }
     }
 
@@ -455,174 +86,107 @@ impl TestState {
     pub fn run_actions(
         &mut self,
         actions: Vec<Action>,
+        event_log: &mut EventLog,
     ) -> Result<(), TestCaseError> {
         for action in actions {
-            let skipped = match action {
-                Action::DeliverEnvelopes(indices) => {
-                    self.action_deliver_envelopes(indices)
-                }
-                Action::PollPrepareAcks => self.action_poll_prepare_acks(),
-                Action::Commit(indices) => self.action_commit(indices),
-                Action::DeliverNexusReplies(n) => {
-                    self.action_deliver_nexus_replies(n)
-                }
-                Action::Reconfigure {
-                    num_added_nodes,
-                    removed_nodes,
-                    threshold,
-                    coordinator,
-                } => self.action_reconfigure(
-                    num_added_nodes,
-                    removed_nodes,
-                    threshold,
-                    coordinator,
-                ),
-            };
-
-            if skipped {
-                self.skipped_actions += 1;
-            } else {
+            let events = self.action_to_events(action);
+            for event in &events {
+                event_log.record(event);
+            }
+            let check_invariants = !events.is_empty();
+            for event in events {
+                self.tq_state.apply_event(event);
+            }
+            if check_invariants {
                 self.check_invariants()?;
+            } else {
+                self.skipped_actions += 1;
             }
         }
         Ok(())
     }
 
-    // Deliver network messages to generated destinations
-    fn action_deliver_envelopes(&mut self, indices: Vec<Index>) -> bool {
-        let destinations: Vec<_> =
-            self.bootstrap_network.keys().cloned().collect();
-        if destinations.is_empty() {
-            // nothing to do
-            return true;
-        }
-        for index in indices {
-            let id = index.get(&destinations);
-            if let Some(envelope) =
-                self.bootstrap_network.get_mut(id).unwrap().pop()
-            {
-                let (node, ctx) =
-                    self.sut.nodes.get_mut(id).expect("destination exists");
-                node.handle(ctx, envelope.from, envelope.msg);
-
-                // If this is the first time we've seen a configuration, track it
-                //
-                // We have to do this here because for reconfigurations, shares
-                // for the last committed reconfiguration are gathered before
-                // the config is created. We don't know exactly when config
-                // generation occurs, but know that it happens after envelopes
-                // are delivered, except for configurations that don't have
-                // a last committed config. This is normally the initial
-                // configuration, but can be later ones if the initial config
-                // is aborted.
-                if ctx.persistent_state_change_check_and_reset() {
-                    if let Some(latest_config) =
-                        ctx.persistent_state().latest_config()
-                    {
-                        if !self
-                            .all_coordinated_configs
-                            .contains_key(&latest_config.epoch)
-                        {
-                            // The coordinator must be the first node to create
-                            // the configuration.
-                            assert_eq!(
-                                &latest_config.coordinator,
-                                ctx.platform_id()
-                            );
-
-                            self.all_coordinated_configs
-                                .insert_unique(latest_config.clone())
-                                .expect("unique config");
-                        }
-                    }
-                }
-
-                // Send any messages as a result of handling this message
-                send_envelopes(ctx, &mut self.bootstrap_network);
+    fn action_to_events(&self, action: Action) -> Vec<Event> {
+        match action {
+            Action::DeliverEnvelopes(indices) => {
+                self.action_to_events_deliver_envelopes(indices)
+            }
+            Action::PollPrepareAcks => {
+                self.action_to_events_poll_prepare_acks()
             }
+            Action::Commit(indices) => self.action_to_events_commit(indices),
+            Action::DeliverNexusReplies(n) => {
+                self.action_to_events_deliver_nexus_replies(n)
+            }
+            Action::Reconfigure {
+                num_added_nodes,
+                removed_nodes,
+                threshold,
+                coordinator,
+            } => self.action_to_events_reconfigure(
+                num_added_nodes,
+                removed_nodes,
+                threshold,
+                coordinator,
+            ),
         }
-
-        // Remove any destinations with zero messages in-flight
-        self.bootstrap_network.retain(|_, msgs| !msgs.is_empty());
-
-        false
     }
 
-    // Call `Node::commit_reconfiguration` for nodes that have prepared and have
-    // not yet acked their commit.
-    fn action_commit(&mut self, indices: Vec<Index>) -> bool {
-        let rack_id = self.nexus.rack_id;
-        let latest_config = self.nexus.latest_config();
-        if latest_config.op != NexusOp::Committed {
-            return true;
-        }
-        let committable: Vec<_> = latest_config
-            .prepared_members
-            .difference(&latest_config.committed_members)
-            .collect();
-
-        if committable.is_empty() {
-            // All members have committed
-            self.skipped_actions += 1;
-            return true;
+    fn action_to_events_deliver_envelopes(
+        &self,
+        indices: Vec<Index>,
+    ) -> Vec<Event> {
+        let mut events = vec![];
+        let destinations: Vec<_> =
+            self.tq_state.bootstrap_network.keys().cloned().collect();
+        if destinations.is_empty() {
+            // nothing to do
+            return events;
        }
-        // We shouldn't be calling commit twice or sending multiple replies
-        // to nexus, but a random bunch of indices might result in that. We
-        // therefore track nodes that have committed already.
-        let mut committed: BTreeSet<PlatformId> = BTreeSet::new();
-
+        // Add an event only if there is actually an envelope to send
+        let mut counts = BTreeMap::new();
         for index in indices {
-            let id = *index.get(&committable);
-            if committed.contains(id) {
-                continue;
+            let id = index.get(&destinations);
+            let count = counts.entry(id).or_insert(0usize);
+            *count += 1;
+            let num_envelopes = self
+                .tq_state
+                .bootstrap_network
+                .get(id)
+                .expect("destination exists")
+                .len();
+            if *count <= num_envelopes {
+                events.push(Event::DeliverEnvelope { destination: id.clone() });
             }
-            let (node, ctx) =
-                self.sut.nodes.get_mut(id).expect("destination exists");
-            node.commit_configuration(ctx, rack_id, latest_config.epoch)
-                .expect("commit succeeded");
-            committed.insert(id.clone());
-        }
-
-        let epoch = latest_config.epoch;
-        for from in committed {
-            self.underlay_network.push(NexusReply::CommitAck { from, epoch });
         }
-        false
-    }
 
-    fn action_deliver_nexus_replies(&mut self, n: usize) -> bool {
-        let mut config = self.nexus.latest_config_mut();
-        let n = usize::min(n, self.underlay_network.len());
-        for reply in self.underlay_network.drain(0..n) {
-            match reply {
-                NexusReply::CommitAck { from, epoch } => {
-                    if config.epoch == epoch {
-                        config.committed_members.insert(from);
-                    }
-                }
-            }
-        }
-        false
+        events
     }
 
-    /// Poll the coordinator for acks if nexus is preparing, and commit
-    /// if enough acks have been received.
-    fn action_poll_prepare_acks(&mut self) -> bool {
-        let mut latest_config = self.nexus.latest_config_mut();
+    fn action_to_events_poll_prepare_acks(&self) -> Vec<Event> {
+        let mut events = vec![];
+        let latest_config = self.tq_state.nexus.latest_config();
         if latest_config.op != NexusOp::Preparing {
             // No point in checking. Commit or abort has occurred.
-            return true;
+            return events;
         }
 
         // If the coordinator has crashed then Nexus should abort.
         // Crashing is not actually implemented yet, but it will be.
-        if self.faults.crashed_nodes.contains(&latest_config.coordinator) {
-            latest_config.op = NexusOp::Aborted;
+        if self
+            .tq_state
+            .faults
+            .crashed_nodes
+            .contains(&latest_config.coordinator)
+        {
+            events.push(Event::AbortConfiguration(latest_config.epoch));
+            return events;
         }
 
         // Lookup the coordinator node
         let (coordinator, ctx) = self
+            .tq_state
             .sut
             .nodes
             .get(&latest_config.coordinator)
@@ -635,7 +199,7 @@ impl TestState {
             .latest_config()
             .map_or(Epoch(0), |c| c.epoch);
         if coordinator_epoch != latest_config.epoch {
-            return true;
+            return events;
         }
 
         // Poll the coordinator for acks.
@@ -644,68 +208,66 @@ impl TestState {
         // crashed and nexus is still preparing.
        //
         // In a real system this request would go over the network, but would
-        // end up at the same place. It's not apparent that its worth the
-        // complexity here to delay poll replies to Nexus, but we can do that
-        // if necessary and then deliver them when the `DeliverNexusReplies`
-        // action fires.
+        // end up at the same place.
         let cs = coordinator
             .get_coordinator_state()
             .expect("coordinator is coordinating");
-        latest_config.prepared_members.extend(cs.op().acked_prepares());
-
-        // Commit if possible
-        if latest_config.can_commit() {
-            info!(self.log, "nexus committed";
-                "epoch" => %latest_config.epoch,
-                "coordinator" => %latest_config.coordinator
-            );
-
-            latest_config.op = NexusOp::Committed;
-
-            let new_members = latest_config.members.clone();
-            let new_epoch = latest_config.epoch;
+        // Put the reply on the network
+        events.push(Event::SendNexusReplyOnUnderlay(
+            NexusReply::AckedPreparesFromCoordinator {
+                epoch: coordinator_epoch,
+                acks: cs.op().acked_prepares(),
+            },
+        ));
+        events
+    }
 
-            // Expunge any removed nodes from the last committed configuration
-            if let Some(last_committed_epoch) =
-                latest_config.last_committed_epoch
-            {
-                // Release our mutable borrow
-                drop(latest_config);
+    fn action_to_events_commit(&self, indices: Vec<Index>) -> Vec<Event> {
+        let mut events = vec![];
+        let latest_config = self.tq_state.nexus.latest_config();
+        if latest_config.op != NexusOp::Committed {
+            return events;
+        }
+        let committable: Vec<_> = latest_config
+            .prepared_members
+            .difference(&latest_config.committed_members)
+            .collect();
 
-                let last_committed_config = self
-                    .nexus
-                    .configs
-                    .get(&last_committed_epoch)
-                    .expect("config exists");
+        if committable.is_empty() {
+            return events;
+        }
 
-                let expunged = last_committed_config
-                    .members
-                    .difference(&new_members)
-                    .cloned();
+        // De-duplicate the Index->PlatformId mapping
+        let mut nodes: BTreeSet<PlatformId> = BTreeSet::new();
+        for index in indices {
+            let id = *index.get(&committable);
+            nodes.insert(id.clone());
+        }
+        for node in nodes {
+            events.push(Event::CommitConfiguration(node));
+        }
+        events
+    }
 
-                for e in expunged {
-                    info!(
-                        self.log,
-                        "expunged node";
-                        "epoch" => %new_epoch,
-                        "platform_id" => %e);
-                    self.expunged.insert(e);
-                }
-            }
+    fn action_to_events_deliver_nexus_replies(&self, n: usize) -> Vec<Event> {
+        let mut events = vec![];
+        let n = usize::min(n, self.tq_state.underlay_network.len());
+        for _ in 0..n {
+            events.push(Event::DeliverNexusReply);
         }
-        false
+        events
     }
 
-    fn action_reconfigure(
-        &mut self,
+    fn action_to_events_reconfigure(
+        &self,
         num_added_nodes: usize,
         removed_nodes: Vec<Index>,
         threshold: Index,
         coordinator: Selector,
-    ) -> bool {
-        let latest_epoch = self.nexus.latest_config().epoch;
-        let last_committed_config = self.nexus.last_committed_config();
+    ) -> Vec<Event> {
+        let latest_epoch = self.tq_state.nexus.latest_config().epoch;
+        let last_committed_config = self.tq_state.nexus.last_committed_config();
         // We must leave at least one node available to coordinate between the
         // new and old configurations.
         let (new_members, coordinator) = match last_committed_config {
@@ -720,7 +282,7 @@ impl TestState {
                 let num_nodes_to_add = usize::min(
                     MEMBER_UNIVERSE_SIZE
                         - c.members.len()
-                        - self.expunged.len(),
+                        - self.tq_state.expunged.len(),
                     possible_num_nodes_to_add,
                 );
 
@@ -737,7 +299,7 @@ impl TestState {
                 // We can only start a reconfiguration if Nexus has an
                 // acknowledgement that at least one node has seen the commit.
                 if c.committed_members.is_empty() {
-                    return true;
+                    return vec![];
                 }
 
                 let coordinator = coordinator.select(c.committed_members.iter());
@@ -762,11 +324,13 @@ impl TestState {
                 // Just pick the first set of nodes in `member_universe`
                 // that are not in the current membership and not expunged.
                 let mut nodes_to_add = BTreeSet::new();
-                for id in self.member_universe.iter() {
+                for id in self.tq_state.member_universe.iter() {
                     if nodes_to_add.len() == num_nodes_to_add {
                         break;
                     }
-                    if !self.expunged.contains(id) && !c.members.contains(id) {
+                    if !self.tq_state.expunged.contains(id)
+                        && !c.members.contains(id)
+                    {
                         nodes_to_add.insert(id.clone());
                     }
                 }
@@ -785,11 +349,12 @@ impl TestState {
                 // We are generating a new config
                 if num_added_nodes < MIN_CLUSTER_SIZE {
                     // Nothing to do here.
-                    return true;
+                    return vec![];
                }
                 // Pick the first `num_added_nodes` from member_universe
                 // It's as good a choice as any and deterministic
                 let new_members: BTreeSet<_> = self
+                    .tq_state
                     .member_universe
                     .iter()
                     .take(num_added_nodes)
@@ -819,9 +384,7 @@ impl TestState {
             new_members,
             threshold,
         );
-        self.nexus.configs.insert_unique(nexus_config).expect("new config");
-        self.send_reconfigure_msg();
-        false
+        vec![Event::Reconfigure(nexus_config)]
     }
 
     /// At every point during the running of the test, invariants over the system
@@ -845,8 +408,9 @@ impl TestState {
     fn invariant_all_nodes_have_same_configuration_per_epoch(
         &self,
     ) -> Result<(), TestCaseError> {
-        for (id, (_, ctx)) in &self.sut.nodes {
+        for (id, (_, ctx)) in &self.tq_state.sut.nodes {
            let diff = self
+                .tq_state
                 .all_coordinated_configs
                 .diff(&ctx.persistent_state().configs);
             // No new configs exist
@@ -872,8 +436,9 @@ impl TestState {
         &self,
     ) -> Result<(), TestCaseError> {
         let (acked, epoch) = {
-            let latest_config = self.nexus.latest_config();
+            let latest_config = self.tq_state.nexus.latest_config();
             let (node, _) = self
+                .tq_state
                 .sut
                 .nodes
                 .get(&latest_config.coordinator)
@@ -900,7 +465,8 @@ impl TestState {
 
         // Make sure the coordinator actually is coordinating for this epoch
         for id in acked {
-            let (_, ctx) = self.sut.nodes.get(&id).expect("node exists");
+            let (_, ctx) =
+                self.tq_state.sut.nodes.get(&id).expect("node exists");
             prop_assert!(ctx.persistent_state().has_prepared(epoch));
         }
 
@@ -916,13 +482,14 @@ impl TestState {
     fn invariant_nodes_have_committed_if_nexus_has_acks(
         &self,
     ) -> Result<(), TestCaseError> {
-        let latest_config = self.nexus.latest_config();
+        let latest_config = self.tq_state.nexus.latest_config();
         if latest_config.op != NexusOp::Committed {
             return Ok(());
         }
 
         for id in &latest_config.committed_members {
-            let (_, ctx) = self.sut.nodes.get(&id).expect("node exists");
+            let (_, ctx) =
+                self.tq_state.sut.nodes.get(&id).expect("node exists");
             let ps = ctx.persistent_state();
             prop_assert!(ps.commits.contains(&latest_config.epoch));
             prop_assert!(ps.has_prepared(latest_config.epoch));
@@ -943,7 +510,7 @@ impl TestState {
     fn invariant_nodes_not_coordinating_and_computing_key_share_simultaneously(
         &self,
     ) -> Result<(), TestCaseError> {
-        for (id, (node, _)) in &self.sut.nodes {
+        for (id, (node, _)) in &self.tq_state.sut.nodes {
             prop_assert!(
                 !(node.get_coordinator_state().is_some()
                     && node.is_computing_key_share()),
@@ -957,7 +524,7 @@ impl TestState {
 
     // Ensure there has been no alarm at any node
     fn invariant_no_alarms(&self) -> Result<(), TestCaseError> {
-        for (id, (_, ctx)) in &self.sut.nodes {
+        for (id, (_, ctx)) in &self.tq_state.sut.nodes {
             let alarms = ctx.alarms();
             prop_assert!(
                 alarms.is_empty(),
@@ -970,18 +537,6 @@ impl TestState {
     }
 }
 
-/// Broken out of `TestState` to alleviate borrow checker woes
-fn send_envelopes(
-    ctx: &mut NodeCtx,
-    bootstrap_network: &mut BTreeMap<PlatformId, Vec<Envelope>>,
-) {
-    for envelope in ctx.drain_envelopes() {
-        let envelopes =
-            bootstrap_network.entry(envelope.to.clone()).or_default();
-        envelopes.push(envelope);
-    }
-}
-
 // A high-level set of generated actions to drive the test forward.
 #[derive(Debug, Arbitrary)]
 #[allow(clippy::large_enum_variant)]
@@ -1056,13 +611,7 @@ pub struct GeneratedConfiguration {
     /// still be duplicated due to the shift implementation used. Therefore we
     /// instead just choose from a constrained set of usize values that we can
     /// use directly as indexes into our fixed size structure for all tests.
-    ///
-    /// Note that we intentionally set the max set size to MAX_CLUSTER_SIZE-1.
-    /// This is because we always want to include the coordinator in the
-    /// configuration, but its value may not be chosen randomly. In this case,
-    /// we have to add it to the actual membership set we generate from this
-    /// configuration with [`TestState::generated_config_to_reconfigure_msg`].
-    #[strategy(btree_set(0..=MEMBER_UNIVERSE_SIZE, MIN_CLUSTER_SIZE..MAX_CLUSTER_SIZE))]
+    #[strategy(btree_set(0..MEMBER_UNIVERSE_SIZE, MIN_CLUSTER_SIZE..MAX_CLUSTER_SIZE))]
     pub members: BTreeSet<usize>,
 
     /// An index is roughly equivalent to a threshold, since a threshold cannot
@@ -1073,20 +622,13 @@ pub struct GeneratedConfiguration {
     pub threshold: Index,
 }
 
-/// All possible members used in a test
-fn member_universe() -> Vec<PlatformId> {
-    (0..=MEMBER_UNIVERSE_SIZE)
-        .map(|serial| PlatformId::new("test".into(), serial.to_string()))
-        .collect()
-}
-
 #[derive(Debug, Arbitrary)]
 pub struct TestInput {
     initial_config: GeneratedConfiguration,
 
     // We choose a set of nodes to be crashed, resulting in them being
     // disconnected from every other node.
-    #[strategy(btree_set(0..=MEMBER_UNIVERSE_SIZE, 0..MAX_INITIAL_DOWN_NODES))]
+    #[strategy(btree_set(0..MEMBER_UNIVERSE_SIZE, 0..MAX_INITIAL_DOWN_NODES))]
     initial_down_nodes: BTreeSet<usize>,
     #[any(size_range(MIN_ACTIONS..MAX_ACTIONS).lift())]
     actions: Vec<Action>,
 }
 
 #[proptest]
 fn test_trust_quorum_protocol(input: TestInput) {
     let logctx = test_setup_log("test_trust_quorum_protocol");
+    let (parent_dir, prefix) = log_prefix_for_test(logctx.test_name());
+    let event_log_path = parent_dir.join(format!("{prefix}-events.json"));
+    let mut event_log = EventLog::new(&event_log_path);
 
-    let mut state = TestState::new(logctx.log.clone());
+    let log = logctx.log.new(o!("component" => "tq-proptest"));
+    let mut state = TestState::new(log.clone());
 
     // Perform the initial setup
-    state.create_nexus_initial_config(input.initial_config);
-    state.setup_initial_connections(input.initial_down_nodes);
-    state.send_reconfigure_msg();
-
-    // Check the results of the initial setup
-    state.postcondition_initial_configuration()?;
-
-    // Put the coordinator's outgoing messages on the wire if there are any
-    state.send_envelopes_from_coordinator();
+    let event = state
+        .initial_config_event(input.initial_config, input.initial_down_nodes);
+    event_log.record(&event);
+    state.tq_state.apply_event(event);
 
     // Start executing the actions
-    state.run_actions(input.actions)?;
+    state.run_actions(input.actions, &mut event_log)?;
 
     info!(
-        state.log,
+        log,
         "Test complete";
         "skipped_actions" => state.skipped_actions
     );
+    // let _ = std::fs::remove_file(event_log_path);
 
     logctx.cleanup_successful();
 }
diff --git a/trust-quorum/tqdb/Cargo.toml b/trust-quorum/tqdb/Cargo.toml
new file mode 100644
index 00000000000..4436cc99fbc
--- /dev/null
+++ b/trust-quorum/tqdb/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "tqdb"
+version = "0.1.0"
+edition = "2021"
+license = "MPL-2.0"
+
+[lints]
+workspace = true
+
+[dependencies]
+anyhow.workspace = true
+camino.workspace = true
+clap.workspace = true
+colored.workspace = true
+daft.workspace = true
+iddqd.workspace = true
+omicron-repl-utils.workspace = true
+reedline.workspace = true
+reconfigurator-cli.workspace = true
+serde_json.workspace = true
+slog.workspace = true
+tabled.workspace = true
+trust-quorum = { workspace = true, features = ["danger_partial_eq_ct_wrapper"] }
+trust-quorum-test-utils.workspace = true
+
+omicron-workspace-hack.workspace = true
+
+[[bin]]
+name = "tqdb"
+path = "src/bin/tqdb/main.rs"
diff --git a/trust-quorum/tqdb/src/bin/tqdb/main.rs b/trust-quorum/tqdb/src/bin/tqdb/main.rs
new file mode 100644
index 00000000000..8778c45bdba
--- /dev/null
+++ b/trust-quorum/tqdb/src/bin/tqdb/main.rs
@@ -0,0 +1,719 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! The Trust Quorum Debugger
+//!
+//! Capable of executing and stepping through event streams generated by
+//! trust quorum proptests.
+
+use anyhow::{Context, bail};
+use camino::Utf8PathBuf;
+use clap::{Args, Parser, Subcommand};
+use daft::Diffable;
+use omicron_repl_utils::run_repl_on_stdin_customized;
+use reconfigurator_cli::LogCapture;
+use reedline::{
+    ColumnarMenu, DefaultCompleter, DefaultPrompt, DefaultPromptSegment, Emacs,
+    FileBackedHistory, KeyCode, KeyModifiers, MenuBuilder, Reedline,
+    ReedlineEvent, default_emacs_keybindings,
+};
+use slog::{Logger, o};
+use std::collections::{BTreeMap, BTreeSet};
+use std::fmt::Write;
+use std::fs;
+use std::io::IsTerminal;
+use tabled::Tabled;
+use trust_quorum::PlatformId;
+use trust_quorum_test_utils::{Event, TqState};
+
+fn main() -> Result<(), anyhow::Error> {
+    let repl = TqdbRepl {};
+    repl.exec()
+}
+
+/// Internal debugger state
+pub struct Tqdb {
+    event_log_path: Option<Utf8PathBuf>,
+
+    events: Vec<Event>,
+
+    // Current state of the trust-quorum
+    current_state: TqState,
+
+    // Index of the next event to be applied
+    next_event: usize,
+
+    // All set breakpoints at log event index
+    breakpoints: BTreeSet<usize>,
+
+    // All snapshots ever taken.
+    //
+    // Snapshots are taken automatically when a breakpoint is hit.
+    snapshots: BTreeMap<usize, TqState>,
+
+    // Snapshot requests for events that haven't yet been applied
+    pending_snapshots: BTreeSet<usize>,
+}
+
+impl Tqdb {
+    pub fn new(log: &Logger) -> Self {
+        let log = log.new(o!("component" => "tqdb"));
+        Tqdb {
+            event_log_path: None,
+            events: vec![],
+            current_state: TqState::new(log),
+            next_event: 0,
+            breakpoints: BTreeSet::new(),
+            snapshots: BTreeMap::new(),
+            pending_snapshots: BTreeSet::new(),
+        }
+    }
+
+    pub fn reset_state(&mut self) {
+        let Tqdb {
+            event_log_path,
+            events,
+            current_state,
+            next_event,
+            breakpoints,
+            snapshots,
+            pending_snapshots,
+        } = self;
+        let log = current_state.log.clone();
+
+        *event_log_path = None;
+        *events = vec![];
+        *current_state = TqState::new(log);
+        *next_event = 0;
+        *breakpoints = BTreeSet::new();
+        *snapshots = BTreeMap::new();
+        *pending_snapshots = BTreeSet::new();
+    }
+
+    pub fn toggle_breakpoint(&mut self, index: usize) -> anyhow::Result<bool> {
+        if index >= self.events.len() {
+            bail!(
+                "Invalid event index: {index}. Only {} total events.",
+                self.events.len()
+            );
+        }
+        if !self.breakpoints.remove(&index) {
+            let _ = self.breakpoints.insert(index);
+            Ok(true)
+        } else {
+            Ok(false)
+        }
+    }
+
+    // Reset the state to the beginning of time
+    //
+    // Don't remove any breakpoints, snapshots, or pending snapshots
+    pub fn rewind(&mut self) {
+        let Tqdb {
+            event_log_path: _,
+            events: _,
+            current_state,
+            next_event,
+            breakpoints: _,
+            snapshots: _,
+            pending_snapshots: _,
+        } = self;
+
+        *current_state = TqState::new(current_state.log.clone());
+        *next_event = 0;
+    }
+
+    pub fn breakpoints(&self) -> &BTreeSet<usize> {
+        &self.breakpoints
+    }
+
+    pub fn maybe_snapshot(&mut self) {
+        if self.next_event == 0 {
+            return;
+        }
+        let curr_event = self.next_event - 1;
+        if self.pending_snapshots.remove(&curr_event) {
+            self.snapshots.insert(curr_event, self.current_state.clone());
+        }
+    }
+}
+
+/// Interactive REPL for our trust quorum debugger
+pub struct TqdbRepl {}
+
+impl TqdbRepl {
+    /// Execute the command.
+    pub fn exec(self) -> anyhow::Result<()> {
+        let (log_capture, log) =
+            LogCapture::new(std::io::stdout().is_terminal());
+
+        let mut tqdb = Tqdb::new(&log);
+
+        let mut completer = Box::new(DefaultCompleter::with_inclusions(&['-']));
+        completer.insert(Self::commands());
+        let completion_menu =
+            Box::new(ColumnarMenu::default().with_name("commands"));
+        let mut keybindings = default_emacs_keybindings();
+        keybindings.add_binding(
+            KeyModifiers::NONE,
+            KeyCode::Tab,
+            ReedlineEvent::UntilFound(vec![
+                ReedlineEvent::Menu("commands".to_string()),
+                ReedlineEvent::MenuNext,
+            ]),
+        );
+        let edit_mode = Box::new(Emacs::new(keybindings));
+
+        let history = Box::new(
+            FileBackedHistory::with_file(
+                10000,
+                "/tmp/.tqdb-history.txt".into(),
+            )
+            .expect("Error configuring history with file"),
+        );
+
+        let ed = Reedline::create()
+            .with_history(history)
+            .with_completer(completer)
+            .with_menu(reedline::ReedlineMenu::EngineCompleter(completion_menu))
+            .with_edit_mode(edit_mode);
+
+        let prompt = DefaultPrompt::new(
+            DefaultPromptSegment::Basic("tqdb".into()),
+            DefaultPromptSegment::Empty,
+        );
+
+        run_repl_on_stdin_customized(ed, &prompt, &mut |cmd: TopLevelArgs| {
+            process_command(&mut tqdb, cmd, &log_capture)
+        })
+    }
+
+    // Update this with each new subcommand
+    fn commands() -> Vec<String> {
+        // This assignment and match exist solely to ensure we update our list
+        // when we add or remove a command.
+        let c = Commands::Run;
+        match c {
+            Commands::Open { .. }
+            | Commands::Run
+            | Commands::Step { .. }
+            | Commands::BreakpointToggle { .. }
+            | Commands::BreakpointList
+            | Commands::Snapshot { .. }
+            | Commands::SnapshotList
+            | Commands::SnapshotListPending
+            | Commands::SnapshotShow { .. }
+            | Commands::Diff { .. }
+            | Commands::NodeShow { .. }
+            | Commands::Rewind
+            | Commands::Events(_)
+            | Commands::Show
+            | Commands::Summary => {}
+        }
+
+        [
+            "open",
+            "run",
+            "step",
+            "breakpoint-toggle",
+            "breakpoint-list",
+            "snapshot",
+            "snapshot-list",
+            "snapshot-list-pending",
+            "snapshot-show",
+            "diff",
+            "node-show",
+            "rewind",
+            "events",
+            "show",
+            "summary",
+        ]
+        .into_iter()
+        .map(Into::into)
+        .collect()
+    }
+}
+
+/// Arguments for our debugger REPL
+#[derive(Parser, Debug)]
+struct TopLevelArgs {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Debug, Subcommand)]
+enum Commands {
+    /// open event log file
+    #[clap(alias = "o")]
+    Open {
+        /// path to the event log file
+        path: Utf8PathBuf,
+    },
+    /// apply all events until completion or a breakpoint
+    #[clap(alias = "r")]
+    Run,
+    /// step over n events by applying them
+    ///
+    /// This command steps over breakpoints. Use `run` if you want to stop at
+    /// breakpoints.
+    #[clap(alias = "s")]
+    Step {
+        /// number of events to apply, 1 if not given
+        num_events: Option<usize>,
+    },
+
+    /// toggle a breakpoint at a given event
+    #[clap(alias = "b")]
+    BreakpointToggle {
+        /// index of the event in the log to apply the breakpoint to
+        index: usize,
+    },
+    /// display all existing breakpoints
+    BreakpointList,
+
+    /// take a snapshot of the current state, or at the given event when reached
+    Snapshot {
+        /// index of the event at which to take the snapshot
+        index: Option<usize>,
+    },
+    /// display all existing snapshots
+    SnapshotList,
+    /// show a snapshot for the given event
+    SnapshotShow {
+        /// index of the event where the snapshot was taken
+        index: usize,
+    },
+    /// list all pending snapshots
+    SnapshotListPending,
+
+    /// show the difference between the current state and the snapshot
+    /// or two snapshots if two indexes are given
+    #[clap(alias = "d")]
+    Diff {
+        /// The event log index of where the snapshot was taken
+        snapshot1: usize,
+
+        /// An optional second snapshot index
+        snapshot2: Option<usize>,
+    },
+
+    /// display the current state of a SUT node and its context
+    NodeShow {
+        /// The serial number of the node to print
+        serial: usize,
+    },
+
+    /// Reset the state to the beginning of the trace
+    ///
+    /// This does not remove breakpoints, snapshots, or pending snapshots
+    Rewind,
+
+    /// show the full state of the system
+    Show,
+
+    /// print an overview of the current state of the system
+    Summary,
+
+    /// display log entries - next entry by default
+    Events(EventsArgs),
+}
+
+#[derive(Debug, Args)]
+struct EventsArgs {
+    #[clap(subcommand)]
+    command: Option<EventsCommand>,
+}
+
+#[derive(Debug, Subcommand)]
+pub enum EventsCommand {
+    All,
+    Next { num_events: usize },
+    Range { start: usize, end: usize },
+}
+
+/// Processes one "line" of user input.
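+///
+/// An illustrative session (the file name and indexes below are
+/// hypothetical):
+///
+/// ```text
+/// tqdb> open events.json
+/// tqdb> breakpoint-toggle 10
+/// tqdb> run        -- applies events 0..=9, stopping before event 10
+/// tqdb> snapshot   -- snapshots the current state, keyed at event 9
+/// tqdb> step       -- applies event 10
+/// tqdb> diff 9     -- diffs snapshot 9 against the current state
+/// ```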
+fn process_command(
+    tqdb: &mut Tqdb,
+    cmd: TopLevelArgs,
+    logs: &LogCapture,
+) -> anyhow::Result<Option<String>> {
+    let TopLevelArgs { command } = cmd;
+    let cmd_result = match command {
+        Commands::Open { path } => cmd_open(tqdb, path),
+        Commands::Run {} => cmd_run(tqdb),
+        Commands::Step { num_events } => cmd_step(tqdb, num_events),
+        Commands::BreakpointToggle { index } => {
+            cmd_toggle_breakpoint(tqdb, index)
+        }
+        Commands::BreakpointList {} => cmd_breakpoint_list(tqdb),
+        Commands::Diff { snapshot1, snapshot2 } => {
+            cmd_diff(tqdb, snapshot1, snapshot2)
+        }
+        Commands::Snapshot { index } => cmd_snapshot(tqdb, index),
+        Commands::SnapshotList {} => cmd_snapshot_list(tqdb),
+        Commands::SnapshotListPending => cmd_snapshot_list_pending(tqdb),
+        Commands::SnapshotShow { index } => cmd_snapshot_show(tqdb, index),
+        Commands::NodeShow { serial } => cmd_node_show(tqdb, serial),
+        Commands::Rewind => cmd_rewind(tqdb),
+        Commands::Show => cmd_show(tqdb),
+        Commands::Events(args) => cmd_log_show(tqdb, args),
+        Commands::Summary {} => cmd_summary(tqdb),
+    };
+
+    for line in logs.take_log_lines() {
+        println!("{line}");
+    }
+
+    cmd_result
+}
+
+/// Open an event log file for debugging
+fn cmd_open(
+    tqdb: &mut Tqdb,
+    path: Utf8PathBuf,
+) -> anyhow::Result<Option<String>> {
+    tqdb.reset_state();
+    let json = fs::read_to_string(&path).context(path.clone())?;
+    let events: Vec<Event> = serde_json::from_str(&json)
+        .context("failed to deserialize event log")?;
+    tqdb.event_log_path = Some(path.clone());
+    tqdb.events = events;
+    Ok(Some(format!("loaded event log: {path}\n{} events.", tqdb.events.len())))
+}
+
+/// Apply all events until completion or a breakpoint
+fn cmd_run(tqdb: &mut Tqdb) -> anyhow::Result<Option<String>> {
+    if tqdb.event_log_path.is_none() {
+        bail!("please open an event log file");
+    }
+
+    let mut num_events = 0;
+    if tqdb.next_event < tqdb.events.len() {
+        let end = tqdb
+            .breakpoints
+            .iter()
+            .cloned()
+            .find(|&i| i > tqdb.next_event)
+            .unwrap_or(tqdb.events.len());
+        let events: Vec<_> =
+            tqdb.events[tqdb.next_event..end].iter().cloned().collect();
+        for event in events {
+            tqdb.current_state.apply_event(event);
+            num_events += 1;
+            tqdb.next_event += 1;
+            tqdb.maybe_snapshot();
+        }
+    }
+
+    let output = if tqdb.next_event == tqdb.events.len() {
+        format!("done: applied {} events", num_events)
+    } else {
+        format!(
+            "stopped at breakpoint {} after applying {} events",
+            tqdb.next_event, num_events
+        )
+    };
+    Ok(Some(output))
+}
+
+// Step through each event by applying them
+fn cmd_step(
+    tqdb: &mut Tqdb,
+    num_events: Option<usize>,
+) -> anyhow::Result<Option<String>> {
+    if tqdb.event_log_path.is_none() {
+        bail!("please open an event log file");
+    }
+
+    let num_events = num_events.unwrap_or(1);
+
+    let end = tqdb.next_event + num_events;
+    if end > tqdb.events.len() {
+        bail!("Number of events to step exceeds remaining events");
+    }
+
+    let mut s = String::new();
+    let mut applied_events = 0;
+    let events: Vec<_> =
+        tqdb.events[tqdb.next_event..end].iter().cloned().collect();
+    for event in events {
+        writeln!(&mut s, "{} {event:#?}", tqdb.next_event)?;
+        tqdb.current_state.apply_event(event.clone());
+        applied_events += 1;
+        tqdb.next_event += 1;
+        tqdb.maybe_snapshot();
+    }
+    writeln!(&mut s, "done: applied {} events", applied_events)?;
+    Ok(Some(s))
+}
+
+fn cmd_toggle_breakpoint(
+    tqdb: &mut Tqdb,
+    index: usize,
+) -> anyhow::Result<Option<String>> {
+    let output = if tqdb.toggle_breakpoint(index)? {
+        format!("breakpoint set at event {index}")
+    } else {
+        format!("breakpoint removed at event {index}")
+    };
+    Ok(Some(output))
+}
+
+fn cmd_breakpoint_list(tqdb: &mut Tqdb) -> anyhow::Result<Option<String>> {
+    #[derive(Tabled)]
+    #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+    struct Breakpoint {
+        index: usize,
+        event: String,
+    }
+
+    let rows = tqdb.breakpoints.iter().map(|i| Breakpoint {
+        index: *i,
+        event: format!("{:#?}", tqdb.events[*i]),
+    });
+
+    let table = tabled::Table::new(rows)
+        .with(tabled::settings::Style::empty())
+        .with(tabled::settings::Padding::new(0, 1, 0, 0))
+        .to_string();
+    Ok(Some(table))
+}
+
+fn cmd_snapshot_list(tqdb: &mut Tqdb) -> anyhow::Result<Option<String>> {
+    let output = if tqdb.snapshots.is_empty() {
+        "no snapshots present".to_string()
+    } else {
+        let mut s = String::new();
+        writeln!(&mut s, "Snapshot indexes: ")?;
+        for i in tqdb.snapshots.keys() {
+            writeln!(&mut s, "{i} ")?;
+        }
+        s
+    };
+    Ok(Some(output))
+}
+
+fn cmd_snapshot_list_pending(
+    tqdb: &mut Tqdb,
+) -> anyhow::Result<Option<String>> {
+    let output = if tqdb.pending_snapshots.is_empty() {
+        "no snapshots pending".to_string()
+    } else {
+        let mut s = String::new();
+        writeln!(&mut s, "pending snapshot indexes: ")?;
+        for i in &tqdb.pending_snapshots {
+            writeln!(&mut s, "{i} ")?;
+        }
+        s
+    };
+    Ok(Some(output))
+}
+
+fn cmd_snapshot(
+    tqdb: &mut Tqdb,
+    index: Option<usize>,
+) -> anyhow::Result<Option<String>> {
+    if tqdb.event_log_path.is_none() {
+        bail!("please open an event log file");
+    }
+
+    if tqdb.next_event == 0 && index.is_none() {
+        bail!("please apply an event to generate a useful state");
+    }
+
+    let output = if let Some(index) = index {
+        if index < tqdb.next_event {
+            tqdb.pending_snapshots.insert(index);
+            format!(
+                "Setting pending snapshot at event {index}.\n\
+                 This event has already been applied. \
+                 Use 'rewind' to start over."
+            )
+        } else if index >= tqdb.events.len() {
+            bail!(
+                "index out of bounds. Only {} total events.",
+                tqdb.events.len()
+            );
+        } else {
+            tqdb.pending_snapshots.insert(index);
+            format!("Setting pending snapshot at event {index}")
+        }
+    } else {
+        tqdb.snapshots.insert(
+            tqdb.next_event.checked_sub(1).unwrap(),
+            tqdb.current_state.clone(),
+        );
+        "Taking snapshot at current state".to_string()
+    };
+
+    Ok(Some(output))
+}
+
+fn cmd_snapshot_show(
+    tqdb: &mut Tqdb,
+    index: usize,
+) -> anyhow::Result<Option<String>> {
+    match tqdb.snapshots.get(&index) {
+        Some(s) => Ok(Some(format!("{s:#?}"))),
+        None => bail!("no such snapshot"),
+    }
+}
+
+fn cmd_diff(
+    tqdb: &mut Tqdb,
+    snapshot1: usize,
+    snapshot2: Option<usize>,
+) -> anyhow::Result<Option<String>> {
+    if tqdb.event_log_path.is_none() {
+        bail!("please open an event log file");
+    }
+
+    if snapshot2.is_none() && tqdb.next_event == 0 {
+        bail!("please apply an event to get a useful state to diff with");
+    }
+
+    let Some(s1) = tqdb.snapshots.get(&snapshot1) else {
+        bail!("snapshot at {snapshot1} doesn't exist");
+    };
+    let diff = match snapshot2 {
+        Some(snapshot2) => {
+            let Some(s2) = tqdb.snapshots.get(&snapshot2) else {
+                bail!("snapshot at {snapshot2} doesn't exist");
+            };
+            if snapshot1 < snapshot2 { s1.diff(s2) } else { s2.diff(s1) }
+        }
+        None => {
+            if snapshot1 < tqdb.next_event {
+                s1.diff(&tqdb.current_state)
+            } else {
+                tqdb.current_state.diff(s1)
+            }
+        }
+    };
+    Ok(Some(format!("{diff}")))
+}
+
+fn cmd_show(tqdb: &Tqdb) -> anyhow::Result<Option<String>> {
+    if tqdb.event_log_path.is_none() {
+        bail!("please open an event log file");
+    }
+    Ok(Some(format!("{:#?}", tqdb.current_state)))
+}
+
+fn cmd_node_show(
+    tqdb: &mut Tqdb,
+    serial: usize,
+) -> anyhow::Result<Option<String>> {
+    let id = PlatformId::new("test".into(), serial.to_string());
+    let Some((node, ctx)) = tqdb.current_state.sut.nodes.get(&id) else {
+        bail!("failed to load node: {id}");
+    };
+
+    Ok(Some(format!("{node:#?}\n{ctx:#?}")))
+}
+
+fn cmd_rewind(tqdb: &mut Tqdb) -> anyhow::Result<Option<String>> {
+    tqdb.rewind();
+
+    let mut s = String::new();
+    writeln!(&mut s, "Re-initialized state and set next-event to 0")?;
+    writeln!(&mut s, "Breakpoints, snapshots, and pending snapshots remain")?;
+
+    Ok(Some(s))
+}
+
+fn cmd_log_show(
+    tqdb: &Tqdb,
+    args: EventsArgs,
+) -> anyhow::Result<Option<String>> {
+    if tqdb.events.is_empty() {
+        bail!("no events loaded. 
Please call 'open' on a valid file"); + } + + // Find the possible start and end range of events + let (start, end) = match args.command { + Some(EventsCommand::All) => (0, tqdb.events.len()), + Some(EventsCommand::Next { num_events }) => { + (tqdb.next_event, tqdb.next_event + num_events) + } + Some(EventsCommand::Range { start, end }) => (start, end), + None => (tqdb.next_event, tqdb.next_event + 1), + }; + + let mut s = String::new(); + if start == tqdb.events.len() { + writeln!(&mut s, "finished applying events")?; + } else { + let end = usize::min(end, tqdb.events.len()); + for i in start..end { + writeln!(&mut s, "{i} {:#?}", tqdb.events[i])?; + } + } + + Ok(Some(s)) +} + +fn cmd_summary(tqdb: &mut Tqdb) -> anyhow::Result> { + let mut s = String::new(); + if let Some(path) = &tqdb.event_log_path { + writeln!(&mut s, "event log path: {:?}", path)?; + writeln!(&mut s, "total events in log: {}", tqdb.events.len())?; + } else { + bail!("no event log loaded: Please call 'open'"); + } + if tqdb.next_event != tqdb.events.len() { + writeln!(&mut s, "next event to apply: {}", tqdb.next_event)?; + writeln!(&mut s, " {:#?}", tqdb.events[tqdb.next_event])?; + } else { + writeln!(&mut s, "finished applying events")?; + } + + writeln!( + &mut s, + "total nodes under test: {}", + tqdb.current_state.sut.nodes.len() + )?; + let total_bootstrap_msgs = tqdb + .current_state + .bootstrap_network + .iter() + .fold(0, |acc, (_, e)| acc + e.len()); + writeln!( + &mut s, + "bootstrap network messages in flight: {}", + total_bootstrap_msgs + )?; + + if tqdb.next_event > 0 { + let latest_config = tqdb.current_state.nexus.latest_config(); + writeln!(&mut s, "nexus config: ")?; + writeln!(&mut s, " epoch: {}", latest_config.epoch)?; + writeln!(&mut s, " op: {:?}", latest_config.op)?; + writeln!( + &mut s, + " coordinator: {}", + latest_config.coordinator.serial_number() + )?; + writeln!(&mut s, " total members: {}", latest_config.members.len())?; + writeln!( + &mut s, + " prepared members: {}", + latest_config.prepared_members.len() + )?; + writeln!( + &mut s, + " committed members: {}", + latest_config.committed_members.len() + )?; + writeln!(&mut s, " threshold: {}", latest_config.threshold.0)?; + writeln!( + &mut s, + " commit crash tolerance: {}", + latest_config.commit_crash_tolerance + )?; + } + + Ok(Some(s)) +} From 1f36226e357170ab6fbb32f302519ffbed1a2976 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 8 Aug 2025 05:52:19 +0000 Subject: [PATCH 2/7] Add example event log --- ...rust_quorum_protocol.14368.453-events.json | 4668 +++++++++++++++++ 1 file changed, 4668 insertions(+) create mode 100644 trust-quorum/tqdb/example-event-logs/cluster-49df2a4b903c778a-test_trust_quorum_protocol.14368.453-events.json diff --git a/trust-quorum/tqdb/example-event-logs/cluster-49df2a4b903c778a-test_trust_quorum_protocol.14368.453-events.json b/trust-quorum/tqdb/example-event-logs/cluster-49df2a4b903c778a-test_trust_quorum_protocol.14368.453-events.json new file mode 100644 index 00000000000..f518a84590c --- /dev/null +++ b/trust-quorum/tqdb/example-event-logs/cluster-49df2a4b903c778a-test_trust_quorum_protocol.14368.453-events.json @@ -0,0 +1,4668 @@ +[ +{ + "InitialSetup": { + "member_universe_size": 40, + "config": { + "op": "Preparing", + "epoch": 1, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "1" + }, + "members": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "3" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "7" + }, + { + "part_number": "test", + "serial_number": "9" + } + ], + "threshold": 2, + "commit_crash_tolerance": 3, + "prepared_members": [], + "committed_members": [] + }, + "crashed_nodes": [ + { + "part_number": "test", + "serial_number": "11" + }, + { + "part_number": "test", + "serial_number": "16" + }, + { + "part_number": "test", + "serial_number": "3" + }, + { + "part_number": "test", + "serial_number": "7" + } + ] + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "37" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "25" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "9" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "32" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "34" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "5" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": 
"39" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "27" + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "4" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "15" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + 
"part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + 
"serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 1, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "15" + }, + { + "part_number": "test", + "serial_number": "25" + }, + { + "part_number": "test", + "serial_number": "27" + }, + { + "part_number": "test", + "serial_number": "32" + }, + { + "part_number": "test", + "serial_number": "34" + }, + { + "part_number": "test", + "serial_number": "37" + }, + { + "part_number": "test", + "serial_number": "39" + }, + { + "part_number": "test", + "serial_number": "4" + }, + { + "part_number": "test", + "serial_number": "5" + }, + { + "part_number": "test", + "serial_number": "9" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 2, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "1" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + 
"acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + 
"AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 2, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 3, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "3" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "AbortConfiguration": 3 +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 4, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "3" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + 
"serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "AbortConfiguration": 4 +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 5, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 3, + "commit_crash_tolerance": 0, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 5, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 5, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 5, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 5, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 6, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ], + "threshold": 2, + "commit_crash_tolerance": 0, + "prepared_members": [], + "committed_members": [] + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + 
"AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 6, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 7, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ], + "threshold": 2, + "commit_crash_tolerance": 0, + "prepared_members": [], + "committed_members": [] + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 8, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "0" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + 
"part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 8, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 9, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + 
"commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 9, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 9, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 9, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 9, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 10, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "1" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + 
"serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + 
"part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, 
+"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 10, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 11, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "1" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 11, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 12, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "1" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + 
"SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 12, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 13, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "0" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": 
"test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { 
+ "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 13, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 14, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 3, + "commit_crash_tolerance": 0, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 15, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "3" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + 
"prepared_members": [], + "committed_members": [] + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "AbortConfiguration": 15 +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 16, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ], + "threshold": 2, + "commit_crash_tolerance": 0, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 17, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "1" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ], + "threshold": 2, + "commit_crash_tolerance": 0, + "prepared_members": [], + "committed_members": [] + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "1" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + 
} +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 17, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "Reconfigure": { + "op": "Preparing", + "epoch": 18, + "last_committed_epoch": null, + "coordinator": { + "part_number": "test", + "serial_number": "2" + }, + "members": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + }, + { + "part_number": "test", + "serial_number": "3" + } + ], + "threshold": 2, + "commit_crash_tolerance": 1, + "prepared_members": [], + "committed_members": [] + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "0" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "1" + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +{ + "DeliverEnvelope": { + "destination": { + "part_number": "test", + "serial_number": "2" + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", 
+"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply", +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +{ + "SendNexusReplyOnUnderlay": { + "AckedPreparesFromCoordinator": { + "epoch": 18, + "acks": [ + { + "part_number": "test", + "serial_number": "0" + }, + { + "part_number": "test", + "serial_number": "1" + }, + { + "part_number": "test", + "serial_number": "2" + } + ] + } + } +}, +"DeliverNexusReply", +"DeliverNexusReply" +] From 4ceab34963ce729aa05cd868d6f99f8dc5de718c Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 20 Aug 2025 16:00:27 +0000 Subject: [PATCH 3/7] Fix relevant AI code review suggestions These are for an older PR: #8682 https://gist.github.com/david-crespo/3accdaab0eb7cf651fe3ada6c00bacf1 --- trust-quorum/src/coordinator_state.rs | 12 ++++++------ trust-quorum/src/node.rs | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/trust-quorum/src/coordinator_state.rs b/trust-quorum/src/coordinator_state.rs index 1440cdcc68b..24be5ad0afa 100644 --- a/trust-quorum/src/coordinator_state.rs +++ b/trust-quorum/src/coordinator_state.rs @@ -211,13 +211,14 @@ impl CoordinatorState { #[expect(unused)] CoordinatorOperation::CollectLrtqShares { members, shares } => {} CoordinatorOperation::Prepare { prepares, .. 
} => { - for (platform_id, (config, share)) in - prepares.clone().into_iter() - { + for (platform_id, (config, share)) in prepares.iter() { if ctx.connected().contains(&platform_id) { ctx.send( - platform_id, - PeerMsgKind::Prepare { config, share }, + platform_id.clone(), + PeerMsgKind::Prepare { + config: config.clone(), + share: share.clone(), + }, ); } } @@ -241,7 +242,6 @@ impl CoordinatorState { } => {} CoordinatorOperation::CollectLrtqShares { members, shares } => {} CoordinatorOperation::Prepare { prepares, prepare_acks } => { - let rack_id = self.reconfigure_msg.rack_id(); if let Some((config, share)) = prepares.get(&to) { ctx.send( to, diff --git a/trust-quorum/src/node.rs b/trust-quorum/src/node.rs index 4f7240a4ba7..c64e15ecf36 100644 --- a/trust-quorum/src/node.rs +++ b/trust-quorum/src/node.rs @@ -568,6 +568,16 @@ impl Node { return; } + if !config.members.contains_key(ctx.platform_id()) { + error!( + self.log, + "Received Prepare when not a member of configuration"; + "from" => %from, + "prepare_epoch" => %config.epoch + ); + return; + } + // We always save the config and share if we haven't committed a later // configuration. If we have seen a newer `Prepare`, it's possible // that that configuration will not commit, and the latest committed @@ -594,7 +604,10 @@ impl Node { ); } // If we are coordinating for an older epoch, then we should stop - // coordinating. This epoch will never commit. + // coordinating. The configuration at this epoch will either never + // commit, or has already committed without us learning about it from + // Nexus. In either case the rest of the system has moved on and we + // should stop coordinating. if let Some(cs) = &self.coordinator_state { if msg_epoch > cs.reconfigure_msg().epoch() { // This prepare is for a newer configuration than the one we are From 2cf5ec1b11eb17b15f897206685174ff024da45e Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 20 Aug 2025 17:23:40 +0000 Subject: [PATCH 4/7] More AI code review fixes From an older PR: 8741 https://gist.github.com/david-crespo/a84474b432090316fa3efcb41335cc24 --- trust-quorum/src/compute_key_share.rs | 49 +++++++------------------ trust-quorum/src/coordinator_state.rs | 50 ++++++++------------------ trust-quorum/src/lib.rs | 52 +++++++++++++++++++++++++++ trust-quorum/src/node.rs | 18 ++++++++-- 4 files changed, 95 insertions(+), 74 deletions(-) diff --git a/trust-quorum/src/compute_key_share.rs b/trust-quorum/src/compute_key_share.rs index 7754edac401..648519733f5 100644 --- a/trust-quorum/src/compute_key_share.rs +++ b/trust-quorum/src/compute_key_share.rs @@ -8,13 +8,12 @@ //! share for that configuration it must collect a threshold of key shares from //! other nodes so that it can compute its own key share. 
-use crate::crypto::Sha3_256Digest;
 use crate::{
     Alarm, Configuration, Epoch, NodeHandlerCtx, PeerMsgKind, PlatformId,
 };
 use gfss::gf256::Gf256;
 use gfss::shamir::{self, Share};
-use slog::{Logger, error, o, warn};
+use slog::{Logger, error, o};
 use std::collections::BTreeMap;
 
 /// In memory state that tracks retrieval of key shares in order to compute
@@ -66,7 +65,9 @@ impl KeyShareComputer {
         ctx: &mut impl NodeHandlerCtx,
         peer: PlatformId,
     ) {
-        if !self.collected_shares.contains_key(&peer) {
+        if self.config.members.contains_key(&peer)
+            && !self.collected_shares.contains_key(&peer)
+        {
             ctx.send(peer, PeerMsgKind::GetShare(self.config.epoch));
         }
     }
@@ -82,55 +83,29 @@ impl KeyShareComputer {
         epoch: Epoch,
         share: Share,
     ) -> bool {
-        // Are we trying to retrieve shares for `epoch`?
-        if epoch != self.config.epoch {
-            warn!(
-                self.log,
-                "Received Share from node with wrong epoch";
-                "received_epoch" => %epoch,
-                "from" => %from
-            );
-            return false;
-        }
-
-        // Is the sender a member of the configuration `epoch`?
-        // Was the sender a member of the configuration at `old_epoch`?
-        let Some(expected_digest) = self.config.members.get(&from) else {
-            warn!(
-                self.log,
-                "Received Share from unexpected node";
-                "epoch" => %epoch,
-                "from" => %from
-            );
+        if !crate::validate_share(&self.log, &self.config, &from, epoch, &share)
+        {
+            // Logging done inside `validate_share`
             return false;
         };
 
-        // Does the share hash match what we expect?
-        let mut digest = Sha3_256Digest::default();
-        share.digest::<sha3::Sha3_256>(&mut digest.0);
-        if digest != *expected_digest {
-            error!(
-                self.log,
-                "Received share with invalid digest";
-                "epoch" => %epoch,
-                "from" => %from
-            );
-            return false;
-        }
-
         // A valid share was received. Is it new?
         if self.collected_shares.insert(from, share).is_some() {
             return false;
         }
 
-        // Do we have enough shares to computer our rack share?
+        // Do we have enough shares to compute our rack share?
         if self.collected_shares.len() < self.config.threshold.0 as usize {
             return false;
         }
 
+        // Share indices are assigned according to the configuration
+        // membership's key order, when the configuration is constructed.
+        //
         // What index are we in the configuration? This is our "x-coordinate"
         // for our key share calculation. We always start indexing from 1, since
         // 0 is the rack secret.
+        //
         let index =
             self.config.members.keys().position(|id| id == ctx.platform_id());
diff --git a/trust-quorum/src/coordinator_state.rs b/trust-quorum/src/coordinator_state.rs
index 24be5ad0afa..50cba4d839a 100644
--- a/trust-quorum/src/coordinator_state.rs
+++ b/trust-quorum/src/coordinator_state.rs
@@ -5,9 +5,7 @@
 //! State of a reconfiguration coordinator inside a [`crate::Node`]
 
 use crate::configuration::ConfigurationDiff;
-use crate::crypto::{
-    LrtqShare, PlaintextRackSecrets, Sha3_256Digest, ShareDigestLrtq,
-};
+use crate::crypto::{LrtqShare, PlaintextRackSecrets, ShareDigestLrtq};
 use crate::validators::{ReconfigurationError, ValidatedReconfigureMsg};
 use crate::{Configuration, Epoch, PeerMsgKind, PlatformId, RackSecret};
 use crate::{NodeHandlerCtx, ValidatedReconfigureMsgDiff};
@@ -317,41 +315,17 @@ impl CoordinatorState {
                     "new_epoch" => new_epoch.to_string()
                 ));
 
-                // Are we trying to retrieve shares for `epoch`?
-                if *old_epoch != epoch {
-                    warn!(
-                        log,
-                        "Received Share from node with wrong epoch";
-                        "received_epoch" => %epoch,
-                        "from" => %from
-                    );
+                if !crate::validate_share(
+                    &self.log,
+                    &old_config,
+                    &from,
+                    epoch,
+                    &share,
+                ) {
+                    // Logging done inside `validate_share`
                     return;
                 }
 
-                // Was the sender a member of the configuration at `old_epoch`?
-                let Some(expected_digest) = old_config.members.get(&from)
-                else {
-                    warn!(
-                        log,
-                        "Received Share from unexpected node";
-                        "received_epoch" => %epoch,
-                        "from" => %from
-                    );
-                    return;
-                };
-
-                // Does the share hash match what we expect?
-                let mut digest = Sha3_256Digest::default();
-                share.digest::<sha3::Sha3_256>(&mut digest.0);
-                if digest != *expected_digest {
-                    error!(
-                        log,
-                        "Received share with invalid digest";
-                        "received_epoch" => %epoch,
-                        "from" => %from
-                    );
-                }
-
                 // A valid share was received. Is it new?
                 if old_collected_shares.insert(from, share).is_some() {
                     return;
@@ -443,6 +417,12 @@ impl CoordinatorState {
         };
 
         // Save the encrypted rack secrets in the current configuration
+        //
+        // A new configuration is always created with a `None` value
+        // for `encrypted_rack_secrets`, as it gets filled in here.
+        //
+        // If we change that it's a programmer error that will be caught
+        // immediately by our tests.
         assert!(self.configuration.encrypted_rack_secrets.is_none());
         self.configuration.encrypted_rack_secrets =
             Some(new_encrypted_rack_secrets);
diff --git a/trust-quorum/src/lib.rs b/trust-quorum/src/lib.rs
index 9d326983826..aed8a518b9e 100644
--- a/trust-quorum/src/lib.rs
+++ b/trust-quorum/src/lib.rs
@@ -9,9 +9,12 @@
 //! All persistent state and all networking is managed outside of this
 //! implementation.
 
+use crypto::Sha3_256Digest;
 use daft::Diffable;
 use derive_more::Display;
+use gfss::shamir::Share;
 use serde::{Deserialize, Serialize};
+use slog::{Logger, error, warn};
 
 mod compute_key_share;
 mod configuration;
@@ -133,3 +136,51 @@ pub struct Envelope {
     pub from: PlatformId,
     pub msg: PeerMsg,
 }
+
+/// Check if a received share is valid for a given configuration
+///
+/// Return true if valid, false otherwise.
+pub fn validate_share(
+    log: &Logger,
+    config: &Configuration,
+    from: &PlatformId,
+    epoch: Epoch,
+    share: &Share,
+) -> bool {
+    // Are we trying to retrieve shares for `epoch`?
+    if epoch != config.epoch {
+        warn!(
+            log,
+            "Received Share from node with wrong epoch";
+            "received_epoch" => %epoch,
+            "from" => %from
+        );
+        return false;
+    }
+
+    // Is the sender a member of the configuration at `epoch`?
+    let Some(expected_digest) = config.members.get(&from) else {
+        warn!(
+            log,
+            "Received Share from unexpected node";
+            "epoch" => %epoch,
+            "from" => %from
+        );
+        return false;
+    };
+
+    // Does the share hash match what we expect?
+    let mut digest = Sha3_256Digest::default();
+    share.digest::<sha3::Sha3_256>(&mut digest.0);
+    if digest != *expected_digest {
+        error!(
+            log,
+            "Received share with invalid digest";
+            "epoch" => %epoch,
+            "from" => %from
+        );
+        return false;
+    }
+
+    true
+}
diff --git a/trust-quorum/src/node.rs b/trust-quorum/src/node.rs
index c64e15ecf36..16503dbef88 100644
--- a/trust-quorum/src/node.rs
+++ b/trust-quorum/src/node.rs
@@ -314,6 +314,19 @@ impl Node {
         from: PlatformId,
         config: Configuration,
     ) {
+        // The sender sent us a configuration even though we are not part of the
+        // configuration. This is a bug on the sender's part, but doesn't rise
+        // to the level of an alarm. Log an error.
+ if !config.members.contains_key(ctx.platform_id()) { + error!( + self.log, + "Received CommitAdvance, but not a member of configuration"; + "from" => %from, + "epoch" => %config.epoch + ); + return; + } + // We may have already advanced by the time we receive this message. // Let's check. if ctx.persistent_state().commits.contains(&config.epoch) { @@ -354,6 +367,7 @@ impl Node { config2: config.clone(), from: from.clone(), }); + return; } } else { ctx.update_persistent_state(|ps| { @@ -430,7 +444,7 @@ impl Node { } } - // We either were collectiong shares for an old epoch or haven't started + // We either were collecting shares for an old epoch or haven't started // yet. self.key_share_computer = Some(KeyShareComputer::new(&self.log, ctx, config)); @@ -461,7 +475,7 @@ impl Node { info!( self.log, concat!( - "Received 'GetShare'` from stale node. ", + "Received 'GetShare' from stale node. ", "Responded with 'CommitAdvance'." ); "from" => %from, From 72cd2ef326b67f9063b2e126d0e595e96c5ff54c Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 20 Aug 2025 18:59:22 +0000 Subject: [PATCH 5/7] clippy --- trust-quorum/test-utils/src/nexus.rs | 1 + trust-quorum/test-utils/src/state.rs | 43 ++++++++++++-------------- trust-quorum/tqdb/src/bin/tqdb/main.rs | 15 ++++----- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/trust-quorum/test-utils/src/nexus.rs b/trust-quorum/test-utils/src/nexus.rs index 53715e81fa8..a64acb39d09 100644 --- a/trust-quorum/test-utils/src/nexus.rs +++ b/trust-quorum/test-utils/src/nexus.rs @@ -111,6 +111,7 @@ pub struct NexusState { } impl NexusState { + #[allow(clippy::new_without_default)] pub fn new() -> NexusState { NexusState { rack_id: RackUuid::new_v4(), configs: IdOrdMap::new() } } diff --git a/trust-quorum/test-utils/src/state.rs b/trust-quorum/test-utils/src/state.rs index 9508e18c7e4..35ae9f13e84 100644 --- a/trust-quorum/test-utils/src/state.rs +++ b/trust-quorum/test-utils/src/state.rs @@ -464,7 +464,7 @@ impl Display for TqStateDiff<'_> { display_node_ctx_diff(ctx_diff, f)?; // Add a blank line between modified nodes - writeln!(f, "")?; + writeln!(f)?; } } @@ -566,11 +566,10 @@ fn display_underlay_network_diff( let added = after.difference(&before).count(); let removed = before.difference(&after).count(); - writeln!(f, " {} new nexus replies in flight on underlay network", added)?; + writeln!(f, " {added} new nexus replies in flight on underlay network")?; writeln!( f, - " {} nexus replies delivered to nexus from underlay network", - removed + " {removed} nexus replies delivered to nexus from underlay network", )?; Ok(()) @@ -582,14 +581,14 @@ fn display_bootstrap_network_diff( ) -> std::fmt::Result { if !diff.added.is_empty() { writeln!(f, " messages newly in flight on bootstrap network:")?; - for (id, _) in &diff.added { + for id in diff.added.keys() { writeln!(f, " destination: {id}")?; } } if !diff.removed.is_empty() { writeln!(f, " all messages delivered from bootstrap network:")?; - for (id, _) in &diff.removed { + for id in diff.removed.keys() { writeln!(f, " destination: {id}")?; } } @@ -623,27 +622,27 @@ fn display_node_ctx_diff( if !diff.persistent_state().shares.added.is_empty() { writeln!(f, " our share added to persistent state: ")?; - for (e, _) in &diff.persistent_state().shares.added { - writeln!(f, " epoch: {}", e)?; + for e in diff.persistent_state().shares.added.keys() { + writeln!(f, " epoch: {e}")?; } } if !diff.persistent_state().shares.removed.is_empty() { writeln!(f, " our share removed from persistent state: 
")?; - for (e, _) in &diff.persistent_state().shares.removed { - writeln!(f, " epoch: {}", e)?; + for e in diff.persistent_state().shares.removed.keys() { + writeln!(f, " epoch: {e}")?; } } if !diff.persistent_state().commits.added.is_empty() { writeln!(f, " commit added to persistent state: ")?; for e in &diff.persistent_state().commits.added { - writeln!(f, " epoch: {}", e)?; + writeln!(f, " epoch: {e}")?; } } if !diff.persistent_state().commits.removed.is_empty() { writeln!(f, " commit removed from persistent state: ")?; for e in &diff.persistent_state().commits.removed { - writeln!(f, " epoch: {}", e)?; + writeln!(f, " epoch: {e}")?; } } @@ -718,7 +717,7 @@ fn display_node_diff( // They are both `Some`, so figure out what changed // by recursing - let diff = before.diff(&after); + let diff = before.diff(after); display_coordinator_state_diff(diff, f)?; } } @@ -793,13 +792,13 @@ pub fn display_validated_reconfigure_msg_diff( if !diff.members().added.is_empty() { writeln!(f, " added members:")?; for member in &diff.members().added { - writeln!(f, " {}", member)?; + writeln!(f, " {member}")?; } } if !diff.members().removed.is_empty() { writeln!(f, " removed members:")?; for member in &diff.members().removed { - writeln!(f, " {}", member)?; + writeln!(f, " {member}")?; } } if diff.threshold().is_modified() { @@ -846,19 +845,17 @@ pub fn display_coordinator_operation_diff( ) => { // If the collection epoch changed, then only report that if old_epoch != after_old_epoch { + #[allow(clippy::uninlined_format_args)] writeln!( f, " collecting shares: epoch changed: {} -> {}", old_epoch, after_old_epoch )?; - } else { - if old_collected_shares != after_old_collected_shares { - writeln!( - f, - " collected shares changed at epoch: {}", - old_epoch - )?; - } + } else if old_collected_shares != after_old_collected_shares { + writeln!( + f, + " collected shares changed at epoch: {old_epoch}", + )?; } } ( diff --git a/trust-quorum/tqdb/src/bin/tqdb/main.rs b/trust-quorum/tqdb/src/bin/tqdb/main.rs index 8778c45bdba..b7e44e590fe 100644 --- a/trust-quorum/tqdb/src/bin/tqdb/main.rs +++ b/trust-quorum/tqdb/src/bin/tqdb/main.rs @@ -397,8 +397,7 @@ fn cmd_run(tqdb: &mut Tqdb) -> anyhow::Result> { .cloned() .find(|&i| i > tqdb.next_event) .unwrap_or(tqdb.events.len()); - let events: Vec<_> = - tqdb.events[tqdb.next_event..end].iter().cloned().collect(); + let events: Vec<_> = tqdb.events[tqdb.next_event..end].to_vec(); for event in events { tqdb.current_state.apply_event(event); num_events += 1; @@ -436,8 +435,7 @@ fn cmd_step( let mut s = String::new(); let mut applied_events = 0; - let events: Vec<_> = - tqdb.events[tqdb.next_event..end].iter().cloned().collect(); + let events: Vec<_> = tqdb.events[tqdb.next_event..end].to_vec(); for event in events { writeln!(&mut s, "{} {event:#?}", tqdb.next_event)?; tqdb.current_state.apply_event(event.clone()); @@ -526,11 +524,10 @@ fn cmd_snapshot( let output = if let Some(index) = index { if index < tqdb.next_event { tqdb.pending_snapshots.insert(index); - format!( - "Setting pending snapshot.\n + "Setting pending snapshot.\n Already applied event however. Use 'rewind' to start over." - ) + .to_string() } else if index > tqdb.events.len() { bail!( "index out of bounds. 
Only {} total events.", @@ -538,14 +535,14 @@ fn cmd_snapshot( ); } else { tqdb.pending_snapshots.insert(index); - format!("Setting pending snapshot") + "Setting pending snapshot".to_string() } } else { tqdb.snapshots.insert( tqdb.next_event.checked_sub(1).unwrap(), tqdb.current_state.clone(), ); - format!("Taking snapshot at current state") + "Taking snapshot at current state".to_string() }; Ok(Some(output)) From 69b6544b4a14794386e1c5d0a7dcba32708e6a3a Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 20 Aug 2025 19:08:48 +0000 Subject: [PATCH 6/7] fix workspace-deps --- trust-quorum/test-utils/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/trust-quorum/test-utils/Cargo.toml b/trust-quorum/test-utils/Cargo.toml index 984d2bb8d7c..f2701c471a2 100644 --- a/trust-quorum/test-utils/Cargo.toml +++ b/trust-quorum/test-utils/Cargo.toml @@ -3,6 +3,9 @@ name = "trust-quorum-test-utils" version = "0.1.0" edition = "2024" +[lints] +workspace = true + [dependencies] camino.workspace = true daft.workspace = true From b22ee4a613cac054ec979e83c69b6950d278075d Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 20 Aug 2025 20:21:13 +0000 Subject: [PATCH 7/7] Cleanup test dir --- trust-quorum/tests/cluster.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trust-quorum/tests/cluster.rs b/trust-quorum/tests/cluster.rs index e09092925c1..c4ddd620daa 100644 --- a/trust-quorum/tests/cluster.rs +++ b/trust-quorum/tests/cluster.rs @@ -659,6 +659,6 @@ fn test_trust_quorum_protocol(input: TestInput) { "skipped_actions" => state.skipped_actions ); - // let _ = std::fs::remove_file(event_log_path); + let _ = std::fs::remove_file(event_log_path); logctx.cleanup_successful(); }
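
For readers tracing the `validate_share` consolidation above, the digest check it performs can be sketched standalone. The following is a minimal illustration only, not code from these patches: it assumes a raw byte view of a share and uses the `sha3` crate directly, whereas the production code goes through `gfss::shamir::Share::digest`, wraps the result in `crate::crypto::Sha3_256Digest`, and compares via types backed by `subtle::ConstantTimeEq`.

    // Hypothetical sketch: recompute a share's SHA3-256 digest and compare it
    // against the digest recorded for that member in the configuration.
    use sha3::{Digest, Sha3_256};

    fn share_matches_expected(share_bytes: &[u8], expected: &[u8; 32]) -> bool {
        let digest: [u8; 32] = Sha3_256::digest(share_bytes).into();
        // The real code compares wrapper types with constant-time equality;
        // a plain `==` is used here only to keep the illustration short.
        &digest == expected
    }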