diff --git a/Cargo.lock b/Cargo.lock index ba141f6fa1..1e6c532e79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11334,6 +11334,7 @@ dependencies = [ "compio", "configs", "consensus", + "crossfire", "ctrlc", "cyper", "cyper-axum", @@ -11342,7 +11343,6 @@ dependencies = [ "err_trail", "error_set", "figlet-rs", - "flume 0.12.0", "fs2", "hash32 1.0.0", "human-repr", @@ -11464,6 +11464,7 @@ dependencies = [ "metadata", "papaya", "partitions", + "prometheus-client", "tracing", ] diff --git a/core/configs/src/server_config/sharding.rs b/core/configs/src/server_config/sharding.rs index 084c984e1c..fcf0396f05 100644 --- a/core/configs/src/server_config/sharding.rs +++ b/core/configs/src/server_config/sharding.rs @@ -21,11 +21,69 @@ use std::str::FromStr; use configs::ConfigEnv; -#[derive(Debug, Deserialize, Serialize, Default, ConfigEnv)] +/// Default capacity of the per-shard inter-shard inbox channel. Sized +/// comfortably above the consensus working set, which is roughly +/// `PIPELINE_PREPARE_QUEUE_MAX (= 8) * replica_count * directions` +/// frames in flight per shard, without allowing a runaway producer to +/// eat unbounded memory. Tunable via `[system.sharding] inbox_capacity` +/// in TOML. +/// +/// The capacity must also absorb the worst-case cross-shard client +/// Reply burst. Unlike consensus frames, client Replies have no VSR +/// retransmit path: a Reply lost on full inbox is gone and the client +/// times out. A reasonable lower bound is +/// `max_inflight_client_requests / num_shards` (assuming requests are +/// distributed evenly across owning shards) plus the consensus +/// headroom above. +/// +/// Consensus frames and client-reply forwards share this one channel, +/// so the two headrooms are not independent: a consensus burst or +/// retransmit storm can fill the inbox with consensus frames exactly +/// when a client Reply needs the space. 
A single `inbox_capacity` knob +/// cannot isolate the two frame classes - size it for the sum of both +/// worst cases occurring together. Watch the drop-site `tracing` logs +/// (and, once a per-shard exporter lands, the `frame_drops_total` +/// `{variant="forward_client_send"}` counter) to detect when the bound +/// is too low in production. +pub const DEFAULT_INBOX_CAPACITY: usize = 1024; + +/// Maximum permitted per-shard inbox depth. The channel is allocated +/// up-front per shard, so a runaway value here OOMs the process at boot. +/// `1 << 20` (~1M frames) is several orders of magnitude above any +/// realistic backpressure target and still fits comfortably in process +/// address space. +pub const INBOX_CAPACITY_MAX: usize = 1 << 20; + +const fn default_inbox_capacity() -> usize { + DEFAULT_INBOX_CAPACITY +} + +#[derive(Debug, Deserialize, Serialize, ConfigEnv)] pub struct ShardingConfig { #[serde(default)] #[config_env(leaf)] pub cpu_allocation: CpuAllocation, + /// Per-shard inter-shard inbox channel capacity. Bounded by design. + /// Drops on full inbox of consensus frames are recovered by VSR + /// retransmit. Drops of cross-shard client Reply frames are terminal: + /// the client never receives the reply (no in-protocol retransmit). + /// Both frame classes share this one channel, so a consensus burst + /// can starve client-reply forwards: size against the worst-case sum + /// of consensus working set + peak client-reply fan-out per shard + /// occurring together; see `DEFAULT_INBOX_CAPACITY` for the + /// rationale. Used by `core/server-ng`; the legacy server uses its + /// own hard-coded inbox sizing. 
+ #[serde(default = "default_inbox_capacity")] + pub inbox_capacity: usize, +} + +impl Default for ShardingConfig { + fn default() -> Self { + Self { + cpu_allocation: CpuAllocation::default(), + inbox_capacity: DEFAULT_INBOX_CAPACITY, + } + } } #[derive(Debug, Clone, PartialEq, Default)] diff --git a/core/configs/src/server_config/validators.rs b/core/configs/src/server_config/validators.rs index 1512961d6e..3e5a8fee84 100644 --- a/core/configs/src/server_config/validators.rs +++ b/core/configs/src/server_config/validators.rs @@ -23,7 +23,7 @@ use super::server::{ DataMaintenanceConfig, MessageSaverConfig, MessagesMaintenanceConfig, TelemetryConfig, }; use super::server::{MemoryPoolConfig, PersonalAccessTokenConfig, ServerConfig}; -use super::sharding::{CpuAllocation, ShardingConfig}; +use super::sharding::{CpuAllocation, INBOX_CAPACITY_MAX, ShardingConfig}; use super::system::SegmentConfig; use super::system::{CompressionConfig, LoggingConfig, PartitionConfig}; use crate::ConfigurationError; @@ -379,6 +379,23 @@ impl Validatable for MemoryPoolConfig { impl Validatable for ShardingConfig { fn validate(&self) -> Result<(), ConfigurationError> { + if self.inbox_capacity == 0 { + eprintln!( + "Invalid sharding configuration: inbox_capacity must be > 0 (crossfire silently \ + rounds 0 to 1, masking config errors)" + ); + return Err(ConfigurationError::InvalidConfigurationValue); + } + if self.inbox_capacity > INBOX_CAPACITY_MAX { + eprintln!( + "Invalid sharding configuration: inbox_capacity {} exceeds the {} cap (each \ + shard preallocates a channel of this size; oversizing here OOMs the process at \ + boot)", + self.inbox_capacity, INBOX_CAPACITY_MAX + ); + return Err(ConfigurationError::InvalidConfigurationValue); + } + let available_cpus = available_parallelism() .map_err(|_| { eprintln!("Failed to detect available CPU cores"); diff --git a/core/consensus/src/metadata_helpers.rs b/core/consensus/src/metadata_helpers.rs index 66885911e7..5904f1049e 100644 --- 
a/core/consensus/src/metadata_helpers.rs +++ b/core/consensus/src/metadata_helpers.rs @@ -353,6 +353,10 @@ mod tests { ) -> Result<(), SendError> { Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } // Register-retry replay: cached IS the register reply; bytes replayed diff --git a/core/consensus/src/plane_helpers.rs b/core/consensus/src/plane_helpers.rs index e2a500cae0..7e0a89cf44 100644 --- a/core/consensus/src/plane_helpers.rs +++ b/core/consensus/src/plane_helpers.rs @@ -472,6 +472,10 @@ mod tests { ) -> Result<(), SendError> { Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } fn prepare_message(op: u64, parent: u128, checksum: u128) -> Message { @@ -675,6 +679,10 @@ mod tests { self.sent.borrow_mut().push((replica, data)); Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } #[test] diff --git a/core/journal/src/file_storage.rs b/core/journal/src/file_storage.rs index 6517f5b908..0de1ac6730 100644 --- a/core/journal/src/file_storage.rs +++ b/core/journal/src/file_storage.rs @@ -31,7 +31,8 @@ pub struct FileStorage { #[allow(clippy::future_not_send)] impl FileStorage { - /// Open or create the file at `path`, setting `write_offset` to current file length. + /// Open or create the file at `path` in read-write mode, setting + /// `write_offset` to current file length. /// /// # Errors /// Returns an I/O error if the file cannot be opened or created. 
diff --git a/core/journal/src/prepare_journal.rs b/core/journal/src/prepare_journal.rs index 283db5b4d4..ac16d87113 100644 --- a/core/journal/src/prepare_journal.rs +++ b/core/journal/src/prepare_journal.rs @@ -44,7 +44,7 @@ const MAX_ENTRY_SIZE: u64 = 64 * 1024 * 1024; /// still contains them, but they become unreachable for recovery). /// /// **NOTE:** This number needs to be chosen in balance between number of -/// entries in [`core::consensus::pipeline_prepare_queue_max`]. Because this number controls +/// entries in [`consensus::PIPELINE_PREPARE_QUEUE_MAX`]. Because this number controls /// how many committed but not yet snapshotted entries that the buffer can /// hold. This may need to be tuned properly. pub(crate) const SLOT_COUNT: usize = 1024; @@ -117,9 +117,47 @@ const fn slot_for_op(op: u64) -> usize { op as usize % SLOT_COUNT } +/// Repair a damaged WAL tail by truncating to `pos`, or surface a loud +/// error when truncation would be unsafe. +/// +/// Truncation is only sound for a torn final append, which writes at most +/// one entry's worth of bytes. If more than `MAX_ENTRY_SIZE` bytes follow +/// `pos`, the damage is mid-file: truncating would silently discard every +/// committed entry after it, so this hard-errors instead of repairing. +#[allow(clippy::future_not_send)] +async fn truncate_or_fail( + storage: &FileStorage, + pos: u64, + reason: &'static str, +) -> Result<(), JournalError> { + let trailing = storage.file_len().saturating_sub(pos); + // Sound only because this WAL appends exactly one entry per `append` + // + fsync, so a torn tail is at most one entry wide. A batched-append + // WAL could leave a torn tail many entries wide, and this + // `> MAX_ENTRY_SIZE` check would misclassify it as mid-file damage. 
+ if trailing > MAX_ENTRY_SIZE { + return Err(JournalError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "mid-file WAL corruption at pos {pos}: {reason}; {trailing} bytes \ + follow (> MAX_ENTRY_SIZE {MAX_ENTRY_SIZE}), refusing to truncate \ + and discard committed entries" + ), + ))); + } + storage.truncate(pos).await?; + // The repair must be crash-durable. `FileStorage::truncate` is a + // bare `set_len`; without this fsync a power loss right after the + // repair re-presents the torn tail on the next boot. Mirrors the + // write-then-fsync the `append` path already does. + storage.fsync().await?; + Ok(()) +} + #[allow(clippy::cast_possible_truncation)] impl PrepareJournal { - /// Open the WAL file, scanning forward to rebuild the in-memory index. + /// Open the WAL file in read-write mode, scanning forward to rebuild + /// the in-memory index. /// /// `snapshot_op` is the highest op that has been durably snapshotted. /// It must be provided so that `append()` can detect slot collisions @@ -133,6 +171,11 @@ impl PrepareJournal { #[allow(clippy::future_not_send)] pub async fn open(path: &Path, snapshot_op: u64) -> Result { let storage = FileStorage::open(path).await?; + Self::scan(storage, snapshot_op).await + } + + #[allow(clippy::future_not_send)] + async fn scan(storage: FileStorage, snapshot_op: u64) -> Result { let file_len = storage.file_len(); let mut headers: Vec> = vec![None; SLOT_COUNT]; let mut offsets: Vec> = vec![None; SLOT_COUNT]; @@ -143,27 +186,40 @@ impl PrepareJournal { while pos + HEADER_SIZE as u64 <= file_len { // Read the 256-byte header header_buf = storage.read_at(pos, header_buf).await?; - let header: PrepareHeader = - *bytemuck::checked::from_bytes::(&header_buf); + // `try_from_bytes` so an invalid bit pattern (e.g. a corrupt + // `command` byte yielding no `Command2` variant) routes through + // `truncate_or_fail` instead of panicking the recovery thread. 
+ let Ok(header_ref) = bytemuck::checked::try_from_bytes::(&header_buf) + else { + truncate_or_fail(&storage, pos, "corrupt header (invalid bit pattern)").await?; + break; + }; + let header: PrepareHeader = *header_ref; // Validate: must be a Prepare command with sane size if header.command != Command2::Prepare || (header.size as usize) < HEADER_SIZE || u64::from(header.size) > MAX_ENTRY_SIZE { - // Corrupt or non-prepare entry, truncate here - storage.truncate(pos).await?; + truncate_or_fail(&storage, pos, "corrupt or non-prepare entry").await?; break; } let entry_size = u64::from(header.size); + // TODO(hubcio): verify `header.checksum` / `header.checksum_body` + // against the entry body during scan and route a mismatch + // through `truncate_or_fail`. Blocked on the writer side: the + // `PrepareHeader` projection in consensus builds prepares with + // `..Default::default()` so the integrity fields are always 0. + // Until a producer computes them, verification here would be + // trivially-passing noise. Without it, a body bit-flip that + // leaves the header valid is replayed silently as corrupt + // state. + // Check if the full entry fits if pos + entry_size > file_len { - // Truncated entry at tail - // This handles the case where crash happened during write and - // only header was written and body was not. so we truncate the file to the start of the entry. - storage.truncate(pos).await?; + truncate_or_fail(&storage, pos, "truncated entry at tail").await?; break; } @@ -185,7 +241,7 @@ impl PrepareJournal { // If there are leftover bytes less than a header, truncate them if pos < storage.file_len() { - storage.truncate(pos).await?; + truncate_or_fail(&storage, pos, "leftover bytes shorter than a header").await?; } Ok(Self { @@ -216,9 +272,13 @@ impl PrepareJournal { /// Advance the snapshot watermark. The caller must ensure `op` is /// monotonically increasing and corresponds to a durable snapshot. 
/// + /// # Errors + /// Returns `JournalError::Io` if a future durability requirement + /// surfaces an I/O failure; currently infallible. + /// /// # Panics /// Panics if `op` is less than the current snapshot watermark. - pub fn set_snapshot_op(&self, op: u64) { + pub fn set_snapshot_op(&self, op: u64) -> Result<(), JournalError> { assert!( op >= self.snapshot_op.get(), "snapshot_op must be monotonically increasing: {} -> {}", @@ -226,6 +286,7 @@ impl PrepareJournal { op ); self.snapshot_op.set(op); + Ok(()) } /// Access the underlying storage (for fsync in tests, etc.). @@ -322,6 +383,20 @@ impl Journal for PrepareJournal { // Advance the snapshot watermark so future appends treat // drained ops as safe to overwrite. + // + // TODO(hubcio): this advances the in-memory watermark before the + // WAL rewrite below (tmp create -> write live entries -> fsync -> + // rename -> fsync parent -> reopen) is durable. Any `?` failure + // between here and `reopen()` leaves `snapshot_op` advanced while + // the WAL file and the in-memory index still hold the un-drained + // ops. The next `append()` that wraps onto one of those slots + // then sees `existing.op <= snapshot_op`, passes the slot + // collision check, and silently evicts a live entry from the + // index instead of panicking. The entry survives on disk but + // becomes unreachable: consensus can no longer answer + // `RetransmitPrepares` for it and a lagging peer stalls until a + // view change. Fix: advance `snapshot_op` only after `reopen()` + // returns `Ok`. 
if end_op > self.snapshot_op.get() { self.snapshot_op.set(end_op); } @@ -588,6 +663,78 @@ mod tests { assert_eq!(entry.header().op, 1); } + #[compio::test] + async fn truncate_or_fail_durably_repairs_torn_tail() { + let dir = tempdir().unwrap(); + let path = dir.path().join("journal.wal"); + + let good_len = { + let journal = PrepareJournal::open(&path, 0).await.unwrap(); + journal.append(make_prepare(1, 64)).await.unwrap(); + journal.append(make_prepare(2, 64)).await.unwrap(); + journal.storage.fsync().await.unwrap(); + journal.storage.file_len() + }; + + // Simulate a writer crash mid-append: junk bytes past the last + // good entry, shorter than MAX_ENTRY_SIZE so it reads as a torn + // tail rather than mid-file corruption. + { + let storage = FileStorage::open(&path).await.unwrap(); + storage.write_append(vec![0xAB_u8; 16]).await.unwrap(); + storage.fsync().await.unwrap(); + } + + // The recovery repair path truncates back to the last good entry. + { + let storage = FileStorage::open(&path).await.unwrap(); + truncate_or_fail(&storage, good_len, "torn tail test") + .await + .unwrap(); + } + + // The repair is durable: a fresh open sees the file ending + // exactly at the last good entry, with no torn tail to re-repair. + assert_eq!(std::fs::metadata(&path).unwrap().len(), good_len); + let journal = PrepareJournal::open(&path, 0).await.unwrap(); + assert_eq!(journal.last_op(), Some(2)); + assert!(journal.header(3).is_none()); + } + + #[compio::test] + async fn open_rejects_mid_file_corruption() { + let dir = tempdir().unwrap(); + let path = dir.path().join("journal.wal"); + + // A corrupt header followed by more than MAX_ENTRY_SIZE of + // trailing bytes is mid-file damage, not a torn final append. A + // read-write open must hard-error instead of silently truncating + // and discarding every committed entry after the corruption. 
+ { + use std::io::Write; + let mut file = std::fs::File::create(&path).unwrap(); + // All-0xFF is not a valid `Command2`/`Operation` bit pattern, + // so `try_from_bytes` rejects the header. + file.write_all(&[0xFF_u8; HEADER_SIZE]).unwrap(); + // Sparse-extend past MAX_ENTRY_SIZE so the corruption at pos 0 + // sits far from EOF without writing 64 MiB. + file.set_len(MAX_ENTRY_SIZE + HEADER_SIZE as u64).unwrap(); + file.sync_all().unwrap(); + } + + let size_before = std::fs::metadata(&path).unwrap().len(); + let err = PrepareJournal::open(&path, 0).await; + assert!( + err.is_err(), + "mid-file corruption must hard-error, not truncate" + ); + let size_after = std::fs::metadata(&path).unwrap().len(); + assert_eq!( + size_before, size_after, + "a rejected mid-file scan must not truncate the WAL" + ); + } + #[compio::test] async fn iter_headers_from() { let dir = tempdir().unwrap(); @@ -637,7 +784,7 @@ mod tests { // Op 3 goes to slot 3 journal.append(make_prepare(3, 32)).await.unwrap(); // Mark op 3 as snapshotted - safe to evict - journal.set_snapshot_op(3); + journal.set_snapshot_op(3).unwrap(); // Op 3 + SLOT_COUNT goes to the same slot, evicting op 3 let wraparound_op = 3 + SLOT_COUNT as u64; journal diff --git a/core/message_bus/src/client_listener/mod.rs b/core/message_bus/src/client_listener/mod.rs index 2d8a9ea6c9..2eea83bd99 100644 --- a/core/message_bus/src/client_listener/mod.rs +++ b/core/message_bus/src/client_listener/mod.rs @@ -25,7 +25,7 @@ //! peer cannot block the accept loop. //! - The `on_accepted` callback is responsible for minting a //! `client_id`. The plain TCP and pre-upgrade WS paths route to the -//! owning shard via `ShardFramePayload::ClientConnectionSetup` and +//! owning shard via `shard::LifecycleFrame::ClientConnectionSetup` and //! `ClientWsConnectionSetup` respectively. TCP-TLS, WSS, and QUIC //! stay shard-0 terminal: their connection state is not serialisable //! 
so the callback installs locally on shard 0 instead of shipping a diff --git a/core/message_bus/src/client_listener/tcp.rs b/core/message_bus/src/client_listener/tcp.rs index e8b3ab8cec..eee380185f 100644 --- a/core/message_bus/src/client_listener/tcp.rs +++ b/core/message_bus/src/client_listener/tcp.rs @@ -44,7 +44,7 @@ use tracing::{debug, error, info}; pub async fn bind(addr: SocketAddr) -> Result<(TcpListener, SocketAddr), IggyError> { // `SO_REUSEPORT` intentionally not set: only shard 0 binds the client // listener. The shard-0 coordinator round-robins accepts to owning - // shards via `ShardFramePayload::ClientConnectionSetup`. + // shards via `shard::LifecycleFrame::ClientConnectionSetup`. let opts = SocketOpts::new().nodelay(true); let listener = TcpListener::bind_with_options(addr, &opts) .await diff --git a/core/message_bus/src/client_listener/tcp_tls.rs b/core/message_bus/src/client_listener/tcp_tls.rs index 7e70044248..756795e8cf 100644 --- a/core/message_bus/src/client_listener/tcp_tls.rs +++ b/core/message_bus/src/client_listener/tcp_tls.rs @@ -59,7 +59,7 @@ use tracing::{debug, error, info}; /// time; the install path re-applies on every accepted stream because /// Linux does not propagate listener options to accepted sockets. /// `SO_KEEPALIVE` is intentionally NOT set (see -/// [`crate::socket_opts`]): SDK clients manage their own keepalive +/// `socket_opts`): SDK clients manage their own keepalive /// policy at the application layer and replica<->replica liveness is /// observed by VSR heartbeats, so the kernel timer would only race the /// app-level signal. diff --git a/core/message_bus/src/client_listener/ws.rs b/core/message_bus/src/client_listener/ws.rs index 90a6aa2668..20775cbb96 100644 --- a/core/message_bus/src/client_listener/ws.rs +++ b/core/message_bus/src/client_listener/ws.rs @@ -20,7 +20,7 @@ //! Runs only on shard 0. The accept loop performs no protocol work //! beyond `TcpListener::accept`: every accepted stream is handed //! 
verbatim to the supplied callback, which dups the fd and ships a -//! `ShardFramePayload::ClientWsConnectionSetup` frame to the +//! `shard::LifecycleFrame::ClientWsConnectionSetup` frame to the //! round-robin-selected target shard. The HTTP-Upgrade handshake runs //! on the owning shard inside //! [`crate::ConnectionInstaller::install_client_ws_fd`]; no @@ -30,7 +30,7 @@ //! well-defined. The WS state machine only materialises after the //! upgrade, on the owning shard, where it can stay non-`Send`. //! -//! `ShardFramePayload::ClientWsConnectionSetup` is defined in +//! `shard::LifecycleFrame::ClientWsConnectionSetup` is defined in //! `core/shard/src/lib.rs`; the rustdoc cannot intra-link across crates //! without pulling `shard` in as a doc-only dep. @@ -47,7 +47,7 @@ use tracing::{debug, error, info}; /// /// Mirrors [`crate::client_listener::tcp::bind`] in shape: `TCP_NODELAY` /// on by default; `SO_KEEPALIVE` intentionally NOT set (see -/// [`crate::socket_opts`]). The receiving shard re-applies socket +/// `socket_opts`). The receiving shard re-applies socket /// options on the dup'd fd via the existing client-install path, so /// kernel-level options propagate end-to-end. /// @@ -58,7 +58,7 @@ use tracing::{debug, error, info}; pub async fn bind(addr: SocketAddr) -> Result<(TcpListener, SocketAddr), IggyError> { // `SO_REUSEPORT` intentionally not set: only shard 0 binds the WS // listener. The shard-0 coordinator round-robins accepts to owning - // shards via `ShardFramePayload::ClientWsConnectionSetup`. + // shards via `shard::LifecycleFrame::ClientWsConnectionSetup`. 
let opts = SocketOpts::new().nodelay(true); let listener = TcpListener::bind_with_options(addr, &opts) .await diff --git a/core/message_bus/src/error.rs b/core/message_bus/src/error.rs index 85ae33b7db..7d5fdd7df5 100644 --- a/core/message_bus/src/error.rs +++ b/core/message_bus/src/error.rs @@ -26,12 +26,26 @@ use thiserror::Error; /// The client- and replica-keyed variants separate three physically /// distinct failure modes so operators can tell apart routing bugs from /// transient disconnects: -/// - `*NotFound` / `*NotConnected` — the key is unknown to this shard's +/// - `*NotFound` / `*NotConnected` - the key is unknown to this shard's /// local registry (genuine peer state, often recoverable by reconnect). -/// - `*RouteMissing` — the bus cannot forward to the owning shard +/// - `*RouteMissing` - the bus cannot forward to the owning shard /// because no forward fn was installed (bootstrap ordering bug). -/// - `*ForwardFailed` — the forward fn rejected the frame (inter-shard +/// - `*ForwardFailed` - the forward fn rejected the frame (inter-shard /// queue full, shutdown, etc.). +/// +/// Recovery asymmetry between forward variants: +/// - `ReplicaForwardFailed` is recoverable in-protocol. The dropped frame +/// is a consensus message (`Prepare` / `PrepareOk` / view-change / +/// `Commit`); VSR's WAL-driven retransmit re-sends it once the inbox +/// drains. +/// - `ClientForwardFailed` is terminal. Client Reply payloads live above +/// the bus and have no retransmit path. A reply lost on full inbox is +/// gone, the client times out, and request / response semantics break. +/// Operators must size `inbox_capacity` for the worst-case cross-shard +/// reply burst and alert on the drop-site `tracing` logs. The +/// `frame_drops_total` `{variant="forward_client_send"}` counter is +/// the structured complement, scrape-able once a per-shard exporter +/// lands. 
#[derive(Debug, Error)] #[non_exhaustive] pub enum SendError { @@ -41,6 +55,11 @@ pub enum SendError { #[error("client {0}: no inter-shard forward fn installed")] ClientRouteMissing(u128), + /// The owning shard's inter-shard inbox rejected the forward frame + /// (full or disconnected). Unlike `ReplicaForwardFailed`, no VSR + /// retransmit covers this drop: the client Reply payload was moved + /// into the frame and is now gone. Callers must treat this as a + /// final failure for the affected request; the client will time out. #[error("client {0}: inter-shard forward failed")] ClientForwardFailed(u128), diff --git a/core/message_bus/src/installer/mod.rs b/core/message_bus/src/installer/mod.rs index 116f887ffe..6af7336578 100644 --- a/core/message_bus/src/installer/mod.rs +++ b/core/message_bus/src/installer/mod.rs @@ -55,7 +55,7 @@ use std::rc::Rc; use tracing::warn; /// Operations a shard needs to perform on its local bus when the router -/// receives an inter-shard connection-setup or mapping frame. +/// receives an inter-shard connection-setup frame. /// /// The production implementation is on `Rc`. The simulator /// does not exercise this path; if it ever does, add a no-op impl on @@ -87,14 +87,6 @@ pub trait ConnectionInstaller { /// subprotocol negotiation: the caller (server-ng) gates command /// access via the LOGIN allowlist. fn install_client_ws_fd(&self, fd: DupedFd, meta: ClientConnMeta, on_request: RequestHandler); - - /// Update the replica -> owning shard mapping used by the `send_to_replica` - /// slow path on non-owning shards. - fn set_shard_mapping(&self, replica: u8, owning_shard: u16); - - /// Forget the replica -> owning shard mapping (e.g. after a connection - /// loss, before the next allocate). 
- fn remove_shard_mapping(&self, replica: u8); } impl ConnectionInstaller for Rc { @@ -140,12 +132,4 @@ impl ConnectionInstaller for Rc { }); self.track_background(handle); } - - fn set_shard_mapping(&self, replica: u8, owning_shard: u16) { - IggyMessageBus::set_shard_mapping(self, replica, owning_shard); - } - - fn remove_shard_mapping(&self, replica: u8) { - IggyMessageBus::remove_shard_mapping(self, replica); - } } diff --git a/core/message_bus/src/installer/replica.rs b/core/message_bus/src/installer/replica.rs index 3c20fa0a3a..6e1d07cdfa 100644 --- a/core/message_bus/src/installer/replica.rs +++ b/core/message_bus/src/installer/replica.rs @@ -82,19 +82,20 @@ pub fn install_replica_conn( async_channel::bounded::>(bus.peer_queue_capacity()); // Writer and reader both observe abnormal close and used to fire - // `notify_connection_lost` twice per disconnect, causing shard 0 to - // broadcast two `ReplicaMappingClear` rounds and churn the mapping. + // `notify_connection_lost` twice per disconnect, double-clearing the + // owner-table slot and double-firing any installed test callback. // Shared one-shot guard: whichever post-loop runs first wins. let notified = Rc::new(Cell::new(false)); // If the registry insert below races with a concurrent install for // the same peer id and loses, both spawned halves must skip their // post-loop cleanup: the loser's `replicas().remove` / // `close_peer_if_token_matches` calls would no-op against the winner's - // generation token (so they can't evict the live entry), but - // `notify_connection_lost` has no token guard and would still broadcast - // a spurious mapping-clear round. `compio::runtime::JoinHandle::drop` - // does not cancel the spawned task, so we have to tell the tasks to - // stand down in-band. 
+ // generation token (so they can't evict the live entry), and + // `notify_connection_lost` stands down whenever a live registry entry + // exists - but the loser also drives `replica_dispatch_loop`, which + // must never hand the winner's replica id to `on_message`. + // `compio::runtime::JoinHandle::drop` does not cancel the spawned + // task, so we have to tell the tasks to stand down in-band. let install_aborted = Rc::new(Cell::new(false)); // Generation token published by the registry on a successful insert. @@ -266,6 +267,12 @@ pub fn install_replica_conn( ) { Ok(token) => { install_token.set(Some(token)); + // Stamp this shard as the owner so every shard's + // `send_to_replica` slow path can route to us immediately. The + // matching CAS-clear runs from + // `IggyMessageBus::notify_connection_lost` on either of the + // post-loop guards firing. + bus.mark_replica_owned(peer_id); } Err(rejected) => { // Tell both halves to stand down: the winner's entry is live and diff --git a/core/message_bus/src/installer/tcp_tls.rs b/core/message_bus/src/installer/tcp_tls.rs index d9468f6d0f..3497aac79a 100644 --- a/core/message_bus/src/installer/tcp_tls.rs +++ b/core/message_bus/src/installer/tcp_tls.rs @@ -41,7 +41,7 @@ use tracing::warn; /// handshake starts, matching [`super::tcp::install_client_tcp`]'s /// plaintext behaviour. Linux does not propagate it from the listener /// to accepted sockets, so toggling here is required. `SO_KEEPALIVE` -/// is intentionally NOT set; see [`crate::socket_opts`]. +/// is intentionally NOT set; see `socket_opts`. 
/// /// TCP-TLS is shard-0 terminal: the rustls connection state machine /// is non-serialisable and tied to the local task; pre-handshake the diff --git a/core/message_bus/src/installer/wss.rs b/core/message_bus/src/installer/wss.rs index 3ba359c7f6..32db26cb4c 100644 --- a/core/message_bus/src/installer/wss.rs +++ b/core/message_bus/src/installer/wss.rs @@ -41,7 +41,7 @@ use tracing::warn; /// /// `TCP_NODELAY` is applied pre-handshake for symmetry with /// [`super::tcp_tls::install_client_tcp_tls`]. `SO_KEEPALIVE` is -/// intentionally NOT set; see [`crate::socket_opts`]. +/// intentionally NOT set; see `socket_opts`. /// /// WSS is shard-0 terminal for the same reasons as the TCP-TLS plane; /// see [`super::tcp_tls::install_client_tcp_tls`] for the rustls diff --git a/core/message_bus/src/lib.rs b/core/message_bus/src/lib.rs index a927bf8c6a..9b1163d341 100644 --- a/core/message_bus/src/lib.rs +++ b/core/message_bus/src/lib.rs @@ -59,7 +59,7 @@ //! - fd-delegation ([`fd_transfer`]) is TCP-only. TLS / QUIC //! connections have no dupable plaintext fd, so shard 0 terminates //! and forwards `Frozen` over the existing -//! inter-shard flume. +//! inter-shard crossfire channel. //! - 0-RTT stays disabled by default on any future QUIC path. Per- //! command opt-in requires a checked-in idempotence audit. //! 
@@ -84,8 +84,7 @@ pub mod framing; pub mod installer; pub mod lifecycle; pub mod replica; -#[doc(hidden)] -pub mod socket_opts; +pub(crate) mod socket_opts; pub mod transports; pub use config::{IOV_MAX_LIMIT, MessageBusConfig, QuicTuning, WebSocketConfig}; @@ -96,7 +95,7 @@ pub use installer::conn_info::{ }; pub use lifecycle::{ BusMessage, BusReceiver, BusSender, ConnectionRegistry, DrainOutcome, FusedShutdown, - RejectedRegistration, ReplicaRegistry, Shutdown, ShutdownToken, + ReplicaRegistry, Shutdown, ShutdownToken, }; pub use transports::tls::TlsServerCredentials; @@ -105,26 +104,125 @@ use configs::server_ng::ServerNgConfig; use iggy_binary_protocol::consensus::MESSAGE_ALIGN; use iggy_binary_protocol::consensus::iobuf::Frozen; use iggy_binary_protocol::{GenericHeader, Message}; -use std::cell::RefCell; -use std::collections::HashMap; +use std::array; +use std::cell::{OnceCell, RefCell}; use std::net::SocketAddr; use std::rc::Rc; +use std::sync::Arc; +use std::sync::atomic::{AtomicU16, Ordering}; use std::time::Duration; -/// Callback for forwarding a consensus message to a remote shard. +/// Maximum number of replicas a single cluster supports. Replica ids are +/// `u8`, so the address space is 0..=255. +pub const MAX_REPLICAS: usize = 256; + +/// Sentinel stored in [`ReplicaOwnerTable`] slots that have no current owner. +/// +/// `u16::MAX` is reserved by the cluster bootstrap so it can never be a real +/// shard id: `bootstrap` rejects any cluster whose shard count does not fit +/// in `u16` or is `>= OWNER_NONE`, returning `InvalidShardsCount` rather +/// than minting a shard id that collides with this sentinel. +pub const OWNER_NONE: u16 = u16::MAX; + +/// Shared atomic owner table mapping `replica_id` to `owning_shard_id`. +/// +/// One Arc-cloned instance is allocated per cluster at bootstrap and +/// shared across every shard's [`IggyMessageBus`]. 
The owning shard +/// stamps its id into a slot when an inbound replica connection passes +/// the registry-insert race; the same shard CAS-clears the slot when +/// its connection dies (`notify_connection_lost`). Non-owning shards +/// only ever read. +/// +/// Lock-free by construction: every operation is a single relaxed-or- +/// stronger atomic on a fixed `[AtomicU16; MAX_REPLICAS]`. Sized for +/// 256 slots upfront, so install / disconnect cost is `O(1)` and no +/// hashing or allocation happens on the hot path. +/// +/// The table is the sole authority for cross-shard replica routing: +/// `send_to_replica`'s slow path and [`IggyMessageBus::owning_shard`] +/// both read it directly. No separate broadcast or reconcile loop is +/// involved. +pub struct ReplicaOwnerTable { + slots: [AtomicU16; MAX_REPLICAS], +} + +impl ReplicaOwnerTable { + /// Build a fresh table with every slot initialised to [`OWNER_NONE`]. + #[must_use] + pub fn new() -> Self { + Self { + slots: array::from_fn(|_| AtomicU16::new(OWNER_NONE)), + } + } + + /// Stamp `shard_id` as the current owner of `replica_id`. Overwrites + /// any previous owner; the caller is responsible for sequencing this + /// against the corresponding registry insert (in + /// [`installer::install_replica_conn`]). + pub fn set(&self, replica_id: u8, shard_id: u16) { + self.slots[usize::from(replica_id)].store(shard_id, Ordering::Release); + } + + /// Clear the slot iff its current owner is `shard_id`. Returns + /// `true` when the CAS succeeded. + /// + /// The CAS prevents a stale post-loop on shard A from clobbering a + /// slot that has since been re-registered on shard B. + pub fn clear_if_owned_by(&self, replica_id: u8, shard_id: u16) -> bool { + self.slots[usize::from(replica_id)] + .compare_exchange(shard_id, OWNER_NONE, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + } + + /// Read the current owner of `replica_id`. Returns `None` when the + /// slot is [`OWNER_NONE`]. 
+    #[must_use]
+    pub fn owner(&self, replica_id: u8) -> Option<u16> {
+        match self.slots[usize::from(replica_id)].load(Ordering::Acquire) {
+            OWNER_NONE => None,
+            owner => Some(owner),
+        }
+    }
+}
+
+impl Default for ReplicaOwnerTable {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Callback for forwarding a consensus replica message to the owning shard.
 ///
 /// Provided by the shard layer at bus construction and installed via
-/// [`IggyMessageBus::set_replica_forward_fn`] / [`IggyMessageBus::set_client_forward_fn`].
-/// Arguments: `(target_shard_id, message)`. Returns `Ok(())` on successful
-/// enqueue into the inter-shard channel, or [`SendError::RoutingFailed`] on
-/// `try_send` failure.
+/// [`IggyMessageBus::set_replica_forward_fn`]. Arguments:
+/// `(replica_id, owning_shard_id, message)`. Returns `Ok(())` on successful
+/// enqueue into the inter-shard channel.
+///
+/// `replica_id` is carried explicitly so the receiver can rebuild
+/// `ShardFrame::Lifecycle`'s `ForwardReplicaSend { replica_id, msg }`
+/// payload; the bus' slow path knows the id but the closure body, which lives
+/// in the shard layer, would otherwise have to re-derive it from the message.
 ///
 /// Asymmetry vs the `Rc` siblings (`AcceptedReplicaFn`,
 /// `AcceptedClientFn`, `ConnectionLostFn`): the forward fn has a single
 /// owner (the bus) and is installed once at bootstrap; `Box` suffices.
 /// The `Rc` siblings are shared with accept loops and connection tasks
 /// that outlive the caller and need independent ownership.
-pub type ShardForwardFn = Box<dyn Fn(u16, Vec<Frozen>) -> Result<(), SendError>>;
+pub type ReplicaForwardFn = Box<dyn Fn(u8, u16, Vec<Frozen>) -> Result<(), SendError>>;
+
+/// Callback for forwarding a client-bound message to the owning shard.
+///
+/// Provided by the shard layer at bus construction and installed via
+/// [`IggyMessageBus::set_client_forward_fn`]. Arguments:
+/// `(client_id, owning_shard_id, message)`. Returns `Ok(())` on successful
+/// enqueue into the inter-shard channel.
+/// +/// `owning_shard_id` is the value returned by [`client_id_owning_shard`]; +/// the slow path passes it pre-resolved so the closure does not redo the +/// shift. `client_id` is carried so the receiver can rebuild the +/// `ShardFrame::Lifecycle`'s `ForwardClientSend { client_id, msg }` +/// payload. +pub type ClientForwardFn = Box) -> Result<(), SendError>>; /// Callback invoked on every successful replica handshake. /// @@ -161,8 +259,14 @@ pub struct AcceptedQuicConn { impl AcceptedQuicConn { /// Bundle a freshly-accepted QUIC connection and its first /// bidirectional stream pair. + /// + /// `pub(crate)` by design: the constructor's signature mentions + /// `compio_quic::Connection`, `SendStream`, and `RecvStream`, all + /// kept off the bus's public `SemVer` surface for the same reason + /// [`Self::into_parts`] is crate-private. The QUIC listener in + /// [`crate::client_listener::quic`] is the only mint site. #[must_use] - pub const fn new( + pub(crate) const fn new( connection: compio_quic::Connection, streams: (compio_quic::SendStream, compio_quic::RecvStream), ) -> Self { @@ -220,7 +324,7 @@ pub type AcceptedQuicClientFn = std::rc::Rc; /// Fires after shard 0's WS listener accepts a raw TCP socket. The /// HTTP-Upgrade handshake has NOT run yet; the callback hands the /// raw stream off to the owning shard via inter-shard fd-shipping -/// (`ShardFramePayload::ClientWsConnectionSetup`). The owning shard +/// (`shard::LifecycleFrame::ClientWsConnectionSetup`). The owning shard /// runs the upgrade locally; no subprotocol is negotiated. The /// shipped fd is plain TCP at ship-time, so fd-delegation (which /// requires a dupable plaintext fd) stays well-defined. @@ -291,6 +395,34 @@ pub type ConnectionLostFn = std::rc::Rc; /// on first poll. Future transports (QUIC, TLS) that would introduce real /// yields must advertise that change in their own doc; consensus code /// relies on the no-yield property to reason about reentrancy. 
+/// +/// # Recovery asymmetry between `send_to_replica` and `send_to_client` +/// +/// `send_to_replica` carries VSR-covered frames (`Prepare` / `PrepareOk` +/// / view-change / `Commit`). A `ReplicaForwardFailed` on full +/// inter-shard inbox is recovered by VSR retransmit and is +/// informational. +/// `send_to_client` carries the final Reply payload to the originating +/// client; it has no in-protocol retransmit. A `ClientForwardFailed` is +/// terminal: the client never receives the reply and times out. +/// Operators must size `inbox_capacity` for the worst-case cross-shard +/// reply burst. Each drop is logged at the drop site via `tracing`; +/// alert on those logs today. The `frame_drops_total` +/// `{variant="forward_client_send"}` counter is the structured +/// complement and becomes scrape-able once a per-shard exporter lands. +/// The symmetric `{variant="forward_replica_send"}` counter stays +/// informational only. +/// +/// # Setter install contract +/// +/// The three setters below do NOT share one install rule: +/// - [`Self::set_connection_lost_fn`] is re-installable (`RefCell` +/// slot); it is a test observation hook some setups swap mid-test. +/// - [`Self::set_replica_forward_fn`] and [`Self::set_client_forward_fn`] +/// are one-shot (`OnceCell` slot) and panic on a second install; +/// they are bootstrap-only wiring invariants. +/// +/// A bus impl must preserve this divergence - see each method. pub trait MessageBus { fn send_to_client( &self, @@ -305,15 +437,50 @@ pub trait MessageBus { ) -> impl Future>; /// Install a notifier the bus will invoke when a delegated replica - /// connection dies abnormally. Used by shard-0 bootstrap to push a - /// `ShardFramePayload::ConnectionLost` into shard 0's inbox so the - /// coordinator can forget the replica's mapping. 
- /// - /// Default impl is a no-op for buses that do not delegate real - /// connections (simulator, test doubles); the production - /// `IggyMessageBus` overrides it to wire into its internal - /// `connection_lost_fn` hook. - fn set_connection_lost_fn(&self, _f: ConnectionLostFn) {} + /// connection dies abnormally. Production wiring leaves this unset + /// because the cluster-shared [`ReplicaOwnerTable`] is CAS-cleared + /// inside `IggyMessageBus::notify_connection_lost` before the + /// callback runs, so cross-shard routing recovers without any + /// follow-up message. Test code installs a callback to assert the + /// notifier path fires exactly once per disconnect. + /// + /// No default: every bus must explicitly opt in or stub. A silent + /// no-op default previously masked a wiring bug where shard 0 never + /// learned a delegated replica died. + /// + /// Re-installing is allowed by design - the backing slot is a + /// `RefCell`, so a second call overwrites the first. This is the + /// deliberate exception to the install rules of + /// [`Self::set_replica_forward_fn`] / [`Self::set_client_forward_fn`]: + /// the notifier is a test observation hook some setups swap + /// mid-test, not a one-shot bootstrap invariant. + fn set_connection_lost_fn(&self, f: ConnectionLostFn); + + /// Install the replica-plane inter-shard forward closure. + /// + /// No default: every bus must explicitly opt in or stub. Mirrors the + /// reason `set_connection_lost_fn` is required - a silent no-op + /// default could mask a wiring bug where cross-shard replica sends + /// drop on the floor without anyone noticing. + /// + /// # Panics + /// + /// Panics on a second install on the same bus. Unlike + /// [`Self::set_connection_lost_fn`], the forward closure is a + /// one-shot bootstrap invariant backed by a `OnceCell`, so a + /// double-install is a wiring bug rather than a supported re-bind. 
+ fn set_replica_forward_fn(&self, f: ReplicaForwardFn); + + /// Install the client-plane inter-shard forward closure. + /// + /// No default: every bus must explicitly opt in or stub. Same + /// rationale as [`Self::set_replica_forward_fn`]. + /// + /// # Panics + /// + /// Panics on a second install on the same bus, same one-shot + /// `OnceCell` invariant as [`Self::set_replica_forward_fn`]. + fn set_client_forward_fn(&self, f: ClientForwardFn); } /// Production message bus backed by real TCP connections. @@ -330,7 +497,10 @@ pub trait MessageBus { /// - `send_to_*` clones the per-peer `Sender` out of the registry, calls /// `try_send` on it, and returns. No `.await` happens in the body. Drops /// on `Full` (returned as [`SendError::Backpressure`]) are recovered by -/// VSR retransmission. +/// VSR retransmission for replica-bound consensus traffic. Client Reply +/// drops returned as [`SendError::ClientForwardFailed`] have no +/// retransmit and are terminal; see the trait-level "Recovery +/// asymmetry" section on [`MessageBus`]. /// - The per-connection writer task batches up to `config.max_batch` messages into /// a single `writev` syscall on the plaintext TCP plane (see /// [`transports::tcp`]). @@ -345,25 +515,30 @@ pub struct IggyMessageBus { replicas: ReplicaRegistry, background_tasks: RefCell>>, config: MessageBusConfig, - /// For each replica, the shard that owns the TCP connection from this - /// shard's perspective. Present on ALL shards (owning and non-owning). - /// On the owning shard, `shard_mapping[replica] == self.shard_id`. - shard_mapping: RefCell>, /// Forwards a replica-bound message to the shard that owns the replica's - /// TCP connection. `None` on single-shard setups and tests. - replica_forward_fn: RefCell>, + /// TCP connection. Empty on single-shard setups and tests. Installed + /// once at bootstrap via [`Self::set_replica_forward_fn`]; the slow + /// path reads through [`OnceCell::get`] without a runtime borrow. 
+ replica_forward_fn: OnceCell, /// Forwards a client-bound message to the shard that owns the client's /// TCP connection (the owning shard is encoded in the top 16 bits of - /// the client id). `None` on single-shard setups and tests. - client_forward_fn: RefCell>, - /// Invoked by a delegated replica connection's writer / reader tasks - /// when they exit abnormally. `None` when running without a shard-0 - /// coordinator (single-shard deployments and tests). + /// the client id). Empty on single-shard setups and tests. Installed + /// once at bootstrap via [`Self::set_client_forward_fn`]; the slow + /// path reads through [`OnceCell::get`] without a runtime borrow. + client_forward_fn: OnceCell, + /// Optional disconnect notifier invoked by + /// `Self::notify_connection_lost` after it CAS-clears the owner + /// table. Production leaves this unset; tests install a callback to + /// observe the path. connection_lost_fn: RefCell>, /// Per-connection metadata exposed to the caller via /// [`Self::client_meta`]. Populated by the install path on /// successful registry insert; removed on connection teardown. client_meta: RefCell>>, + /// Cluster-shared atomic owner table. Stamped by the install path + /// after a successful registry insert; CAS-cleared by + /// `Self::notify_connection_lost`. See [`ReplicaOwnerTable`]. + owner_table: Arc, } impl IggyMessageBus { @@ -408,6 +583,19 @@ impl IggyMessageBus { Self::with_tunables(shard_id, MessageBusConfig::from(cfg)) } + /// Production constructor for clusters: same as [`Self::with_config`] + /// but takes a pre-allocated [`ReplicaOwnerTable`] Arc. Bootstrap + /// allocates one table per cluster and clones the Arc into every + /// shard so all buses see the same atomic slots. 
+ #[must_use] + pub fn with_config_and_owner_table( + shard_id: u16, + cfg: &ServerNgConfig, + owner_table: Arc, + ) -> Self { + Self::with_tunables_and_owner_table(shard_id, MessageBusConfig::from(cfg), owner_table) + } + /// Construct a bus from already-derived runtime tunables. /// /// Used by the public constructors above and by tests that need to @@ -420,6 +608,24 @@ impl IggyMessageBus { /// `config.max_batch > IOV_MAX_LIMIT`. #[must_use] pub fn with_tunables(shard_id: u16, config: MessageBusConfig) -> Self { + Self::with_tunables_and_owner_table(shard_id, config, Arc::new(ReplicaOwnerTable::new())) + } + + /// Construct a bus with explicit runtime tunables and a pre-allocated + /// owner table. Cluster bootstrap uses this so every shard's bus + /// shares the same atomic slots; tests use [`Self::with_tunables`] + /// which allocates a fresh table per bus. + /// + /// # Panics + /// + /// Panics if `config.max_batch == 0` or + /// `config.max_batch > IOV_MAX_LIMIT`. + #[must_use] + pub fn with_tunables_and_owner_table( + shard_id: u16, + config: MessageBusConfig, + owner_table: Arc, + ) -> Self { assert!( config.max_batch > 0 && config.max_batch <= IOV_MAX_LIMIT, "MessageBusConfig::max_batch must be in 1..={IOV_MAX_LIMIT} (writev IOV_MAX/2 cap); got {}", @@ -434,14 +640,36 @@ impl IggyMessageBus { replicas: ReplicaRegistry::new(), background_tasks: RefCell::new(Vec::new()), config, - shard_mapping: RefCell::new(HashMap::new()), - replica_forward_fn: RefCell::new(None), - client_forward_fn: RefCell::new(None), + replica_forward_fn: OnceCell::new(), + client_forward_fn: OnceCell::new(), connection_lost_fn: RefCell::new(None), client_meta: RefCell::new(ahash::AHashMap::new()), + owner_table, } } + /// Clone of the cluster-shared owner table Arc. + #[must_use] + pub fn owner_table(&self) -> Arc { + Arc::clone(&self.owner_table) + } + + /// Stamp this bus' shard id into the owner table for `replica_id`. 
+ /// Called by [`installer::install_replica_conn`] after a successful + /// registry insert. + pub fn mark_replica_owned(&self, replica_id: u8) { + self.owner_table.set(replica_id, self.shard_id); + } + + /// CAS-clear the owner-table slot for `replica_id` if its current + /// owner is this bus' shard id. The CAS prevents a stale post-loop + /// from clobbering a slot that has since been re-registered on a + /// different shard. + pub fn clear_replica_owned(&self, replica_id: u8) -> bool { + self.owner_table + .clear_if_owned_by(replica_id, self.shard_id) + } + /// Look up the per-connection metadata recorded for `client_id`. /// /// Returns `None` if the client never connected on this bus or if @@ -472,6 +700,31 @@ impl IggyMessageBus { /// [`Self::set_connection_lost_fn`] (which takes a `borrow_mut`) /// without tripping the runtime borrow check. pub(crate) fn notify_connection_lost(&self, replica_id: u8) { + // Stand down if a live registry entry exists for this replica. A + // reconnect that round-robined back to this same shard installs a + // fresh entry before this stale post-loop runs; the owner-table + // CAS in `clear_replica_owned` only guards *different*-shard + // re-registration, so a same-shard reinstall leaves the slot + // == self.shard_id and the clear below would clobber the live + // connection's slot, stranding cross-shard `send_to_replica` + // forever. Every caller removes its own registry entry before + // invoking this, so a live entry here is always a newer install + // that now owns the slot. No token needed: the bus is + // single-threaded (compio), so this check, the clear, and a + // competing `install_replica_conn` cannot interleave. + if self.replicas.contains(replica_id) { + return; + } + // CAS-clear first so any reader racing the user callback already + // sees the slot vacated. The CAS no-ops if a different shard + // already owns the slot (post-loop arriving after a + // re-installation on another shard). 
+ self.clear_replica_owned(replica_id); + tracing::warn!( + shard_id = self.shard_id, + replica_id, + "replica connection lost" + ); let cb = self .connection_lost_fn .borrow() @@ -493,11 +746,14 @@ impl IggyMessageBus { /// /// # Panics /// - /// Panics on re-entrant `RefCell::borrow_mut` if called from inside - /// an in-flight forward-closure invocation. All production call sites - /// are bootstrap only (single-threaded compio, no re-entry). - pub fn set_replica_forward_fn(&self, f: ShardForwardFn) { - *self.replica_forward_fn.borrow_mut() = Some(f); + /// Panics if called more than once on the same bus. The bootstrap is + /// the sole caller and runs exactly once per bus; a second install + /// signals a wiring bug we want to surface loudly. + pub fn set_replica_forward_fn(&self, f: ReplicaForwardFn) { + self.replica_forward_fn + .set(f) + .ok() + .expect("replica_forward_fn installed twice (bootstrap invariant)"); } /// Install the client-plane inter-shard forward closure. @@ -510,41 +766,23 @@ impl IggyMessageBus { /// /// # Panics /// - /// Panics on re-entrant `RefCell::borrow_mut` if called from inside - /// an in-flight forward-closure invocation. - pub fn set_client_forward_fn(&self, f: ShardForwardFn) { - *self.client_forward_fn.borrow_mut() = Some(f); - } - - /// Update the shard mapping for a replica. - /// - /// Called when the allocation strategy assigns or reassigns connections. - /// - /// # Panics - /// - /// Panics on re-entrant `RefCell::borrow_mut` if called while - /// [`IggyMessageBus::owning_shard`] (or any other read-side of - /// `shard_mapping`) holds an outstanding read borrow on the same - /// bus instance. - pub fn set_shard_mapping(&self, replica: u8, owning_shard: u16) { - self.shard_mapping - .borrow_mut() - .insert(replica, owning_shard); + /// Panics if called more than once on the same bus. Same rationale as + /// [`Self::set_replica_forward_fn`]. 
+ pub fn set_client_forward_fn(&self, f: ClientForwardFn) { + self.client_forward_fn + .set(f) + .ok() + .expect("client_forward_fn installed twice (bootstrap invariant)"); } - /// Remove all shard mappings for a replica (on deallocation). + /// Get the owning shard for a replica, if any. /// - /// # Panics - /// - /// Same re-entrant-`borrow_mut` constraint as - /// [`IggyMessageBus::set_shard_mapping`]. - pub fn remove_shard_mapping(&self, replica: u8) { - self.shard_mapping.borrow_mut().remove(&replica); - } - - /// Get the owning shard for a replica, if mapped. + /// Reads the cluster-shared [`ReplicaOwnerTable`], which is stamped + /// synchronously by the owning shard's installer and CAS-cleared on + /// disconnect inside `Self::notify_connection_lost`. + #[must_use] pub fn owning_shard(&self, replica: u8) -> Option { - self.shard_mapping.borrow().get(&replica).copied() + self.owner_table.owner(replica) } #[must_use] @@ -713,6 +951,14 @@ impl MessageBus for std::rc::Rc { fn set_connection_lost_fn(&self, f: ConnectionLostFn) { (**self).set_connection_lost_fn(f); } + + fn set_replica_forward_fn(&self, f: ReplicaForwardFn) { + (**self).set_replica_forward_fn(f); + } + + fn set_client_forward_fn(&self, f: ClientForwardFn) { + (**self).set_client_forward_fn(f); + } } #[allow(clippy::future_not_send)] @@ -737,11 +983,12 @@ impl MessageBus for IggyMessageBus { Err(_msg) => Err(SendError::ClientNotFound(client_id)), }; } - let forward = self.client_forward_fn.borrow(); - let forward = forward - .as_ref() + let forward = self + .client_forward_fn + .get() .ok_or(SendError::ClientRouteMissing(client_id))?; - forward(owning_shard, message).map_err(|_| SendError::ClientForwardFailed(client_id)) + forward(client_id, owning_shard, message) + .map_err(|_| SendError::ClientForwardFailed(client_id)) } async fn send_to_replica( @@ -760,22 +1007,43 @@ impl MessageBus for IggyMessageBus { Err(message) => message, }; // Slow path: route via the inter-shard channel to the owning 
shard. + // The cluster-shared `owner_table` is the authoritative routing + // view: `mark_replica_owned` stamps it on install and + // `clear_replica_owned` CAS-clears it on disconnect. let owning_shard = self - .shard_mapping - .borrow() - .get(&replica) - .copied() + .owner_table + .owner(replica) .ok_or(SendError::ReplicaNotConnected(replica))?; - let forward = self.replica_forward_fn.borrow(); - let forward = forward - .as_ref() + // The fast path already failed (no live registry slot), so a + // self-owner here means the disconnect is mid-flight: + // `close_peer_if_token_matches` took the slot but + // `clear_replica_owned` has not yet run. Forwarding now would + // `try_send` a `ForwardReplicaSend` onto our own inbox and the + // pump would re-enter this path - a self-inbox ping-pong on the + // consensus hot path. Treat as not connected, mirroring the + // `send_to_client` self-owner guard. + if owning_shard == self.shard_id { + return Err(SendError::ReplicaNotConnected(replica)); + } + let forward = self + .replica_forward_fn + .get() .ok_or(SendError::ReplicaRouteMissing(replica))?; - forward(owning_shard, message).map_err(|_| SendError::ReplicaForwardFailed(replica)) + forward(replica, owning_shard, message) + .map_err(|_| SendError::ReplicaForwardFailed(replica)) } fn set_connection_lost_fn(&self, f: ConnectionLostFn) { Self::set_connection_lost_fn(self, f); } + + fn set_replica_forward_fn(&self, f: ReplicaForwardFn) { + Self::set_replica_forward_fn(self, f); + } + + fn set_client_forward_fn(&self, f: ClientForwardFn) { + Self::set_client_forward_fn(self, f); + } } /// Extract the owning shard from a client id. 
@@ -823,16 +1091,20 @@ mod tests { let bus = IggyMessageBus::new(5); let captured: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); let captured_clone = captured.clone(); - bus.set_client_forward_fn(Box::new(move |target, _msg| { + let client_id = (7u128 << 112) | 0x2a; + let captured_id: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); + let captured_id_clone = captured_id.clone(); + bus.set_client_forward_fn(Box::new(move |id, target, _msg| { + *captured_id_clone.borrow_mut() = Some(id); *captured_clone.borrow_mut() = Some(target); Ok(()) })); - let client_id = (7u128 << 112) | 0x2a; bus.send_to_client(client_id, dummy_message().into_frozen()) .await .expect("forward_fn should accept"); assert_eq!(*captured.borrow(), Some(7)); + assert_eq!(*captured_id.borrow(), Some(client_id)); } #[compio::test] @@ -863,21 +1135,25 @@ mod tests { #[compio::test] #[allow(clippy::future_not_send)] - async fn send_to_replica_slow_path_uses_shard_mapping_and_forwards() { + async fn send_to_replica_slow_path_uses_owner_table_and_forwards() { let bus = IggyMessageBus::new(0); let captured: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); let captured_clone = captured.clone(); - bus.set_replica_forward_fn(Box::new(move |target, _msg| { + let captured_id: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); + let captured_id_clone = captured_id.clone(); + bus.set_replica_forward_fn(Box::new(move |id, target, _msg| { + *captured_id_clone.borrow_mut() = Some(id); *captured_clone.borrow_mut() = Some(target); Ok(()) })); - bus.set_shard_mapping(5, 3); + bus.owner_table().set(5, 3); bus.send_to_replica(5, dummy_message().into_frozen()) .await .expect("forward ok"); assert_eq!(*captured.borrow(), Some(3)); + assert_eq!(*captured_id.borrow(), Some(5)); } #[compio::test] @@ -891,6 +1167,24 @@ mod tests { assert!(matches!(err, SendError::ReplicaNotConnected(9))); } + /// Disconnect-race window: the registry slot is already gone (fast + /// path misses) but `owner_table` still 
points at this shard because + /// `clear_replica_owned` has not run yet. The slow path must treat a + /// self-owner as not connected rather than forwarding the frame onto + /// its own inbox - mirrors the `send_to_client` self-owner guard. + #[compio::test] + #[allow(clippy::future_not_send)] + async fn send_to_replica_slow_path_self_owner_returns_not_connected() { + let bus = IggyMessageBus::new(3); + bus.owner_table().set(5, 3); + + let err = bus + .send_to_replica(5, dummy_message().into_frozen()) + .await + .unwrap_err(); + assert!(matches!(err, SendError::ReplicaNotConnected(5))); + } + #[compio::test] #[allow(clippy::future_not_send)] async fn client_id_owning_shard_extracts_top_16_bits() { @@ -990,6 +1284,43 @@ mod tests { assert_eq!(counter.get(), 11); } + /// Same-shard reconnect race: a stale post-loop from a dead connection + /// must not CAS-clear the owner-table slot that a live reinstall + /// (round-robined back to this shard) just stamped. The slot is only + /// released once no live registry entry remains. + #[compio::test] + #[allow(clippy::future_not_send)] + async fn notify_connection_lost_stands_down_when_replica_reregistered() { + let bus = IggyMessageBus::new(3); + + // Live reinstall: registry entry present + owner table stamped. + let (tx, _rx) = async_channel::bounded(8); + let writer = compio::runtime::spawn(async {}); + let reader = compio::runtime::spawn(async {}); + let (conn_shutdown, _conn_token) = Shutdown::new(); + bus.replicas() + .insert(7, tx, writer, reader, conn_shutdown) + .expect("insert ok"); + bus.mark_replica_owned(7); + + // Stale predecessor post-loop fires while the reinstall is live. + bus.notify_connection_lost(7); + assert_eq!( + bus.owning_shard(7), + Some(3), + "stale post-loop cleared a re-registered replica's owner slot", + ); + + // Genuine disconnect: no live entry, the slot is released. 
+ bus.replicas().remove(7); + bus.notify_connection_lost(7); + assert_eq!( + bus.owning_shard(7), + None, + "owner slot must clear once no live connection remains", + ); + } + #[compio::test] #[allow(clippy::future_not_send)] async fn shutdown_loop_drains_tasks_added_during_shutdown() { diff --git a/core/message_bus/src/transports/mod.rs b/core/message_bus/src/transports/mod.rs index fab08207c9..a141ba04b6 100644 --- a/core/message_bus/src/transports/mod.rs +++ b/core/message_bus/src/transports/mod.rs @@ -70,7 +70,7 @@ //! - fd-delegation (`F_DUPFD_CLOEXEC`) stays TCP-only and lives outside //! this trait (see [`crate::fd_transfer`]). Other transports terminate //! on shard 0 and forward `Frozen` over the inter-shard -//! flume via [`crate::ShardForwardFn`]. +//! crossfire channel via [`crate::ReplicaForwardFn`] / [`crate::ClientForwardFn`]. //! //! Per-transport impls live in //! `transports/{tcp,tcp_tls,ws,wss,quic}.rs`. diff --git a/core/metadata/src/impls/metadata.rs b/core/metadata/src/impls/metadata.rs index 3e5487b488..047b54745b 100644 --- a/core/metadata/src/impls/metadata.rs +++ b/core/metadata/src/impls/metadata.rs @@ -303,11 +303,18 @@ impl std::fmt::Display for RegisterSubmitError { impl std::error::Error for RegisterSubmitError {} pub struct IggyMetadata { - /// Some on shard0, None on other shards + /// `Some` on shard 0, `None` on other shards. Server-ng bootstrap + /// holds the invariant: only shard 0 owns the metadata consensus + /// replica; every other shard reconstructs `mux_stm` from the + /// snapshot + replayed journal during recovery and then drops both. pub consensus: Option, - /// Some on shard0, None on other shards + /// `Some` on shard 0, `None` on other shards. 
Shard 0 owns the WAL + /// writer (via `PrepareJournal::open`); non-owning shards open the + /// WAL read-only during recovery for replay only and must drop the + /// handle afterwards so a `drain()` rename on shard 0 cannot strand + /// a stale inode visible through their RO fd. pub journal: Option, - /// Some on shard0, None on other shards + /// `Some` on shard 0, `None` on other shards. pub snapshot: Option, /// State machine - lives on all shards pub mux_stm: M, @@ -705,9 +712,13 @@ where .send_to_client(prepare_header.client, reply_buffers) .await { - warn!( - "on_ack: failed to send reply to client={}: {e}", - prepare_header.client + error!( + client = prepare_header.client, + op = prepare_header.op, + request_id = prepare_header.request, + operation = ?prepare_header.operation, + %e, + "client reply forward failed, no retransmit path; client will time out", ); } } diff --git a/core/metadata/src/impls/recovery.rs b/core/metadata/src/impls/recovery.rs index 3d8d0d436c..06987f062d 100644 --- a/core/metadata/src/impls/recovery.rs +++ b/core/metadata/src/impls/recovery.rs @@ -98,6 +98,9 @@ pub struct RecoveredMetadata { /// 4. Replay journal entries from the first post-snapshot op through the state machine /// 5. Return the assembled `RecoveredMetadata` /// +/// Only the owning shard (shard 0) should call this. Peer shards receive +/// a `ReadHandleFactory` bundle from shard 0 and skip WAL access entirely. +/// /// # Errors /// Returns `RecoveryError` if snapshot loading, journal opening, or replay fails. #[allow(clippy::future_not_send)] @@ -110,7 +113,6 @@ where let metadata_dir = data_dir.join(super::METADATA_DIR); std::fs::create_dir_all(&metadata_dir)?; - // 1. Load snapshot if present. let snapshot_path = metadata_dir.join("snapshot.bin"); let snapshot = if snapshot_path.exists() { Some(IggySnapshot::load(&snapshot_path)?) @@ -121,24 +123,17 @@ where .as_ref() .map_or(0, |snapshot| snapshot.sequence_number() + 1); - // 2. 
Restore state machine from snapshot, or start from a fresh empty state. let mux_stm = match snapshot.as_ref() { Some(snapshot) => M::restore_snapshot(snapshot.snapshot())?, None => M::default(), }; - // 3. Open journal, scan the WAL and build index let journal_path = metadata_dir.join("journal.wal"); - let journal = PrepareJournal::open( - &journal_path, - snapshot.as_ref().map_or(0, IggySnapshot::sequence_number), - ) - .await?; - - // 4. Replay journal entries after snapshot. - // Intentional fail-fast for now: if replay hits a bad entry, recovery - // aborts and the operator must repair or truncate the WAL before the - // node can boot again. + let watermark = snapshot.as_ref().map_or(0, IggySnapshot::sequence_number); + let journal = PrepareJournal::open(&journal_path, watermark).await?; + + // Intentional fail-fast: a bad entry aborts recovery and the operator + // must repair or truncate the WAL before the node can boot again. let headers_to_replay = journal.iter_headers_from(replay_from); let mut last_applied_op: Option = None; diff --git a/core/metadata/src/stm/mod.rs b/core/metadata/src/stm/mod.rs index bde07b02e6..9e5eb2e243 100644 --- a/core/metadata/src/stm/mod.rs +++ b/core/metadata/src/stm/mod.rs @@ -23,11 +23,13 @@ pub mod user; use bytes::Bytes; use iggy_common::Either; -use left_right::{Absorb, ReadHandle, WriteHandle}; +use left_right::{Absorb, ReadHandle, ReadHandleFactory, WriteHandle}; use std::cell::Cell; use std::cell::UnsafeCell; use std::sync::Arc; +pub use left_right::ReadHandleFactory as LeftRightFactory; + pub struct WriteCell where T: Absorb, @@ -113,6 +115,30 @@ where let guard = self.read.enter().expect("read handle should be accessible"); f(&*guard) } + + /// Produce a `Send + Sync` factory that can mint additional [`ReadHandle`]s + /// on other threads. Use this to share the read side across compio shard + /// runtimes without sharing the (`!Sync`) [`ReadHandle`] itself. 
+ #[must_use] + pub fn factory(&self) -> ReadHandleFactory { + self.read.factory() + } + + /// Build a reader-only [`LeftRight`] from a factory minted on the owning + /// thread. The new [`ReadHandle`] is materialised here (the caller's + /// thread), keeping the `!Sync` discipline local. `write` stays `None`, + /// so `do_apply` panics if invoked on a reader-only instance. + /// + /// Takes the factory by value so the constructor reads as ownership + /// transfer from the owning shard to the peer-shard reader. + #[must_use] + #[allow(clippy::needless_pass_by_value)] + pub fn from_factory(factory: ReadHandleFactory) -> Self { + Self { + write: None, + read: Arc::new(factory.handle()), + } + } } impl From for LeftRight::Cmd> @@ -256,6 +282,41 @@ macro_rules! define_state { [<$state Inner>]::new().into() } } + + impl $state { + /// Mint a `Send + Sync` [`ReadHandleFactory`] for this state. + /// Allows the read side to be carried across shard threads + /// without sharing the underlying `!Sync` [`ReadHandle`]. + #[must_use] + pub fn factory(&self) -> $crate::stm::LeftRightFactory<[<$state Inner>]> { + self.inner.factory() + } + + /// Construct a reader-only state wrapper from a factory minted + /// by the writer-side shard. The thread that calls this owns + /// the resulting [`ReadHandle`]; calling `apply` on the + /// returned wrapper panics because `write` is `None`. 
+ #[must_use] + pub fn from_factory( + factory: $crate::stm::LeftRightFactory<[<$state Inner>]>, + ) -> Self { + Self { + inner: $crate::stm::LeftRight::from_factory(factory), + } + } + } + + impl $crate::stm::mux::WithFactory for $state { + type Bundle = $crate::stm::LeftRightFactory<[<$state Inner>]>; + + fn factory_bundle(&self) -> Self::Bundle { + self.factory() + } + + fn from_factory_bundle(bundle: Self::Bundle) -> Self { + Self::from_factory(bundle) + } + } } }; } diff --git a/core/metadata/src/stm/mux.rs b/core/metadata/src/stm/mux.rs index 8097cc750d..c9a0d1e6a8 100644 --- a/core/metadata/src/stm/mux.rs +++ b/core/metadata/src/stm/mux.rs @@ -22,6 +22,49 @@ use iggy_common::variadic; use crate::stm::{State, StateMachine}; +/// Cross-thread read-side handoff for a state machine. +/// +/// The owning shard (typically shard 0) materialises a `Bundle` after +/// recovering the WAL and hands one clone to every peer shard. Peer +/// shards then call [`WithFactory::from_factory_bundle`] on their own +/// runtime to construct a reader-mode state machine — no WAL replay, +/// no shared `!Sync` `ReadHandle`, just `Send + Sync` factories per +/// state. +/// +/// The variadic tuple impl below recurses over the state list, so a +/// `MuxStateMachine` ends up +/// with `Bundle = (Users::Bundle, (Streams::Bundle, (ConsumerGroups::Bundle, ())))`. 
+pub trait WithFactory: Sized { + type Bundle: Clone + Send + Sync; + fn factory_bundle(&self) -> Self::Bundle; + fn from_factory_bundle(bundle: Self::Bundle) -> Self; +} + +impl WithFactory for () { + type Bundle = (); + fn factory_bundle(&self) -> Self::Bundle {} + fn from_factory_bundle((): Self::Bundle) -> Self {} +} + +impl WithFactory for variadic!(S, ...Rest) +where + S: WithFactory, + Rest: WithFactory, +{ + type Bundle = (S::Bundle, Rest::Bundle); + + fn factory_bundle(&self) -> Self::Bundle { + (self.0.factory_bundle(), self.1.factory_bundle()) + } + + fn from_factory_bundle(bundle: Self::Bundle) -> Self { + ( + S::from_factory_bundle(bundle.0), + Rest::from_factory_bundle(bundle.1), + ) + } +} + // MuxStateMachine that proxies to an tuple of variadic state machines #[derive(Debug)] pub struct MuxStateMachine @@ -55,6 +98,26 @@ where } } +impl MuxStateMachine +where + T: StateMachine + WithFactory, +{ + /// Mint a factory bundle from the owning state machine so peer shards + /// can rebuild a reader-mode mirror on their own runtimes. + #[must_use] + pub fn factory_bundle(&self) -> T::Bundle { + self.inner.factory_bundle() + } + + /// Reconstruct a reader-mode `MuxStateMachine` from a bundle minted + /// on the writer-side shard. Must be invoked on the destination + /// runtime so the inner `!Sync` `ReadHandle`s are created in place. 
+ #[must_use] + pub fn from_factory_bundle(bundle: T::Bundle) -> Self { + Self::new(T::from_factory_bundle(bundle)) + } +} + impl StateMachine for MuxStateMachine where T: StateMachine, diff --git a/core/partitions/src/iggy_partition.rs b/core/partitions/src/iggy_partition.rs index 4a82aaec8a..f57bd790bd 100644 --- a/core/partitions/src/iggy_partition.rs +++ b/core/partitions/src/iggy_partition.rs @@ -1416,14 +1416,14 @@ where .send_to_client(prepare_header.client, reply_buffers) .await { - warn!( + tracing::error!( target: "iggy.partitions.diag", plane = "partitions", client = prepare_header.client, op = prepare_header.op, namespace_raw, %error, - "failed to send reply to client" + "client reply forward failed, no retransmit path; client will time out", ); } } diff --git a/core/server-ng/Cargo.toml b/core/server-ng/Cargo.toml index d11700a198..3b130fdb4f 100644 --- a/core/server-ng/Cargo.toml +++ b/core/server-ng/Cargo.toml @@ -43,7 +43,6 @@ ignored = [ "err_trail", "error_set", "figlet-rs", - "flume", "fs2", "hash32", "human-repr", @@ -105,6 +104,7 @@ clap = { workspace = true } compio = { workspace = true } configs = { workspace = true } consensus = { workspace = true } +crossfire = { workspace = true } ctrlc = { workspace = true } cyper = { workspace = true } cyper-axum = { workspace = true } @@ -113,7 +113,6 @@ dotenvy = { workspace = true } err_trail = { workspace = true } error_set = { workspace = true } figlet-rs = { workspace = true } -flume = { workspace = true } fs2 = { workspace = true } hash32 = { workspace = true } human-repr = { workspace = true } diff --git a/core/server-ng/config.toml b/core/server-ng/config.toml index 7d8b51123a..77261181c2 100644 --- a/core/server-ng/config.toml +++ b/core/server-ng/config.toml @@ -589,7 +589,8 @@ ports = { tcp = 8091, quic = 8081, http = 3001, websocket = 8093, tcp_replica = # - numa settings: # + "numa:auto": Use all available numa node, cores # + "numa:nodes=0,1;cores=4;no_ht=true": Use NUMA node 0 and 1, each 
nodes use 4 cores, and no hyperthreads -cpu_allocation = "numa:auto" +# TODO(hubcio): revert to "numa:auto" once multi-shard server-ng is stable. +cpu_allocation = 1 [websocket] enabled = true diff --git a/core/server-ng/src/bootstrap.rs b/core/server-ng/src/bootstrap.rs index 816a464f5e..9f61db9efc 100644 --- a/core/server-ng/src/bootstrap.rs +++ b/core/server-ng/src/bootstrap.rs @@ -20,6 +20,7 @@ use crate::config_writer::write_current_config; use crate::server_error::ServerNgError; use configs::server_ng::ServerNgConfig; +use configs::sharding::INBOX_CAPACITY_MAX; use consensus::{LocalPipeline, PartitionsHandle, Sequencer, VsrConsensus}; use iggy_binary_protocol::RequestHeader; use iggy_common::sharding::{IggyNamespace, PartitionLocation, ShardId}; @@ -30,9 +31,7 @@ use iggy_common::{ use journal::Journal; use journal::prepare_journal::PrepareJournal; use message_bus::client_listener::{self, RequestHandler}; -use message_bus::fd_transfer; use message_bus::installer; -use message_bus::installer::ConnectionInstaller; use message_bus::installer::conn_info::{ClientConnMeta, ClientTransportKind}; use message_bus::replica::io as replica_io; use message_bus::replica::listener::{self as replica_listener, MessageHandler}; @@ -42,13 +41,14 @@ use message_bus::transports::tls::{ }; use message_bus::{ AcceptedClientFn, AcceptedQuicClientFn, AcceptedReplicaFn, AcceptedTlsClientFn, - AcceptedWsClientFn, IggyMessageBus, connector, + AcceptedWsClientFn, IggyMessageBus, ReplicaOwnerTable, connector, }; use metadata::IggyMetadata; use metadata::MuxStateMachine; use metadata::impls::metadata::{IggySnapshot, StreamsFrontend}; use metadata::impls::recovery::recover; use metadata::stm::consumer_group::ConsumerGroups; +use metadata::stm::mux::WithFactory; use metadata::stm::snapshot::Snapshot; use metadata::stm::stream::{Partition, Streams}; use metadata::stm::user::Users; @@ -56,38 +56,53 @@ use partitions::{ IggyIndexWriter, IggyPartition, IggyPartitions, MessagesWriter, 
PartitionsConfig, Segment, }; // TODO: decouple bootstrap/storage helpers and logging from the `server` crate. -use server::bootstrap::create_directories; +use server::bootstrap::{create_directories, create_shard_executor}; use server::log::logger::Logging; +use server::shard_allocator::{ShardAllocator, ShardInfo}; use server::streaming::partitions::storage::{load_consumer_group_offsets, load_consumer_offsets}; use server::streaming::segments::storage::create_segment_storage; use shard::builder::IggyShardBuilder; -use shard::shards_table::PapayaShardsTable; +use shard::metrics::ShardMetrics; +use shard::shards_table::{PapayaShardsTable, calculate_shard_assignment}; use shard::{ - CoordinatorConfig, IggyShard, PartitionConsensusConfig, ShardIdentity, channel, shard_channel, + CoordinatorConfig, IggyShard, PartitionConsensusConfig, Receiver as ShardReceiver, ShardFrame, + ShardIdentity, TaggedSender, channel, shard_cluster_channels, }; -use std::cell::{Cell, RefCell}; -use std::future::Future; +use std::cell::RefCell; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; use std::rc::{Rc, Weak}; use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::thread; +use std::time::Duration; use tracing::{error, info, warn}; const CLUSTER_ID: u128 = 1; -const SHARD_ID: u16 = 0; const SHARD_REPLICA_ID: u8 = 0; -const SHARD_NAME: &str = "server-ng-shard-0"; -const SHARD_INBOX_CAPACITY: usize = 1024; +/// Bus shutdown drain timeout. Picked larger than typical TCP RTT × +/// in-flight write-batch to give writers their full last +/// `write_vectored_all` budget before the connection registry kicks in. +const SHARD_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(10); +/// Watchdog poll cadence for the cross-thread shutdown flag. 50ms keeps +/// Ctrl-C latency operator-visible without measurable wakeup overhead. 
+const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(50); type ServerNgMuxStateMachine = MuxStateMachine; + +/// Cross-thread bundle carrying one `ReadHandleFactory` per metadata +/// state. Shard 0 mints one after `recover()` and broadcasts a clone to +/// every peer shard; each peer rebuilds a reader-mode +/// [`ServerNgMuxStateMachine`] on its own runtime, skipping the WAL. +type ServerNgMetadataBundle = ::Bundle; + type ServerNgMetadata = IggyMetadata< VsrConsensus>, PrepareJournal, IggySnapshot, ServerNgMuxStateMachine, >; -type ServerNgShard = IggyShard< +pub type ServerNgShard = IggyShard< Rc, PrepareJournal, IggySnapshot, @@ -97,6 +112,149 @@ type ServerNgShard = IggyShard< type ServerNgShardHandle = Rc>>>; +/// Result of a multi-shard bootstrap. +/// +/// Carries the cross-thread shutdown flag and one OS-thread `JoinHandle` +/// per shard. The caller flips the flag via [`Self::install_ctrlc_handler`] +/// and then drains the cluster via [`Self::join_all`]. +pub struct ClusterHandles { + shutdown_flag: Arc, + shard_threads: Vec<(u16, thread::JoinHandle>)>, +} + +impl ClusterHandles { + /// Install a SIGINT/Ctrl-C handler that flips the shutdown flag on + /// the first signal. A second signal is logged but otherwise + /// ignored so an in-flight WAL fsync or replica drain runs to + /// completion. + /// + /// # Errors + /// + /// Returns the underlying `ctrlc::Error` if the handler cannot be + /// installed (typically because another handler already owns the + /// signal). + pub fn install_ctrlc_handler(&self) -> Result<(), ctrlc::Error> { + let flag = Arc::clone(&self.shutdown_flag); + ctrlc::set_handler(move || { + if flag.swap(true, Ordering::Relaxed) { + // Second Ctrl-C: leave the shutdown machinery to drain. + // Refusing to abort here keeps the WAL fsync / replica + // drain from being interrupted mid-frame. 
+ warn!("second Ctrl-C ignored; cluster is already shutting down"); + } else { + info!("Ctrl-C received; signalling cluster shutdown"); + } + }) + } + + /// Drain every shard thread, returning the first error observed (a + /// thread-level `Result::Err` or a panic). Clean exits log a single + /// info line per shard. + /// + /// # Errors + /// + /// Returns the first per-shard error: either a `Result::Err` returned + /// by `shard_main` or a `ServerNgError::ShardThreadPanicked` carrying + /// the panicking shard's id. + pub fn join_all(self) -> Result<(), ServerNgError> { + let mut first_error: Option = None; + for (shard_id, handle) in self.shard_threads { + match handle.join() { + Ok(Ok(())) => { + info!(shard_id, "shard thread exited cleanly"); + } + Ok(Err(error)) => { + error!(shard_id, error = %error, "shard thread returned error"); + if first_error.is_none() { + first_error = Some(error); + } + } + Err(panic_payload) => { + let message = panic_payload_to_string(&*panic_payload); + error!(shard_id, message = %message, "shard thread panicked"); + if first_error.is_none() { + first_error = + Some(ServerNgError::ShardThreadPanicked { shard_id, message }); + } + } + } + } + first_error.map_or(Ok(()), Err) + } +} + +/// Best-effort extraction of the panic message from a +/// `Box` returned by `JoinHandle::join`. Tries the two +/// payload shapes the standard library guarantees (`&'static str` and +/// `String`) and falls back to a placeholder so the panic still surfaces +/// in the error chain. +fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String { + if let Some(s) = payload.downcast_ref::<&'static str>() { + return (*s).to_string(); + } + if let Some(s) = payload.downcast_ref::() { + return s.clone(); + } + "".to_string() +} + +/// Flips the cross-thread shutdown flag on `Drop` unless disarmed. 
+/// +/// A shard thread that exits via an error `?` or a panic unwind would +/// otherwise leave sibling shards parked forever on `bus.token().wait()`: +/// their watchdogs never observe the flag and the bus has no +/// `Drop`-triggered shutdown. Arming this for the whole thread body makes +/// every non-clean exit drive sibling-shard teardown. Disarmed only on a +/// clean `Ok(())`. +struct ShutdownOnDrop { + flag: Arc, + armed: bool, +} + +impl ShutdownOnDrop { + const fn new(flag: Arc) -> Self { + Self { flag, armed: true } + } + + const fn disarm(&mut self) { + self.armed = false; + } +} + +impl Drop for ShutdownOnDrop { + fn drop(&mut self) { + if self.armed { + self.flag.store(true, Ordering::Relaxed); + } + } +} + +/// Shard-local end of the metadata bundle handoff. +/// +/// Shard 0 owns the WAL writer and runs `recover()` to build the only +/// `WriteHandle`-bearing [`ServerNgMuxStateMachine`]. It then mints a +/// [`ServerNgMetadataBundle`] (a tuple of `Send + Sync` +/// `ReadHandleFactory`s) and pushes one clone per peer onto `bundle_tx`. +/// Every other shard receives the bundle and rebuilds a reader-mode +/// `MuxStateMachine` on its own runtime - no WAL access, no replay, no +/// `RecoverySync` two-phase fence. Phase 2 of the old handshake was +/// only there to keep peer scans away from shard 0's torn-tail repair; +/// with no peer scan that race is structurally gone. +/// +/// The channel is bounded to the peer count so shard 0's `send` never +/// blocks beyond a peer drain. A peer that dies before recv drops its +/// `bundle_rx`, so shard 0's `send` eventually sees a disconnected +/// channel; the cross-thread shutdown flag drives every waiter out of +/// its `recv` loop if shard 0 panics before broadcasting. 
+enum MetadataHandoff { + Owner { + bundle_tx: crossfire::MAsyncTx>, + }, + Waiter { + bundle_rx: crossfire::MAsyncRx>, + }, +} + struct TcpTopology { self_replica_id: u8, replica_count: u8, @@ -123,58 +281,6 @@ struct BoundClientListeners { quic: Option, } -pub trait RunServerNg { - fn run( - &self, - config: &ServerNgConfig, - current_replica_id: Option, - ) -> impl Future>; -} - -impl RunServerNg for Rc { - /// Run the fully bootstrapped `server-ng` shard. - /// - /// # Errors - /// - /// Returns an error if TCP listener bootstrap fails or cluster TCP - /// addresses cannot be resolved from config. - async fn run( - &self, - config: &ServerNgConfig, - current_replica_id: Option, - ) -> Result<(), ServerNgError> { - let topology = resolve_tcp_topology(config, current_replica_id)?; - let (stop_tx, stop_rx) = channel(1); - let message_pump_shard = Self::clone(self); - let message_pump_handle = compio::runtime::spawn(async move { - message_pump_shard.run_message_pump(stop_rx).await; - }); - self.bus.track_background(message_pump_handle); - - let on_replica_message = make_replica_message_handler(self); - let on_client_request = make_client_request_handler(self); - let accepted_replica = make_local_replica_accept_fn(&self.bus, on_replica_message); - let accepted_client = make_local_client_accept_fns(&self.bus, on_client_request); - - info!( - shard = self.id, - partitions = self.plane.partitions().len(), - "server-ng shard initialized" - ); - - if let Err(error) = - start_tcp_runtime(self, config, &topology, accepted_replica, accepted_client).await - { - let _ = stop_tx.try_send(()); - return Err(error); - } - - self.bus.token().wait().await; - let _ = stop_tx.try_send(()); - Ok(()) - } -} - /// Load config, prepare directories, and complete late logging init. 
/// /// # Errors @@ -205,92 +311,474 @@ pub async fn load_config(logging: &mut Logging) -> Result, -) -> Result, ServerNgError> { - let topology = resolve_tcp_topology(config, current_replica_id)?; - let bus = Rc::new(IggyMessageBus::with_config(SHARD_ID, config)); - let recovered = recover::(Path::new(&config.system.path)) +) -> Result { + let allocator = ShardAllocator::new(&config.system.sharding.cpu_allocation) + .map_err(ServerNgError::ShardAllocator)?; + let assignments = allocator + .to_shard_assignments() + .map_err(ServerNgError::ShardAllocator)?; + let shards_count = assignments.len(); + if shards_count == 0 { + return Err(ServerNgError::InvalidShardsCount { count: 0 }); + } + // Shard ids index `ReplicaOwnerTable` slots as `u16`. `OWNER_NONE` + // (`u16::MAX`) is reserved as the empty-slot sentinel, so a cluster + // with `u16::MAX` shards would mint a shard id that collides with + // the sentinel and an owner-table lookup could never tell that shard + // apart from an unowned slot. Reject at boot so the invariant is held + // by the type system above this line, not by hoping the operator + // never configures 65535 cores worth of shards. + let total_shards = match u16::try_from(shards_count) { + Ok(count) if count < message_bus::OWNER_NONE => count, + _ => { + return Err(ServerNgError::InvalidShardsCount { + count: shards_count, + }); + } + }; + + // Re-check the full valid range, not just the zero floor: a caller + // that built the config without running `ShardingConfig::validate` + // would otherwise OOM at boot allocating an oversized inbox channel. 
+ let inbox_capacity = config.system.sharding.inbox_capacity; + if inbox_capacity == 0 || inbox_capacity > INBOX_CAPACITY_MAX { + return Err(ServerNgError::InvalidInboxCapacity { + value: inbox_capacity, + max: INBOX_CAPACITY_MAX, + }); + } + + let (senders, mut inboxes) = shard_cluster_channels(total_shards, inbox_capacity); + let shutdown_flag = Arc::new(AtomicBool::new(false)); + let config = Arc::new(config); + // One owner table per cluster, Arc-cloned into every shard's bus so + // any shard's bus reads the same atomic slots that the owning + // shard's installer / disconnect path writes. + let owner_table = Arc::new(ReplicaOwnerTable::new()); + + // Single-shot bundle handoff (see `MetadataHandoff`): shard 0 sends + // one cloned `ServerNgMetadataBundle` per peer; each peer drains + // exactly one. Bounded to the peer count so shard 0's broadcast + // never blocks past a peer drain, and so a single-shard cluster + // (zero peers, crossfire clamps capacity to 1) can still allocate + // the channel without minting a useless slot. If a peer dies before + // recv, shard 0's `send` eventually sees a disconnected channel; + // the cross-thread shutdown flag drives every waiter out of its + // recv loop if shard 0 panics before broadcasting. 
+ let metadata_peers = shards_count.saturating_sub(1); + let (metadata_bundle_tx, metadata_bundle_rx) = + crossfire::mpmc::bounded_async::(metadata_peers); + + let mut shard_threads: Vec<(u16, thread::JoinHandle>)> = + Vec::with_capacity(shards_count); + for (idx, assignment) in assignments.into_iter().enumerate() { + #[allow(clippy::cast_possible_truncation)] + let shard_id = idx as u16; + let inbox = inboxes[idx] + .take() + .expect("shard_cluster_channels populates every inbox slot exactly once"); + let senders_for_shard = senders.clone(); + let config_for_shard = Arc::clone(&config); + let shutdown_flag_for_shard = Arc::clone(&shutdown_flag); + let owner_table_for_shard = Arc::clone(&owner_table); + let metadata_handoff_for_shard = if shard_id == 0 { + MetadataHandoff::Owner { + bundle_tx: metadata_bundle_tx.clone(), + } + } else { + MetadataHandoff::Waiter { + bundle_rx: metadata_bundle_rx.clone(), + } + }; + + let handle = match thread::Builder::new() + .name(format!("shard-{shard_id}")) + .spawn(move || -> Result<(), ServerNgError> { + run_shard_thread( + shard_id, + total_shards, + current_replica_id, + assignment, + senders_for_shard, + inbox, + config_for_shard, + shutdown_flag_for_shard, + metadata_handoff_for_shard, + owner_table_for_shard, + ) + }) { + Ok(handle) => handle, + Err(source) => { + // Signal every shard already spawned before propagating, so + // their watchdog loops drive `bus.shutdown(...)` and the + // process can exit instead of hanging on stuck OS threads. + shutdown_flag.store(true, Ordering::Relaxed); + // Drop bootstrap's own channel clones before joining + // survivors. Otherwise a peer waiting on `bundle_rx.recv` + // would never observe the sender side disconnecting and + // would hang until the shutdown watchdog kicks the bus. 
+ drop(metadata_bundle_tx); + drop(metadata_bundle_rx); + for (_, survivor) in shard_threads { + let _ = survivor.join(); + } + return Err(ServerNgError::ShardSpawnFailed { shard_id, source }); + } + }; + shard_threads.push((shard_id, handle)); + } + + // Drop bootstrap's own channel clones now that every shard owns its + // half. Keeping them on bootstrap's stack would deadlock a peer + // whose `bundle_rx.recv` only completes once every sender + // disconnects. + drop(metadata_bundle_tx); + drop(metadata_bundle_rx); + + info!( + shards_count, + "server-ng cluster bootstrap dispatched; awaiting shard runtimes" + ); + + Ok(ClusterHandles { + shutdown_flag, + shard_threads, + }) +} + +/// Per-shard OS thread entry. Pins CPU + memory, builds the compio +/// runtime, and `block_on`s `shard_main`. +#[allow(clippy::needless_pass_by_value, clippy::too_many_arguments)] +fn run_shard_thread( + shard_id: u16, + total_shards: u16, + replica_id: Option, + assignment: ShardInfo, + senders: Vec, + inbox: ShardReceiver, + config: Arc, + shutdown_flag: Arc, + metadata_handoff: MetadataHandoff, + owner_table: Arc, +) -> Result<(), ServerNgError> { + // Armed for the whole thread body: a post-spawn error `?` or a panic + // unwind here must flip `shutdown_flag` so sibling watchdogs drive + // their bus shutdown instead of parking forever on `bus.token().wait()`. + let mut shutdown_guard = ShutdownOnDrop::new(Arc::clone(&shutdown_flag)); + + assignment + .bind_cpu() + .map_err(|source| ServerNgError::CpuAffinityFailed { shard_id, source })?; + assignment + .bind_memory() + .map_err(|source| ServerNgError::MemoryAffinityFailed { shard_id, source })?; + + // TODO(hubcio): decouple runtime creation from the `server` crate + // (mirrors the identical TODO in `main.rs`). Reusing legacy here so + // server-ng and the legacy server share one io_uring tuning surface. 
+ let runtime = create_shard_executor() + .map_err(|source| ServerNgError::ShardRuntimeCreateFailed { shard_id, source })?; + + let result = runtime.block_on(async move { + shard_main( + shard_id, + total_shards, + replica_id, + senders, + inbox, + &config, + shutdown_flag, + metadata_handoff, + owner_table, + ) .await - .map_err(ServerNgError::MetadataRecovery)?; - let restored_op = recovered.last_applied_op.unwrap_or_else(|| { - recovered - .snapshot - .as_ref() - .map_or(0, IggySnapshot::sequence_number) }); + if result.is_ok() { + shutdown_guard.disarm(); + } + result +} + +/// Per-shard async lifecycle. Builds the bus, recovers metadata, +/// constructs the `IggyShard` for this shard's slice of partitions, +/// wires listeners on shard 0, and runs the message pump until +/// shutdown. +#[allow(clippy::too_many_arguments, clippy::too_many_lines)] +async fn shard_main( + shard_id: u16, + total_shards: u16, + replica_id: Option, + senders: Vec, + inbox: ShardReceiver, + config: &ServerNgConfig, + shutdown_flag: Arc, + metadata_handoff: MetadataHandoff, + owner_table: Arc, +) -> Result<(), ServerNgError> { + let topology = resolve_tcp_topology(config, replica_id)?; + let bus = Rc::new(IggyMessageBus::with_config_and_owner_table( + shard_id, + config, + owner_table, + )); + + let shutdown_flag_for_handoff = Arc::clone(&shutdown_flag); + spawn_shutdown_watchdog(Rc::clone(&bus), shutdown_flag); + + // Metadata bootstrap is single-writer: shard 0 owns the WAL and the + // only `WriteHandle`-bearing `MuxStateMachine`. Peer shards receive + // a `ReadHandleFactory` bundle on the inter-thread channel and + // rebuild a reader-mode `MuxStateMachine` on their own runtime - no + // WAL access, no replay. Writes still funnel through shard 0's + // metadata VSR; per-commit `publish()` (in `WriteCell::apply`) + // bounds reader staleness to one op. 
+ let data_dir = Path::new(&config.system.path); + let (mux_stm, owner_state) = match metadata_handoff { + MetadataHandoff::Owner { bundle_tx } => { + let recovered = recover::(data_dir) + .await + .map_err(ServerNgError::MetadataRecovery)?; + broadcast_metadata_bundle( + shard_id, + &bundle_tx, + recovered.mux_stm.factory_bundle(), + total_shards.saturating_sub(1), + &shutdown_flag_for_handoff, + ) + .await?; + ( + recovered.mux_stm, + Some(( + recovered.journal, + recovered.snapshot, + recovered.last_applied_op, + )), + ) + } + MetadataHandoff::Waiter { bundle_rx } => { + let bundle = + await_metadata_bundle(shard_id, &bundle_rx, &shutdown_flag_for_handoff).await?; + (ServerNgMuxStateMachine::from_factory_bundle(bundle), None) + } + }; + + // Metadata consensus + journal + snapshot live only on shard 0. + // `IggyShard::tick_metadata` short-circuits when `consensus.is_none()`, + // so peer shards have no caller that reads `journal` or `snapshot`. + let (metadata_consensus, journal_for_metadata, snapshot_for_metadata) = + if let Some((journal, snapshot, last_applied_op)) = owner_state { + let restored_op = last_applied_op + .unwrap_or_else(|| snapshot.as_ref().map_or(0, IggySnapshot::sequence_number)); + let consensus = restore_metadata_consensus( + &journal, + restored_op, + topology.self_replica_id, + topology.replica_count, + Rc::clone(&bus), + ); + (Some(consensus), Some(journal), snapshot) + } else { + (None, None, None) + }; let metadata = ServerNgMetadata::new( - Some(restore_metadata_consensus( - &recovered.journal, - restored_op, - topology.self_replica_id, - topology.replica_count, - Rc::clone(&bus), - )), - Some(recovered.journal), - recovered.snapshot, - recovered.mux_stm, + metadata_consensus, + journal_for_metadata, + snapshot_for_metadata, + mux_stm, Some(PathBuf::from(&config.system.path)), ); - let shard = build_single_shard(config, &topology, metadata, bus).await?; - info!(shard = shard.id, "server-ng bootstrap complete"); - Ok(shard) -} + let 
shard_metrics = ShardMetrics::for_shard(shard_id); + let shard = build_shard_for_thread( + shard_id, + total_shards, + config, + &topology, + metadata, + Rc::clone(&bus), + senders, + inbox, + shard_metrics, + ) + .await?; -fn restore_metadata_consensus( - journal: &PrepareJournal, - restored_op: u64, - self_replica_id: u8, - replica_count: u8, - bus: Rc, -) -> VsrConsensus> { - let mut consensus = VsrConsensus::new( - CLUSTER_ID, - self_replica_id, - replica_count, - 0, - bus, - LocalPipeline::new(), + info!( + shard = shard_id, + partitions = shard.plane.partitions().len(), + "server-ng shard initialized" ); - let last_header = journal - .last_op() - .and_then(|op| usize::try_from(op).ok()) - .and_then(|op| journal.header(op).map(|header| *header)); - if let Some(header) = last_header { - consensus.set_view(header.view); + let (stop_tx, stop_rx) = channel(1); + let pump_shard = Rc::clone(&shard); + let pump_handle = compio::runtime::spawn(async move { + pump_shard.run_message_pump(stop_rx).await; + }); + bus.track_background(pump_handle); + + // Listeners (replica + every client transport) bind on shard 0 only. + // Shard 0's coordinator round-robins inbound TCP/WS connections to + // peer shards via fd-transfer. QUIC and TCP-TLS clients terminate + // locally on shard 0 (their per-connection state is non-portable - + // see `LifecycleFrame::ClientWsConnectionSetup` rustdoc). + // + // No phase-2 listener fence is needed: peer shards no longer scan + // the WAL, so a shard-0 append accepted mid-boot cannot race a + // peer's `truncate_or_fail`. The factory-bundle handoff has already + // installed reader-mode `MuxStateMachine`s on every peer by the + // time shard 0 returns from `broadcast_metadata_bundle`. 
+ if shard_id == 0 { + let coord = shard + .coordinator() + .expect("shard 0 always has a coordinator attached by the builder"); + let on_client_request = make_client_request_handler(&shard); + let accepted_replica = make_delegating_replica_accept_fn(Rc::clone(&coord)); + let accepted_client = make_shard_zero_client_accept_fns(coord, &bus, on_client_request); + + if let Err(error) = + start_tcp_runtime(&shard, config, &topology, accepted_replica, accepted_client).await + { + let _ = stop_tx.try_send(()); + return Err(error); + } } - consensus.init(); - consensus.sequencer().set_sequence(restored_op); - // Known gap: clustered bootstrap does not yet persist a durable commit - // watermark. Until that exists, recovery assumes the not-yet-supported - // case where replicas are not rejoining with divergent commit state. - consensus.restore_commit_state(restored_op, restored_op); - if let Some(header) = last_header { - consensus.set_last_prepare_checksum(header.checksum); + bus.token().wait().await; + let _ = stop_tx.try_send(()); + + info!(shard = shard_id, "server-ng shard exited cleanly"); + Ok(()) +} + +/// Block until shard 0 broadcasts the metadata factory bundle, or the +/// cross-thread shutdown flag flips. The recv future is polled in a +/// `SHUTDOWN_POLL_INTERVAL` loop so a shard 0 that panics before it +/// broadcasts cannot strand peer shards: the shutdown path flips the +/// flag, every waiter observes it on the next tick, and the cluster +/// tears down instead of hanging. 
+async fn await_metadata_bundle( + shard_id: u16, + bundle_rx: &crossfire::MAsyncRx>, + shutdown_flag: &Arc, +) -> Result { + loop { + match compio::time::timeout(SHUTDOWN_POLL_INTERVAL, bundle_rx.recv()).await { + Ok(Ok(bundle)) => return Ok(bundle), + Ok(Err(_)) => return Err(ServerNgError::MetadataHandoffAborted { shard_id }), + Err(_) => { + if shutdown_flag.load(Ordering::Relaxed) { + return Err(ServerNgError::MetadataHandoffAborted { shard_id }); + } + } + } } +} - consensus +/// Push `peers` cloned bundles onto `bundle_tx`, polling each send in a +/// `SHUTDOWN_POLL_INTERVAL` loop so the cross-thread shutdown flag can +/// interrupt a stalled handoff. Symmetric to [`await_metadata_bundle`]: +/// shutdown observed mid-handshake aborts cleanly rather than stalling +/// on a `send` future that can no longer make progress. +async fn broadcast_metadata_bundle( + shard_id: u16, + bundle_tx: &crossfire::MAsyncTx>, + bundle: ServerNgMetadataBundle, + peers: u16, + shutdown_flag: &Arc, +) -> Result<(), ServerNgError> { + for _ in 0..peers { + loop { + match compio::time::timeout(SHUTDOWN_POLL_INTERVAL, bundle_tx.send(bundle.clone())) + .await + { + Ok(Ok(())) => break, + Ok(Err(_)) => { + // Every peer dropped its `bundle_rx` before recv; the + // cluster is tearing itself down. Stop sending and + // let the shutdown path handle teardown. + return Ok(()); + } + Err(_) => { + if shutdown_flag.load(Ordering::Relaxed) { + return Err(ServerNgError::MetadataHandoffAborted { shard_id }); + } + } + } + } + } + Ok(()) } -async fn build_single_shard( +/// Spawn a per-shard polling task that watches the cross-thread shutdown +/// flag and triggers this shard's bus shutdown on transition. The flag +/// is the only Send signal we have; the bus' shutdown machinery is +/// `!Send` (`Rc>` + per-shard `async_channel`), so it must be +/// triggered from within the runtime that owns the bus. 
+#[allow(clippy::needless_pass_by_value)] +fn spawn_shutdown_watchdog(bus: Rc, shutdown_flag: Arc) { + let bus_for_task = Rc::clone(&bus); + let bus_token = bus.token(); + let watchdog = compio::runtime::spawn(async move { + loop { + if shutdown_flag.load(Ordering::Relaxed) { + break; + } + if bus_token.is_triggered() { + // Bus shutdown was driven from elsewhere (e.g. internal + // failure path). Watchdog has nothing left to do. + return; + } + compio::time::sleep(SHUTDOWN_POLL_INTERVAL).await; + } + let _ = bus_for_task.shutdown(SHARD_SHUTDOWN_TIMEOUT).await; + }); + watchdog.detach(); +} + +#[allow(clippy::too_many_arguments, clippy::too_many_lines)] +async fn build_shard_for_thread( + shard_id: u16, + total_shards: u16, config: &ServerNgConfig, topology: &TcpTopology, metadata: ServerNgMetadata, bus: Rc, + senders: Vec, + inbox: ShardReceiver, + metrics: ShardMetrics, ) -> Result, ServerNgError> { - let shard_id = ShardId::new(SHARD_ID); - let partition_count = metadata.mux_stm.streams().read(|inner| { + let shard_local_id = ShardId::new(shard_id); + let total_partitions = metadata.mux_stm.streams().read(|inner| { inner .items .iter() @@ -301,10 +789,20 @@ async fn build_single_shard( .map(|(_, topic)| topic.partitions.len()) .sum::() }) - .sum() + .sum::() }); + + // IggyPartitions holds only the partitions owned by this shard + // (see the filter below at insert time), so the cluster total is + // an N-fold overshoot. `ceil(total / shards) * 2` is a coarse upper + // bound that absorbs hash skew without paying the full multiplier. + // PapayaShardsTable below stays sized to the cluster total because + // every shard routes every namespace. 
+ let owned_partitions_capacity = total_partitions + .div_ceil(usize::from(total_shards).max(1)) + .saturating_mul(2); let mut partitions = IggyPartitions::with_capacity( - shard_id, + shard_local_id, PartitionsConfig { messages_required_to_save: config.system.partition.messages_required_to_save, size_of_messages_required_to_save: config @@ -314,13 +812,13 @@ async fn build_single_shard( enforce_fsync: config.system.partition.enforce_fsync, segment_size: config.system.segment.size, }, - partition_count, + owned_partitions_capacity, ); - let shards_table = PapayaShardsTable::with_capacity(partition_count); + let shards_table = PapayaShardsTable::with_capacity(total_partitions); let (topic_stats, namespaces) = metadata.mux_stm.streams().read(|inner| { let mut topic_stats = Vec::new(); - let mut namespaces = Vec::with_capacity(partition_count); + let mut namespaces = Vec::with_capacity(total_partitions); for (_, stream) in &inner.items { for (topic_id, topic) in &stream.topics { topic_stats.push(topic.stats.clone()); @@ -332,6 +830,12 @@ async fn build_single_shard( (topic_stats, namespaces) }); + // Every shard owns its own `Arc` after recovery (the + // `mux_stm` is per-shard, no sharing). Each shard must zero from the + // snapshot total before re-adding its own per-partition deltas; + // otherwise the non-zero shards inherit the cluster-wide totals and + // decrement them into the parent `StreamStats`, leaving every + // per-shard exporter / query inconsistent. 
for topic_stats in topic_stats { topic_stats.zero_out_all(); } @@ -339,30 +843,40 @@ async fn build_single_shard( for (stream_id, topic_id, topic_stats, partition_metadata) in namespaces { validate_recovered_namespace(config, stream_id, topic_id, partition_metadata.id)?; let namespace = IggyNamespace::new(stream_id, topic_id, partition_metadata.id); - let partition = load_partition( - config, - namespace, - topic_stats, - &partition_metadata, - topology.self_replica_id, - topology.replica_count, - Rc::clone(&bus), - ) - .await?; - let local_idx = partitions.insert(namespace, partition); - shards_table.insert( - namespace, - PartitionLocation::new(shard_id, LocalIdx::new(*local_idx)), - ); + let owning_shard = calculate_shard_assignment(&namespace, u32::from(total_shards)); + if owning_shard == shard_id { + let partition = load_partition( + config, + namespace, + topic_stats, + &partition_metadata, + topology.self_replica_id, + topology.replica_count, + Rc::clone(&bus), + ) + .await?; + let local_idx = partitions.insert(namespace, partition); + shards_table.insert( + namespace, + PartitionLocation::new(ShardId::new(owning_shard), LocalIdx::new(*local_idx)), + ); + } else { + // Non-owning entry: only `shard_id` is consulted by the + // router; `local_idx` is meaningful only on the owning + // shard, so a sentinel `0` is safe here. 
+ shards_table.insert( + namespace, + PartitionLocation::new(ShardId::new(owning_shard), LocalIdx::new(0)), + ); + } } - let (sender, inbox) = shard_channel::<()>(SHARD_ID, SHARD_INBOX_CAPACITY); - let senders = vec![sender]; let shard_handle = Rc::new(RefCell::new(None)); let on_replica_message = make_deferred_replica_message_handler(&shard_handle); let on_client_request = make_deferred_client_request_handler(&shard_handle); + let shard_name = format!("server-ng-shard-{shard_id}"); let built = IggyShardBuilder::new( - ShardIdentity::new(SHARD_ID, SHARD_NAME.to_string()), + ShardIdentity::new(shard_id, shard_name), Rc::clone(&bus), on_replica_message, on_client_request, @@ -373,18 +887,52 @@ async fn build_single_shard( shards_table, PartitionConsensusConfig::new(CLUSTER_ID, topology.replica_count, Rc::clone(&bus)), CoordinatorConfig::default(), - bus.token(), + metrics, ) .build(); - if let Some(refresh_task) = built.refresh_task { - bus.track_background(refresh_task); - } let shard = Rc::new(built.shard); *shard_handle.borrow_mut() = Some(Rc::downgrade(&shard)); Ok(shard) } +fn restore_metadata_consensus( + journal: &PrepareJournal, + restored_op: u64, + self_replica_id: u8, + replica_count: u8, + bus: Rc, +) -> VsrConsensus> { + let mut consensus = VsrConsensus::new( + CLUSTER_ID, + self_replica_id, + replica_count, + 0, + bus, + LocalPipeline::new(), + ); + + let last_header = journal + .last_op() + .and_then(|op| usize::try_from(op).ok()) + .and_then(|op| journal.header(op).map(|header| *header)); + if let Some(header) = last_header { + consensus.set_view(header.view); + } + + consensus.init(); + consensus.sequencer().set_sequence(restored_op); + // Known gap: clustered bootstrap does not yet persist a durable commit + // watermark. Until that exists, recovery assumes the not-yet-supported + // case where replicas are not rejoining with divergent commit state. 
+ consensus.restore_commit_state(restored_op, restored_op); + if let Some(header) = last_header { + consensus.set_last_prepare_checksum(header.checksum); + } + + consensus +} + const fn validate_recovered_namespace( config: &ServerNgConfig, stream_id: usize, @@ -861,16 +1409,20 @@ fn resolve_tcp_topology( let default_quic_addr = resolve_optional_listener_addr(config.quic.enabled, "quic.address", &config.quic.address)?; if !config.cluster.enabled { - if let Some(replica_id) = current_replica_id { - warn!( - replica_id, - "cluster is disabled, ignoring --replica-id for single-node server-ng startup" - ); + if let Some(replica_id) = current_replica_id + && replica_id != SHARD_REPLICA_ID + { + return Err(ServerNgError::ReplicaIdRequiresCluster { + supplied: replica_id, + default: SHARD_REPLICA_ID, + }); } return Ok(TcpTopology { // Keep parity with the current server binary and the integration - // harness: `--replica-id` may be passed unconditionally, but in - // single-node mode there is only replica 0. + // harness: `--replica-id 0` may be passed unconditionally in + // single-node mode; any other id is rejected above so the WAL + // cannot commit under an identity that will later disagree with + // a cluster.nodes[] entry. self_replica_id: SHARD_REPLICA_ID, replica_count: 1, client_listen_addr: default_client_addr, @@ -1209,13 +1761,6 @@ async fn start_manual_runtime( Ok(()) } -fn make_replica_message_handler(shard: &Rc) -> MessageHandler { - let shard = Rc::clone(shard); - Rc::new(move |_replica_id, message| { - shard.dispatch(message); - }) -} - fn make_client_request_handler(shard: &Rc) -> RequestHandler { let shard = Rc::clone(shard); Rc::new(move |client_id, message| { @@ -1271,87 +1816,73 @@ fn upgrade_shard_handle(shard_handle: &ServerNgShardHandle) -> Option, - on_message: MessageHandler, +/// Replica accept callback that ships every inbound connection through +/// the shard-0 coordinator's round-robin fd-delegation. 
The fd lands on +/// the target shard's inbox as a [`shard::LifecycleFrame::ReplicaConnectionSetup`] +/// frame and is installed on that shard's bus. +fn make_delegating_replica_accept_fn( + coord: Rc, ) -> AcceptedReplicaFn { - let bus = Rc::clone(bus); - Rc::new(move |stream, peer_id| { - installer::install_replica_tcp(&bus, peer_id, stream, on_message.clone()); - }) + Rc::new( + move |stream, peer_id| match coord.delegate_replica(stream, peer_id) { + Ok(target) => { + info!(peer_id, target, "replica connection delegated"); + } + Err(error) => { + warn!( + peer_id, + error = ?error, + "delegate_replica failed; dropping inbound replica connection" + ); + } + }, + ) } -fn make_local_client_accept_fns( +/// Shard-0 client accept callbacks. TCP and WS clients are delegated via +/// the coordinator (round-robin to peer shards); QUIC and TCP-TLS install +/// locally on shard 0 because their per-connection state is not portable +/// across shards (`compio_quic` endpoint binds one UDP socket; rustls TLS +/// state ties to the post-handshake reactor). 
+fn make_shard_zero_client_accept_fns( + coord: Rc, bus: &Rc, on_request: RequestHandler, ) -> LocalClientAcceptFns { - let tcp_bus = Rc::clone(bus); - let ws_bus = Rc::clone(bus); let quic_bus = Rc::clone(bus); let tcp_tls_bus = Rc::clone(bus); - let tcp_request = on_request.clone(); - let ws_request = on_request.clone(); let quic_request = on_request.clone(); let tcp_tls_request = on_request; - let counter = Rc::new(Cell::new(1_u128)); - let shard_id = u128::from(bus.shard_id()); - - let tcp_counter = Rc::clone(&counter); - let tcp = Rc::new(move |stream| { - let Some(meta) = client_meta_from_stream( - &stream, - tcp_counter.as_ref(), - shard_id, - ClientTransportKind::Tcp, - ) else { - return; - }; - installer::install_client_tcp(&tcp_bus, meta, stream, tcp_request.clone()); + + let tcp_coord = Rc::clone(&coord); + let tcp = Rc::new(move |stream| match tcp_coord.delegate_client(stream) { + Ok(client_id) => info!(client_id, "TCP client delegated"), + Err(error) => warn!(error = ?error, "delegate_client failed; dropping TCP client"), }); - let ws_counter = Rc::clone(&counter); - let ws = Rc::new(move |stream| { - let Some(meta) = client_meta_from_stream( - &stream, - ws_counter.as_ref(), - shard_id, - ClientTransportKind::Ws, - ) else { - return; - }; - let fd = match fd_transfer::dup_fd(&stream) { - Ok(fd) => fd, - Err(error) => { - warn!( - client_id = meta.client_id, - error = %error, - "dropping accepted websocket client after fd duplication failure" - ); - return; - } - }; - ws_bus.install_client_ws_fd(fd, meta, ws_request.clone()); + let ws_coord = Rc::clone(&coord); + let ws = Rc::new(move |stream| match ws_coord.delegate_ws_client(stream) { + Ok(client_id) => info!(client_id, "WS client delegated"), + Err(error) => warn!(error = ?error, "delegate_ws_client failed; dropping WS client"), }); - let quic_counter = Rc::clone(&counter); + // QUIC and TCP-TLS terminate locally on shard 0 but mint their client + // ids through the coordinator's `client_seq`, the 
same counter the + // delegated TCP/WS path uses. A separate counter here would let a + // shard-0-local id collide with a delegated id that round-robined to + // shard 0 (both encode target shard 0) in shard 0's connection + // registry. + let quic_coord = Rc::clone(&coord); let quic = Rc::new(move |accepted: message_bus::AcceptedQuicConn| { - let meta = mint_client_meta( - quic_counter.as_ref(), - shard_id, - accepted.peer_addr(), - ClientTransportKind::Quic, - ); + let meta = mint_client_meta(&quic_coord, accepted.peer_addr(), ClientTransportKind::Quic); installer::install_client_quic(&quic_bus, meta, accepted, quic_request.clone()); }); - let tcp_tls_counter = Rc::clone(&counter); + let tcp_tls_coord = coord; let tcp_tls = Rc::new(move |stream, tls_config| { - let Some(meta) = client_meta_from_stream( - &stream, - tcp_tls_counter.as_ref(), - shard_id, - ClientTransportKind::TcpTls, - ) else { + let Some(meta) = + client_meta_from_stream(&stream, &tcp_tls_coord, ClientTransportKind::TcpTls) + else { return; }; installer::install_client_tcp_tls( @@ -1373,34 +1904,25 @@ fn make_local_client_accept_fns( fn client_meta_from_stream( stream: &compio::net::TcpStream, - counter: &Cell, - shard_id: u128, + coord: &shard::coordinator::ShardZeroCoordinator, transport: ClientTransportKind, ) -> Option { let peer_addr = match stream.peer_addr() { Ok(peer_addr) => peer_addr, Err(error) => { - let client_id = preview_client_id(counter, shard_id); - warn!(client_id, error = %error, "dropping accepted client with unknown peer address"); + warn!(error = %error, "dropping accepted client with unknown peer address"); return None; } }; - Some(mint_client_meta(counter, shard_id, peer_addr, transport)) -} - -const fn preview_client_id(counter: &Cell, shard_id: u128) -> u128 { - (shard_id << 112) | counter.get() + Some(mint_client_meta(coord, peer_addr, transport)) } fn mint_client_meta( - counter: &Cell, - shard_id: u128, + coord: &shard::coordinator::ShardZeroCoordinator, peer_addr: 
SocketAddr, transport: ClientTransportKind, ) -> ClientConnMeta { - let seq = counter.get(); - counter.set(seq.wrapping_add(1)); - ClientConnMeta::new((shard_id << 112) | seq, peer_addr, transport) + ClientConnMeta::new(coord.mint_shard_zero_client_id(), peer_addr, transport) } async fn start_client_listeners( @@ -1571,3 +2093,109 @@ fn socket_addr_from_parts( })?; Ok(SocketAddr::new(ip, port)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn shutdown_on_drop_armed_flips_flag() { + let flag = Arc::new(AtomicBool::new(false)); + drop(ShutdownOnDrop::new(Arc::clone(&flag))); + assert!( + flag.load(Ordering::Relaxed), + "an armed guard must flip the flag on drop (covers the error `?` \ + and panic-unwind exit paths of run_shard_thread)" + ); + } + + #[test] + fn shutdown_on_drop_disarmed_leaves_flag() { + let flag = Arc::new(AtomicBool::new(false)); + let mut guard = ShutdownOnDrop::new(Arc::clone(&flag)); + guard.disarm(); + drop(guard); + assert!( + !flag.load(Ordering::Relaxed), + "a disarmed guard must not flip the flag (clean `Ok(())` exit)" + ); + } + + #[compio::test] + async fn broadcast_metadata_bundle_returns_immediately_with_no_peers() { + // Single-shard cluster: shard 0 has no peers to fan out to, so + // the handoff must complete without ever calling `send`. + let (bundle_tx, _bundle_rx) = crossfire::mpmc::bounded_async::(0); + let flag = Arc::new(AtomicBool::new(false)); + let mux = ServerNgMuxStateMachine::default(); + broadcast_metadata_bundle(0, &bundle_tx, mux.factory_bundle(), 0, &flag) + .await + .expect("zero peers must not block shard 0"); + } + + #[compio::test] + async fn metadata_bundle_round_trips_through_channel() { + // End-to-end: shard 0 mints a bundle, a peer receives it on + // another runtime, and `from_factory_bundle` constructs a + // reader-mode mux that observes shard 0's writes via the same + // LeftRight pair. 
+ let peers = 1u16; + let (bundle_tx, bundle_rx) = + crossfire::mpmc::bounded_async::(usize::from(peers)); + let flag = Arc::new(AtomicBool::new(false)); + + let owner = ServerNgMuxStateMachine::default(); + let bundle = owner.factory_bundle(); + broadcast_metadata_bundle(0, &bundle_tx, bundle, peers, &flag) + .await + .expect("broadcast must succeed with one peer drained"); + + let received = await_metadata_bundle(1, &bundle_rx, &flag) + .await + .expect("peer must receive the broadcast bundle"); + let _peer_mux = ServerNgMuxStateMachine::from_factory_bundle(received); + } + + #[compio::test] + async fn await_metadata_bundle_aborts_when_owner_drops_without_sending() { + let (bundle_tx, bundle_rx) = crossfire::mpmc::bounded_async::(1); + let flag = Arc::new(AtomicBool::new(false)); + + // Shard 0 dies before broadcasting; the peer must observe the + // disconnect and abort instead of hanging forever. + drop(bundle_tx); + + let err = await_metadata_bundle(1, &bundle_rx, &flag) + .await + .expect_err("a peer whose owner never sends must abort"); + assert!( + matches!(err, ServerNgError::MetadataHandoffAborted { shard_id: 1 }), + "expected MetadataHandoffAborted, got {err:?}" + ); + } + + #[compio::test] + async fn await_metadata_bundle_aborts_on_shutdown_flag() { + let (_bundle_tx, bundle_rx) = crossfire::mpmc::bounded_async::(1); + let flag = Arc::new(AtomicBool::new(false)); + + let waiter = compio::runtime::spawn({ + let flag = Arc::clone(&flag); + async move { await_metadata_bundle(1, &bundle_rx, &flag).await } + }); + + // Owner has not sent yet, but shutdown was requested; the peer + // must exit via the flag poll instead of hanging. 
+ compio::time::sleep(SHUTDOWN_POLL_INTERVAL / 2).await; + flag.store(true, Ordering::Relaxed); + + let err = waiter + .await + .unwrap_or_else(|e| std::panic::resume_unwind(e)) + .expect_err("shutdown flag must abort the bundle wait"); + assert!( + matches!(err, ServerNgError::MetadataHandoffAborted { shard_id: 1 }), + "expected MetadataHandoffAborted on shutdown, got {err:?}" + ); + } +} diff --git a/core/server-ng/src/main.rs b/core/server-ng/src/main.rs index 169f32f65b..e604f4a6ab 100644 --- a/core/server-ng/src/main.rs +++ b/core/server-ng/src/main.rs @@ -23,50 +23,66 @@ mod args; use args::Args; use clap::Parser; -use server_ng::bootstrap::RunServerNg; +use server_ng::bootstrap::{bootstrap, load_config}; use server_ng::server_error::ServerNgError; +use tracing::{error, info}; fn main() -> Result<(), ServerNgError> { - // TODO: decouple runtime creation from the `server` crate and move the shared - // compio executor setup into a lower-level crate/module used by both binaries. - let runtime = match server::bootstrap::create_shard_executor() { + // TODO(hubcio): decouple runtime creation from the `server` crate and + // move the shared compio executor setup into a lower-level crate/module + // used by both binaries. + let bootstrap_runtime = match server::bootstrap::create_shard_executor() { Ok(rt) => rt, Err(e) => { match e.kind() { std::io::ErrorKind::InvalidInput => { - // TODO: decouple io_uring diagnostics from the `server` crate. server::diagnostics::print_invalid_io_uring_args_info(); } std::io::ErrorKind::OutOfMemory => { - // TODO: decouple io_uring diagnostics from the `server` crate. server::diagnostics::print_locked_memory_limit_info(); } std::io::ErrorKind::PermissionDenied => { - // TODO: decouple io_uring diagnostics from the `server` crate. 
server::diagnostics::print_io_uring_permission_info(); } _ => {} } - panic!("Cannot create server-ng executor: {e}"); + panic!("Cannot create server-ng bootstrap executor: {e}"); } }; - runtime.block_on(async { - let args = Args::parse(); - if let Ok(env_path) = std::env::var("IGGY_ENV_PATH") { - let _ = dotenvy::from_path(&env_path); - } else { - let _ = dotenvy::dotenv(); - } - // TODO: decouple logging from the `server` crate and move the shared - // logging bootstrap into a lower-level crate/module. - let mut logging = server::log::logger::Logging::new(); - logging.early_init(); + // Bootstrap on a temporary runtime: parse args, init logging, load + // config, init the memory pool. Then drop the runtime and spawn the + // per-shard runtimes - each shard thread builds its OWN + // `compio::runtime::Runtime` via `create_shard_executor`, pinned to + // its CPU. + let bootstrap_result: Result<(configs::server_ng::ServerNgConfig, Option), ServerNgError> = + bootstrap_runtime.block_on(async { + let args = Args::parse(); + if let Ok(env_path) = std::env::var("IGGY_ENV_PATH") { + let _ = dotenvy::from_path(&env_path); + } else { + let _ = dotenvy::dotenv(); + } + + // TODO: decouple logging from the `server` crate. 
+ let mut logging = server::log::logger::Logging::new(); + logging.early_init(); + + let config = load_config(&mut logging).await?; + iggy_common::MemoryPool::init_pool(&config.system.memory_pool.into_other()); + + Ok((config, args.replica_id)) + }); + let (config, replica_id) = bootstrap_result?; + drop(bootstrap_runtime); - let config = server_ng::bootstrap::load_config(&mut logging).await?; - iggy_common::MemoryPool::init_pool(&config.system.memory_pool.into_other()); + let cluster = bootstrap(config, replica_id)?; + if let Err(error) = cluster.install_ctrlc_handler() { + error!(error = %error, "failed to install Ctrl-C handler; cluster shutdown will not respond to SIGINT"); + } - let shard = server_ng::bootstrap::bootstrap(&config, args.replica_id).await?; - shard.run(&config, args.replica_id).await - }) + info!("server-ng cluster running; waiting on shard threads"); + cluster.join_all()?; + info!("server-ng cluster shutdown complete"); + Ok(()) } diff --git a/core/server-ng/src/server_error.rs b/core/server-ng/src/server_error.rs index 77e17ee1cb..2af78c2586 100644 --- a/core/server-ng/src/server_error.rs +++ b/core/server-ng/src/server_error.rs @@ -20,6 +20,7 @@ use metadata::impls::recovery::RecoveryError; // TODO: decouple logging errors from the `server` crate. 
use server::server_error::LogError; +use server::shard_allocator::ShardingError; use thiserror::Error; #[derive(Debug, Error)] @@ -28,6 +29,42 @@ pub enum ServerNgError { Iggy(Box), #[error("failed to load server-ng config")] Config(#[source] configs::ConfigurationError), + #[error("failed to allocate shards from sharding.cpu_allocation")] + ShardAllocator(#[source] ShardingError), + #[error("failed to bind shard {shard_id} to its CPU set")] + CpuAffinityFailed { + shard_id: u16, + #[source] + source: ShardingError, + }, + #[error("failed to bind shard {shard_id} memory to its NUMA node")] + MemoryAffinityFailed { + shard_id: u16, + #[source] + source: ShardingError, + }, + #[error("failed to spawn OS thread for shard {shard_id}")] + ShardSpawnFailed { + shard_id: u16, + #[source] + source: std::io::Error, + }, + #[error("failed to create io_uring runtime for shard {shard_id}")] + ShardRuntimeCreateFailed { + shard_id: u16, + #[source] + source: std::io::Error, + }, + #[error("shard {shard_id} thread panicked: {message}")] + ShardThreadPanicked { shard_id: u16, message: String }, + #[error( + "computed shards_count = {count} is not a valid cluster size; must be \ + 1..={} (fit in u16 and stay below the OWNER_NONE sentinel)", + message_bus::OWNER_NONE - 1 + )] + InvalidShardsCount { count: usize }, + #[error("system.sharding.inbox_capacity must be in 1..={max}; got {value}")] + InvalidInboxCapacity { value: usize, max: usize }, #[error("failed to serialize current server-ng config")] CurrentConfigSerialize(#[source] toml::ser::Error), #[error("failed to write current server-ng config at {path}")] @@ -40,6 +77,11 @@ pub enum ServerNgError { Logging(#[source] LogError), #[error("failed to recover metadata snapshot and journal")] MetadataRecovery(#[source] RecoveryError), + #[error( + "shard {shard_id} aborted while waiting for shard-0 to broadcast the metadata \ + factory bundle; shard 0 dropped its sender (most likely it failed to recover)" + )] + 
MetadataHandoffAborted { shard_id: u16 }, #[error("failed to parse {context} socket address '{address}'")] SocketAddressParse { context: &'static str, @@ -53,6 +95,12 @@ pub enum ServerNgError { ClusterReplicaCountTooLarge { count: usize }, #[error("cluster mode requires --replica-id to identify the current node")] MissingReplicaId, + #[error( + "--replica-id {supplied} was passed with cluster.enabled=false; the WAL would commit \ + under replica {default} which permanently fixes this node's identity. Either set \ + cluster.enabled=true with a matching nodes[] entry, or drop --replica-id" + )] + ReplicaIdRequiresCluster { supplied: u8, default: u8 }, #[error("cluster node for replica {replica_id} is missing tcp_replica port")] ClusterReplicaPortMissing { replica_id: u8 }, #[error( diff --git a/core/server/config.toml b/core/server/config.toml index 1a58db6c8e..00611bf6ec 100644 --- a/core/server/config.toml +++ b/core/server/config.toml @@ -591,6 +591,17 @@ ports = { tcp = 8091, quic = 8081, http = 3001, websocket = 8093, tcp_replica = # + "numa:nodes=0,1;cores=4;no_ht=true": Use NUMA node 0 and 1, each nodes use 4 cores, and no hyperthreads cpu_allocation = "numa:auto" +# Per-shard inter-shard inbox channel capacity (server-ng only; the legacy +# server uses its own hard-coded sizing). Bounded by design: consensus frame +# drops are recovered by VSR retransmit, but cross-shard client reply drops +# are terminal. Consensus frames and client-reply forwards share this one +# channel, so a consensus burst can starve client replies: size against the +# worst-case consensus working set plus peak client-reply fan-out per shard +# occurring together. Watch the drop-site tracing logs (and the +# frame_drops_total{variant="forward_client_send"} counter once a per-shard +# exporter lands) to detect an under-sized value in production. 
+inbox_capacity = 1024 + [websocket] enabled = true address = "127.0.0.1:8092" diff --git a/core/shard/Cargo.toml b/core/shard/Cargo.toml index 82bdb69f82..985b8098af 100644 --- a/core/shard/Cargo.toml +++ b/core/shard/Cargo.toml @@ -36,6 +36,7 @@ message_bus = { path = "../message_bus" } metadata = { path = "../metadata" } papaya = { workspace = true } partitions = { path = "../partitions" } +prometheus-client = { workspace = true } tracing = { workspace = true } [lints.clippy] diff --git a/core/shard/src/builder.rs b/core/shard/src/builder.rs index b365b99136..d750859431 100644 --- a/core/shard/src/builder.rs +++ b/core/shard/src/builder.rs @@ -16,65 +16,49 @@ // under the License. //! One-stop construction for an [`IggyShard`] plus, on shard 0 only, its -//! associated [`ShardZeroCoordinator`] and periodic mapping-refresh task. +//! associated [`ShardZeroCoordinator`]. //! -//! The coordinator owns the authoritative `replica_id -> owning_shard` -//! snapshot used by the router's `ConnectionLost` fast path and by the -//! refresh task that re-broadcasts the snapshot to recover shards whose -//! inbox was full when an earlier `ReplicaMappingUpdate` was sent. -//! Both the `set_coordinator` call and the refresh-task spawn must happen -//! together: wiring one without the other breaks the drop-recovery story -//! the coordinator's rustdoc promises. +//! The coordinator owns shard-0's round-robin state for replica and +//! client placement. Cluster-wide `replica_id -> owning_shard` routing +//! flows through the cluster-shared [`message_bus::ReplicaOwnerTable`], +//! stamped by the owning shard's installer and CAS-cleared on disconnect. //! -//! Bootstrap code constructs an [`IggyShardBuilder`] per shard and calls -//! [`IggyShardBuilder::build`]. On shard 0 the returned [`BuiltShard`] -//! carries the refresh task's [`JoinHandle`]; bootstrap is responsible for -//! tracking it on the bus' background-task set so graceful shutdown awaits -//! it. 
On non-zero shards the handle is `None` and the coordinator field -//! on the shard stays unset. +//! The coordinator is constructed before the shard and passed to +//! [`IggyShard::new`] as a ctor argument, so an `IggyShard` is fully +//! wired the instant it becomes observable to any reader. -use crate::coordinator::ShardZeroCoordinator; +use crate::coordinator::{ShardZeroCoordinator, classify_try_send_err}; +use crate::metrics::{ShardMetrics, frame_drop_variant}; use crate::{ - CoordinatorConfig, IggyShard, PartitionConsensusConfig, Receiver, ShardFrame, - ShardFramePayload, ShardIdentity, TaggedSender, + CoordinatorConfig, IggyShard, LifecycleFrame, PartitionConsensusConfig, Receiver, ShardFrame, + ShardIdentity, TaggedSender, }; -use compio::runtime::JoinHandle; use consensus::VsrConsensus; use journal::JournalHandle; -use message_bus::MessageBus; use message_bus::client_listener::RequestHandler; -use message_bus::lifecycle::ShutdownToken; use message_bus::replica::listener::MessageHandler; +use message_bus::{MessageBus, SendError}; use metadata::IggyMetadata; use metadata::stm::StateMachine; use partitions::IggyPartitions; use std::rc::Rc; -use tracing::warn; use crate::shards_table::ShardsTable; -/// A freshly constructed [`IggyShard`], paired with the refresh-task -/// [`JoinHandle`] when this is shard 0. Bootstrap tracks the handle on -/// the bus' background tasks so graceful shutdown awaits it. -pub struct BuiltShard +/// A freshly constructed [`IggyShard`]. +pub struct BuiltShard where B: MessageBus, - R: Send + 'static, { - pub shard: IggyShard, - /// `Some` on shard 0, `None` otherwise. Pass to - /// `IggyMessageBus::track_background` (or an equivalent tracker) so - /// the task is awaited on shutdown. - pub refresh_task: Option>, + pub shard: IggyShard, } /// Builder that pairs [`IggyShard`] construction with coordinator wiring /// on shard 0. 
Non-zero shards skip the coordinator entirely; the -/// `coord_config` and `shutdown_token` fields are then ignored. -pub struct IggyShardBuilder +/// `coord_config` field is then ignored. +pub struct IggyShardBuilder where B: MessageBus, - R: Send + 'static, { identity: ShardIdentity, bus: B, @@ -82,26 +66,24 @@ where on_client_request: RequestHandler, metadata: IggyMetadata, MJ, S, M>, partitions: IggyPartitions, - senders: Vec>, - inbox: Receiver>, + senders: Vec, + inbox: Receiver, shards_table: T, partition_consensus: PartitionConsensusConfig, coord_config: CoordinatorConfig, - shutdown_token: ShutdownToken, + metrics: ShardMetrics, } -impl IggyShardBuilder +impl IggyShardBuilder where - B: MessageBus, - R: Send + 'static, + B: MessageBus + Clone + 'static, T: ShardsTable, MJ: JournalHandle, S: Send + 'static, M: StateMachine, { /// Create a builder carrying every input needed by both - /// [`IggyShard::new`] and (for shard 0) `ShardZeroCoordinator::new` - /// plus `spawn_refresh_task`. + /// [`IggyShard::new`] and (for shard 0) `ShardZeroCoordinator::new`. #[allow(clippy::too_many_arguments)] pub fn new( identity: ShardIdentity, @@ -110,12 +92,12 @@ where on_client_request: RequestHandler, metadata: IggyMetadata, MJ, S, M>, partitions: IggyPartitions, - senders: Vec>, - inbox: Receiver>, + senders: Vec, + inbox: Receiver, shards_table: T, partition_consensus: PartitionConsensusConfig, coord_config: CoordinatorConfig, - shutdown_token: ShutdownToken, + metrics: ShardMetrics, ) -> Self { Self { identity, @@ -129,15 +111,14 @@ where shards_table, partition_consensus, coord_config, - shutdown_token, + metrics, } } /// Consume the builder and produce a fully wired [`BuiltShard`]. On - /// shard 0 this constructs the coordinator, attaches it to the shard, - /// and spawns the periodic refresh task. On any other shard the - /// coordinator-specific fields are dropped and `refresh_task` is - /// `None`. 
+ /// shard 0 this constructs the coordinator and passes it to the shard + /// ctor. On any other shard the coordinator-specific fields are + /// dropped. /// /// # Panics /// @@ -145,49 +126,84 @@ where /// (`senders[i].shard_id() != i`) or if `senders.len()` does not /// fit in `u16`. Both are bootstrap programming errors and the /// `u16` overflow check fires on every shard, not only shard 0. - pub fn build(self) -> BuiltShard { + pub fn build(self) -> BuiltShard { let is_shard_zero = self.identity.id == 0; let total_shards = u16::try_from(self.senders.len()) .expect("cluster shard count must fit in u16 (bootstrap invariant)"); - // Wire the bus' connection-lost notifier to push a ConnectionLost - // frame into shard 0's inbox. Every shard's bus needs this so that - // a dead replica connection anywhere surfaces at the coordinator. - // Shard 0's sender is cloned once here; the closure captures that - // clone and runs on the owning shard's reader/writer tasks. - let shard_zero_sender = self.senders[0].sender().clone(); - let connection_lost_fn: message_bus::ConnectionLostFn = Rc::new(move |replica_id: u8| { - let frame = ShardFrame:: { - payload: ShardFramePayload::ConnectionLost { replica_id }, - response_sender: None, - }; - if let Err(e) = shard_zero_sender.try_send(frame) { - // Inbox full: the coordinator's periodic refresh task - // re-broadcasts the authoritative snapshot and the bus - // retries on next dial, so a single drop is recoverable. - // Warn so operators see repeated drops if they occur. - warn!( - replica_id, - "shard-0 inbox rejected ConnectionLost frame: {e}" - ); - } - }); - self.bus.set_connection_lost_fn(connection_lost_fn); + // Install the cross-shard forward closures. 
The bus' slow path + // (`send_to_replica` / `send_to_client`) calls these when the + // destination connection lives on a different shard; the closure + // wraps the message into a `LifecycleFrame::Forward{Replica, + // Client}Send` and `try_send`s it to the owning shard's inbox. + // The receiving shard's router unwraps the frame and re-enters + // the bus' fast path locally. The `senders` Vec is canonically + // ordered (`senders[i].shard_id() == i`). + debug_assert_eq!( + self.senders[0].shard_id(), + 0, + "shard-0 sender slot misordered: senders[0] carries shard_id={}", + self.senders[0].shard_id() + ); + let senders_for_replica: Rc> = Rc::new(self.senders.clone()); + let senders_for_client = Rc::clone(&senders_for_replica); + let metrics_for_replica = self.metrics.clone(); + let metrics_for_client = self.metrics.clone(); + self.bus + .set_replica_forward_fn(Box::new(move |replica_id, owning_shard, msg| { + senders_for_replica.get(usize::from(owning_shard)).map_or( + Err(SendError::ReplicaForwardFailed(replica_id)), + |sender| { + let frame = ShardFrame::lifecycle(LifecycleFrame::ForwardReplicaSend { + replica_id, + msg, + }); + sender.try_send(frame).map_err(|err| { + metrics_for_replica.record_frame_drop( + frame_drop_variant::FORWARD_REPLICA_SEND, + classify_try_send_err(&err), + ); + SendError::ReplicaForwardFailed(replica_id) + }) + }, + ) + })); + self.bus + .set_client_forward_fn(Box::new(move |client_id, owning_shard, msg| { + senders_for_client.get(usize::from(owning_shard)).map_or( + Err(SendError::ClientForwardFailed(client_id)), + |sender| { + let frame = ShardFrame::lifecycle(LifecycleFrame::ForwardClientSend { + client_id, + msg, + }); + sender.try_send(frame).map_err(|err| { + metrics_for_client.record_frame_drop( + frame_drop_variant::FORWARD_CLIENT_SEND, + classify_try_send_err(&err), + ); + SendError::ClientForwardFailed(client_id) + }) + }, + ) + })); - let coord_inputs = if is_shard_zero { - let coord_senders: Vec> = self.senders.clone(); - 
Some(( + // On shard 0: build the coordinator BEFORE the shard, so the + // coordinator can be passed to `IggyShard::new` as a ctor arg. + let coordinator = if is_shard_zero { + let coord_senders: Vec = self.senders.clone(); + Some(Rc::new(ShardZeroCoordinator::new( Rc::new(coord_senders), total_shards, - self.coord_config, - self.shutdown_token, - )) + self.coord_config.clone(), + self.metrics.clone(), + ))) } else { None }; - let mut shard = IggyShard::new( + let shard = IggyShard::new( self.identity, self.bus, self.on_replica_message, @@ -198,20 +214,10 @@ where self.inbox, self.shards_table, self.partition_consensus, + coordinator, + self.metrics, ); - let refresh_task = if let Some((senders_rc, total_shards, cfg, token)) = coord_inputs { - let refresh_period = cfg.refresh_period; - let coord = Rc::new(ShardZeroCoordinator::new(senders_rc, total_shards, cfg)); - shard.set_coordinator(Rc::clone(&coord)); - Some(coord.spawn_refresh_task(token, refresh_period)) - } else { - None - }; - - BuiltShard { - shard, - refresh_task, - } + BuiltShard { shard } } } diff --git a/core/shard/src/config.rs b/core/shard/src/config.rs index 5165828342..27535c2e9a 100644 --- a/core/shard/src/config.rs +++ b/core/shard/src/config.rs @@ -23,19 +23,9 @@ //! from `ServerConfig` lands. Kept in-crate for now to avoid churning //! the configs crate ahead of that wiring. -use std::time::Duration; - /// Tunables for [`crate::coordinator::ShardZeroCoordinator`]. #[derive(Debug, Clone)] pub struct CoordinatorConfig { - /// Cadence at which shard 0 re-broadcasts its authoritative - /// `replica_id -> owning_shard` snapshot. A shard whose inbox was - /// full when the original `ReplicaMappingUpdate` was sent recovers - /// its mapping on the next tick rather than staying silently stale. - /// Must be comfortably shorter than the VSR view-change timeout so - /// a missed mapping cannot trigger a spurious view change. 
- pub refresh_period: Duration, - /// When `total_shards > 1`, exclude shard 0 from replica placement. /// Shard 0 already hosts the coordinator, the metadata writer, and /// both listeners; replicas are long-lived steady flows, so offload @@ -52,7 +42,6 @@ pub struct CoordinatorConfig { impl Default for CoordinatorConfig { fn default() -> Self { Self { - refresh_period: Duration::from_secs(10), skip_shard_zero_for_replicas: true, skip_shard_zero_for_clients: false, } diff --git a/core/shard/src/coordinator.rs b/core/shard/src/coordinator.rs index bcf69f6a21..cdbd1df03f 100644 --- a/core/shard/src/coordinator.rs +++ b/core/shard/src/coordinator.rs @@ -23,12 +23,16 @@ //! //! 1. picks the next target shard via round-robin, //! 2. duplicates the TCP fd, -//! 3. sends a `ShardFramePayload::{Replica,Client}ConnectionSetup` frame to +//! 3. sends a `LifecycleFrame::{Replica,Client}ConnectionSetup` frame to //! the target shard's inbox, //! 4. drops its own `TcpStream` so only the target shard's wrapped fd -//! keeps the socket alive, -//! 5. broadcasts the resulting replica -> owning shard mapping so every -//! bus' `send_to_replica` slow path can route to the owner. +//! keeps the socket alive. +//! +//! The cluster-shared [`message_bus::ReplicaOwnerTable`] is the +//! authoritative `replica_id -> owning_shard` view; the owning shard +//! stamps and CAS-clears its slot synchronously through +//! `mark_replica_owned` / `clear_replica_owned`, so the coordinator does +//! not need a separate mapping cache or broadcast subsystem. //! //! Client ids encode the owning shard in their top 16 bits, so clients do //! not need a mapping table; any shard can route a client reply from the @@ -40,74 +44,39 @@ //! dropped connection. 
use crate::config::CoordinatorConfig; -use crate::{ShardFrame, ShardFramePayload, TaggedSender, assert_sender_ordering}; +use crate::metrics::{frame_drop_reason, frame_drop_variant}; +use crate::{LifecycleFrame, ShardFrame, TaggedSender, assert_sender_ordering}; use compio::net::TcpStream; -use compio::runtime::JoinHandle; use message_bus::installer::conn_info::{ClientConnMeta, ClientTransportKind}; -use message_bus::lifecycle::ShutdownToken; use message_bus::{SendError, fd_transfer}; -use std::cell::{Cell, RefCell}; +use std::cell::Cell; use std::rc::Rc; -use std::time::Duration; -use tracing::{debug, warn}; +use tracing::warn; /// Coordinator owned by shard 0 only. /// /// Wrapped in `Rc` by the bootstrap and shared with the replica listener, /// the connector, and the client listener so each of those paths can /// delegate immediately. -pub struct ShardZeroCoordinator { +pub struct ShardZeroCoordinator { /// Inter-shard channel senders, indexed by shard id. /// /// Each [`TaggedSender`] carries the id of the shard whose paired - /// receiver drains it, and the ctor asserts `senders[i].shard_id() == i`. - /// The previous plain-`Sender` form was a silent-misroute hazard: a Vec - /// with correct length but permuted ordering had no way to be caught. - senders: Rc>>, + /// receiver drains it; the ctor asserts `senders[i].shard_id() == i` + /// so a permuted Vec is caught at boot rather than misrouting frames. + senders: Rc>, + /// Per-shard observability handle. Cloned at construction; bumps the + /// `frame_drops_total{variant=fd_transfer}` counter when a `try_send` + /// rejection occurs in the lifecycle path. + metrics: crate::metrics::ShardMetrics, total_shards: u16, cfg: CoordinatorConfig, replica_rr: Cell, client_rr: Cell, client_seq: Cell, - /// Authoritative `replica_id -> owning_shard` view as understood by the - /// coordinator. 
Populated on successful `delegate_replica`, transitioned - /// to [`MappingSlot::Cleared`] by [`forget_mapping`](Self::forget_mapping) - /// when a replica dies. The periodic refresh task re-broadcasts this - /// snapshot so shards whose inbox was full when the original - /// `ReplicaMappingUpdate` or `ReplicaMappingClear` was sent recover on - /// the next tick rather than staying silently stale until the replica - /// reconnects. - /// - /// Three-state slot is what makes refresh complete: `Untouched` slots - /// emit nothing (the cluster never saw that replica id), `Active` slots - /// re-emit `ReplicaMappingUpdate`, and `Cleared` slots re-emit - /// `ReplicaMappingClear`. A two-state `Option` would silently lose - /// clears against shards whose inbox was full at clear-time. - /// - /// Flat `[MappingSlot; 256]` to mirror `ReplicaRegistry` and avoid a - /// hash lookup on the refresh hot path. - mappings: RefCell<[MappingSlot; 256]>, } -/// Per-replica mapping state tracked by the coordinator. -/// -/// `Cleared` is structurally distinct from `Untouched` so the refresh task -/// can re-broadcast `ReplicaMappingClear` for replicas the cluster did once -/// see but is no longer routing to. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] -enum MappingSlot { - /// Coordinator has never delegated this replica id. - #[default] - Untouched, - /// Replica is connected and routed via `owning_shard`. - Active(u16), - /// Replica was once active and is now disconnected. Refresh re-broadcasts - /// a `ReplicaMappingClear` for this slot until a fresh `delegate_replica` - /// flips it back to `Active`. - Cleared, -} - -impl ShardZeroCoordinator { +impl ShardZeroCoordinator { /// # Panics /// /// Panics if `senders.len() != total_shards`, if `total_shards < 1`, or @@ -116,9 +85,10 @@ impl ShardZeroCoordinator { /// invariant from a doc comment to a ctor assertion. 
#[must_use] pub fn new( - senders: Rc>>, + senders: Rc>, total_shards: u16, cfg: CoordinatorConfig, + metrics: crate::metrics::ShardMetrics, ) -> Self { assert_eq!( senders.len(), @@ -129,12 +99,12 @@ impl ShardZeroCoordinator { assert_sender_ordering(&senders); Self { senders, + metrics, total_shards, cfg, replica_rr: Cell::new(0), client_rr: Cell::new(0), client_seq: Cell::new(1), - mappings: RefCell::new([MappingSlot::Untouched; 256]), } } @@ -170,11 +140,28 @@ impl ShardZeroCoordinator { (u128::from(target_shard) << 112) | seq } + /// Mint a client id for a connection that terminates locally on shard 0 + /// (QUIC, TCP-TLS) instead of being round-robin delegated. + /// + /// Shares the coordinator's `client_seq` counter with the delegated + /// TCP/WS path. Both a shard-0-local connection and a delegated + /// connection that round-robined to shard 0 encode `target_shard = 0` + /// in the top 16 bits; drawing from separate counters would let them + /// mint the same id and collide in shard 0's single connection + /// registry. One counter keeps every shard-0 id distinct. + #[must_use] + pub fn mint_shard_zero_client_id(&self) -> u128 { + self.mint_client_id(0) + } + /// Ship a replica TCP connection to the next round-robin target shard. /// - /// On success broadcasts a `ReplicaMappingUpdate` to every shard and - /// returns `Ok(target_shard)`. On inter-shard channel failure closes - /// the duplicated fd and returns `Err(SendError::RoutingFailed)`. + /// Returns `Ok(target_shard)` on a successful `try_send`. The owning + /// shard records itself in the cluster-shared + /// [`message_bus::ReplicaOwnerTable`] once its installer accepts the + /// fd, so no extra broadcast or mapping cache is needed here. On + /// inter-shard channel failure closes the duplicated fd and returns + /// `Err(SendError::RoutingFailed)`. 
/// /// # Errors /// @@ -184,11 +171,13 @@ impl ShardZeroCoordinator { let target = self.next_replica_target(); let fd = fd_transfer::dup_fd(&stream).map_err(SendError::DupFailed)?; - let setup = ShardFramePayload::ReplicaConnectionSetup { fd, replica_id }; + let setup = LifecycleFrame::ReplicaConnectionSetup { fd, replica_id }; if let Err(e) = self.senders[target as usize].try_send(ShardFrame::lifecycle(setup)) { // The frame (and the `DupedFd` inside) is returned in `e` and // dropped at end-of-block, which closes the dup. No explicit // `close_fd` needed. + self.metrics + .record_frame_drop(frame_drop_variant::FD_TRANSFER, classify_try_send_err(&e)); warn!( replica_id, target, "delegate_replica try_send failed: {e:?}" @@ -200,35 +189,9 @@ impl ShardZeroCoordinator { // the socket open. drop(stream); - // Record the mapping on the coordinator's authoritative snapshot - // BEFORE broadcasting: if `broadcast_mapping_update` drops a frame - // on a full inbox, the periodic refresh task reads this snapshot - // and re-broadcasts. - self.mappings.borrow_mut()[usize::from(replica_id)] = MappingSlot::Active(target); - self.broadcast_mapping_update(replica_id, target); - Ok(target) } - fn broadcast_mapping_update(&self, replica_id: u8, owning_shard: u16) { - // Broadcast to every shard (including shard 0 and the owner - // itself). Drops on full inbox are tolerable: the owning shard - // already holds the fd and the periodic refresh task reconciles - // any missed mapping on its next tick. - for sender in self.senders.iter() { - let update = ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - }; - if let Err(e) = sender.try_send(ShardFrame::lifecycle(update)) { - debug!( - shard_id = sender.shard_id(), - replica_id, "mapping update try_send failed: {e:?}" - ); - } - } - } - /// Ship a client TCP connection to the next round-robin target shard. /// /// On success returns the minted client id. 
On failure closes the @@ -247,9 +210,11 @@ impl ShardZeroCoordinator { let fd = fd_transfer::dup_fd(&stream).map_err(SendError::DupFailed)?; let meta = ClientConnMeta::new(client_id, peer_addr, ClientTransportKind::Tcp); - let setup = ShardFramePayload::ClientConnectionSetup { fd, meta }; + let setup = LifecycleFrame::ClientConnectionSetup { fd, meta }; if let Err(e) = self.senders[target as usize].try_send(ShardFrame::lifecycle(setup)) { // The returned frame owns the `DupedFd` and closes it on drop. + self.metrics + .record_frame_drop(frame_drop_variant::FD_TRANSFER, classify_try_send_err(&e)); warn!(client_id, target, "delegate_client try_send failed: {e:?}"); return Err(SendError::RoutingFailed(target)); } @@ -262,7 +227,7 @@ impl ShardZeroCoordinator { /// round-robin target shard. /// /// Identical wire path to [`Self::delegate_client`] but ships - /// [`ShardFramePayload::ClientWsConnectionSetup`] so the receiving + /// [`LifecycleFrame::ClientWsConnectionSetup`] so the receiving /// shard runs `compio_ws::accept_async_with_config` before /// installing the connection. The fd at ship-time is plain TCP; /// the WS state machine only materialises post-upgrade on the @@ -280,8 +245,10 @@ impl ShardZeroCoordinator { let fd = fd_transfer::dup_fd(&stream).map_err(SendError::DupFailed)?; let meta = ClientConnMeta::new(client_id, peer_addr, ClientTransportKind::Ws); - let setup = ShardFramePayload::ClientWsConnectionSetup { fd, meta }; + let setup = LifecycleFrame::ClientWsConnectionSetup { fd, meta }; if let Err(e) = self.senders[target as usize].try_send(ShardFrame::lifecycle(setup)) { + self.metrics + .record_frame_drop(frame_drop_variant::FD_TRANSFER, classify_try_send_err(&e)); warn!( client_id, target, "delegate_ws_client try_send failed: {e:?}" @@ -293,84 +260,21 @@ impl ShardZeroCoordinator { Ok(client_id) } - /// Broadcast a `ReplicaMappingClear` to every shard. Used by the - /// `ConnectionLost` handler before the next `delegate_replica` runs. 
- pub fn broadcast_mapping_clear(&self, replica_id: u8) { - crate::broadcast_mapping_clear(&self.senders, replica_id); - } - - /// Transition the coordinator's authoritative entry for `replica_id` - /// to [`MappingSlot::Cleared`] so the periodic refresh task re-broadcasts - /// `ReplicaMappingClear` until a fresh delegation revives the slot. A - /// shard whose inbox was full at the original clear-time recovers on the - /// next refresh tick rather than retaining the stale `Active(owner)` - /// view forever. - /// - /// Call from shard 0's `ConnectionLost` handler paired with - /// [`broadcast_mapping_clear`](Self::broadcast_mapping_clear). - pub fn forget_mapping(&self, replica_id: u8) { - self.mappings.borrow_mut()[usize::from(replica_id)] = MappingSlot::Cleared; - } - - /// Re-broadcast every mapping the coordinator is tracking. - /// - /// `delegate_replica` and `forget_mapping` broadcast their respective - /// `ReplicaMappingUpdate` / `ReplicaMappingClear` as best-effort - /// `try_send`: a shard whose inbox was full at the moment of the - /// original broadcast permanently lost the change. Without periodic - /// refresh, an `Update` miss leaves `send_to_replica` returning - /// `ReplicaNotConnected` until the peer reconnects, and a `Clear` miss - /// leaves a stale `owning_shard` view that misroutes traffic to a shard - /// that no longer holds the connection. This refresh closes both gaps. - pub fn broadcast_mapping_snapshot(&self) { - // Stage entries in scratch `Vec`s so the `RefCell` borrow is - // released before any `try_send` runs, matching the borrow - // discipline of the other mapping paths. Heap allocation is - // acceptable: this snapshot runs on a periodic refresh tick, - // not on the per-message hot path. 
- let mut updates: Vec<(u8, u16)> = Vec::new(); - let mut clears: Vec = Vec::new(); - for (idx, slot) in self.mappings.borrow().iter().enumerate() { - #[allow(clippy::cast_possible_truncation)] - let replica_id = idx as u8; - match *slot { - MappingSlot::Active(owner) => updates.push((replica_id, owner)), - MappingSlot::Cleared => clears.push(replica_id), - MappingSlot::Untouched => {} - } - } - for (replica_id, owning_shard) in updates { - self.broadcast_mapping_update(replica_id, owning_shard); - } - for replica_id in clears { - crate::broadcast_mapping_clear(&self.senders, replica_id); - } - } - - /// Spawn a compio task that calls [`Self::broadcast_mapping_snapshot`] - /// every `period` until `token` fires. Bootstrap is expected to track - /// the returned handle on the bus's background tasks so graceful - /// shutdown awaits it. - pub fn spawn_refresh_task( - self: &Rc, - token: ShutdownToken, - period: Duration, - ) -> JoinHandle<()> { - let coord = Rc::clone(self); - compio::runtime::spawn(async move { - while token.sleep_or_shutdown(period).await { - coord.broadcast_mapping_snapshot(); - } - debug!("coordinator mapping refresh task exiting"); - }) - } - #[must_use] pub const fn total_shards(&self) -> u16 { self.total_shards } } +/// Map `crossfire::TrySendError` to the `frame_drop_reason` label set so +/// every drop site picks the same string. +pub(crate) const fn classify_try_send_err(err: &crossfire::TrySendError) -> &'static str { + match err { + crossfire::TrySendError::Full(_) => frame_drop_reason::FULL, + crossfire::TrySendError::Disconnected(_) => frame_drop_reason::DISCONNECTED, + } +} + /// Advance `counter` and return the next target shard. 
/// /// When `skip_zero` is true and `total_shards > 1`, wraps over @@ -394,7 +298,7 @@ mod tests { fn build_senders(total: u16) -> Rc> { let mut senders = Vec::with_capacity(total as usize); for shard_id in 0..total { - let (tx, _rx) = crate::shard_channel::<()>(shard_id, 16); + let (tx, _rx) = crate::shard_channel(shard_id, 16); senders.push(tx); } Rc::new(senders) @@ -406,7 +310,7 @@ mod tests { let mut senders = Vec::with_capacity(total as usize); let mut receivers = Vec::with_capacity(total as usize); for shard_id in 0..total { - let (tx, rx) = crate::shard_channel::<()>(shard_id, 16); + let (tx, rx) = crate::shard_channel(shard_id, 16); senders.push(tx); receivers.push(rx); } @@ -418,19 +322,29 @@ mod tests { fn ctor_rejects_permuted_sender_vec() { // Build senders in correct order, then swap two entries so the // indexed position no longer matches the tagged shard id. - let (tx0, _rx0) = crate::shard_channel::<()>(0, 16); - let (tx1, _rx1) = crate::shard_channel::<()>(1, 16); - let (tx2, _rx2) = crate::shard_channel::<()>(2, 16); - let (tx3, _rx3) = crate::shard_channel::<()>(3, 16); + let (tx0, _rx0) = crate::shard_channel(0, 16); + let (tx1, _rx1) = crate::shard_channel(1, 16); + let (tx2, _rx2) = crate::shard_channel(2, 16); + let (tx3, _rx3) = crate::shard_channel(3, 16); let permuted = Rc::new(vec![tx0, tx2, tx1, tx3]); - let _coord = ShardZeroCoordinator::<()>::new(permuted, 4, CoordinatorConfig::default()); + let _coord = ShardZeroCoordinator::new( + permuted, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); } #[test] fn replica_rr_default_skips_shard_zero() { // Default config: replicas skip shard 0, clients include it. let senders = build_senders(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); // Replicas wrap over [1, 4). 
assert_eq!(coord.next_replica_target(), 1); @@ -453,9 +367,9 @@ mod tests { let cfg = CoordinatorConfig { skip_shard_zero_for_replicas: false, skip_shard_zero_for_clients: false, - ..CoordinatorConfig::default() }; - let coord = ShardZeroCoordinator::<()>::new(senders, 4, cfg); + let coord = + ShardZeroCoordinator::new(senders, 4, cfg, crate::metrics::ShardMetrics::for_shard(0)); assert_eq!(coord.next_replica_target(), 0); assert_eq!(coord.next_replica_target(), 1); @@ -473,9 +387,9 @@ mod tests { let cfg = CoordinatorConfig { skip_shard_zero_for_replicas: true, skip_shard_zero_for_clients: true, - ..CoordinatorConfig::default() }; - let coord = ShardZeroCoordinator::<()>::new(senders, 4, cfg); + let coord = + ShardZeroCoordinator::new(senders, 4, cfg, crate::metrics::ShardMetrics::for_shard(0)); for _ in 0..8 { let r = coord.next_replica_target(); @@ -491,9 +405,9 @@ mod tests { let cfg = CoordinatorConfig { skip_shard_zero_for_replicas: true, skip_shard_zero_for_clients: true, - ..CoordinatorConfig::default() }; - let coord = ShardZeroCoordinator::<()>::new(senders, 1, cfg); + let coord = + ShardZeroCoordinator::new(senders, 1, cfg, crate::metrics::ShardMetrics::for_shard(0)); for _ in 0..4 { assert_eq!(coord.next_replica_target(), 0); assert_eq!(coord.next_client_target(), 0); @@ -503,18 +417,55 @@ mod tests { #[test] fn mint_client_id_encodes_target_shard() { let senders = build_senders(8); - let coord = ShardZeroCoordinator::<()>::new(senders, 8, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 8, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); let id = coord.mint_client_id(5); assert_eq!((id >> 112) as u16, 5); assert_eq!(id & ((1u128 << 112) - 1), 1, "first seq is 1"); } + #[test] + fn shard_zero_local_and_delegated_ids_never_collide() { + let senders = build_senders(4); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + 
crate::metrics::ShardMetrics::for_shard(0), + ); + + // Interleave shard-0-local mints (QUIC / TCP-TLS path) with + // delegated mints that round-robin to shard 0. Both encode target + // shard 0 in the top 16 bits; the shared counter keeps every id + // distinct so they cannot overwrite each other in shard 0's + // connection registry. + let mut ids = std::collections::HashSet::new(); + for _ in 0..8 { + assert!(ids.insert(coord.mint_shard_zero_client_id())); + assert!(ids.insert(coord.mint_client_id(0))); + } + assert_eq!( + ids.len(), + 16, + "every minted shard-0 client id must be unique" + ); + } + #[compio::test] #[allow(clippy::future_not_send)] - async fn delegate_replica_sends_setup_then_broadcasts_mapping() { + async fn delegate_replica_ships_setup_to_target_shard() { let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); // Loopback TCP pair so delegate_replica has a real fd to dup. let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); @@ -531,29 +482,25 @@ mod tests { // Target shard should observe ReplicaConnectionSetup. let setup_frame = receivers[target as usize].recv().await.unwrap(); - match setup_frame.payload { - ShardFramePayload::ReplicaConnectionSetup { fd, replica_id } => { + match setup_frame { + ShardFrame::Lifecycle(LifecycleFrame::ReplicaConnectionSetup { fd, replica_id }) => { assert_eq!(replica_id, 7); - // Drop closes the dup'd fd via `DupedFd::Drop`. drop(fd); } _ => panic!("expected ReplicaConnectionSetup on target shard"), } - // Every shard (including shard 0 and the target itself) should - // observe the ReplicaMappingUpdate broadcast. 
+ // Non-target shards must not receive any frame: delegate_replica + // no longer broadcasts a mapping update, the cluster-shared + // owner table covers cross-shard routing. for (idx, rx) in receivers.iter().enumerate() { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - assert_eq!(replica_id, 7, "shard {idx} mapping update replica id"); - assert_eq!(owning_shard, target, "shard {idx} mapping update owner"); - } - _ => panic!("shard {idx} expected ReplicaMappingUpdate"), + if idx == target as usize { + continue; } + assert!( + rx.try_recv().is_err(), + "shard {idx} unexpectedly received a frame after delegate_replica", + ); } } @@ -561,7 +508,12 @@ mod tests { #[allow(clippy::future_not_send)] async fn delegate_client_ships_setup_with_meta_transport_tcp() { let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); @@ -577,8 +529,8 @@ mod tests { ); let setup_frame = receivers[target as usize].recv().await.unwrap(); - match setup_frame.payload { - ShardFramePayload::ClientConnectionSetup { fd, meta } => { + match setup_frame { + ShardFrame::Lifecycle(LifecycleFrame::ClientConnectionSetup { fd, meta }) => { assert_eq!(meta.client_id, client_id); assert!(matches!(meta.transport, ClientTransportKind::Tcp)); drop(fd); @@ -591,7 +543,12 @@ mod tests { #[allow(clippy::future_not_send)] async fn delegate_ws_client_ships_setup_with_meta_transport_ws() { let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + 
CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); @@ -607,8 +564,8 @@ mod tests { ); let setup_frame = receivers[target as usize].recv().await.unwrap(); - match setup_frame.payload { - ShardFramePayload::ClientWsConnectionSetup { fd, meta } => { + match setup_frame { + ShardFrame::Lifecycle(LifecycleFrame::ClientWsConnectionSetup { fd, meta }) => { assert_eq!(meta.client_id, client_id); assert!( matches!(meta.transport, ClientTransportKind::Ws), @@ -620,156 +577,4 @@ mod tests { _ => panic!("expected ClientWsConnectionSetup variant"), } } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn broadcast_mapping_clear_reaches_every_shard() { - let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); - - coord.broadcast_mapping_clear(9); - - for (idx, rx) in receivers.iter().enumerate() { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingClear { replica_id } => { - assert_eq!(replica_id, 9, "shard {idx} mapping clear replica id"); - } - _ => panic!("shard {idx} expected ReplicaMappingClear"), - } - } - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn snapshot_rebroadcasts_every_tracked_mapping() { - let (senders, receivers) = build_senders_with_rx(4); - let coord = Rc::new(ShardZeroCoordinator::<()>::new( - senders, - 4, - CoordinatorConfig::default(), - )); - - // Seed the coordinator's tracked mappings directly (delegate_replica - // needs a real TCP fd; the snapshot path is orthogonal to dup). - coord.mappings.borrow_mut()[3] = MappingSlot::Active(1); - coord.mappings.borrow_mut()[7] = MappingSlot::Active(2); - - coord.broadcast_mapping_snapshot(); - - // Each shard must observe an update for each seeded mapping. 
The - // order across replica ids is unspecified (HashMap->Vec iteration - // on the collected snapshot), so collect and compare as a set. - for (idx, rx) in receivers.iter().enumerate() { - let mut observed = std::collections::BTreeSet::new(); - for _ in 0..2 { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - observed.insert((replica_id, owning_shard)); - } - _ => panic!("shard {idx} expected ReplicaMappingUpdate"), - } - } - let expected: std::collections::BTreeSet<_> = - [(3u8, 1u16), (7u8, 2u16)].into_iter().collect(); - assert_eq!( - observed, expected, - "shard {idx} did not receive the full snapshot" - ); - } - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn forget_mapping_transitions_slot_to_cleared_state() { - let (senders, _receivers) = build_senders_with_rx(2); - let coord = ShardZeroCoordinator::<()>::new(senders, 2, CoordinatorConfig::default()); - - coord.mappings.borrow_mut()[4] = MappingSlot::Active(1); - coord.mappings.borrow_mut()[5] = MappingSlot::Active(1); - coord.forget_mapping(4); - - assert_eq!(coord.mappings.borrow()[4], MappingSlot::Cleared); - assert_eq!(coord.mappings.borrow()[5], MappingSlot::Active(1)); - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn snapshot_includes_clear_for_forgotten_slots() { - let (senders, receivers) = build_senders_with_rx(3); - let coord = Rc::new(ShardZeroCoordinator::<()>::new( - senders, - 3, - CoordinatorConfig::default(), - )); - - // Two live mappings + one slot that was previously delegated and is - // now forgotten. Refresh must rebroadcast a Clear for the forgotten - // slot so a shard that missed the original Clear (full inbox) can - // reconcile its stale Active view on the next tick. 
- coord.mappings.borrow_mut()[3] = MappingSlot::Active(1); - coord.mappings.borrow_mut()[7] = MappingSlot::Active(2); - coord.mappings.borrow_mut()[5] = MappingSlot::Cleared; - - coord.broadcast_mapping_snapshot(); - - for (idx, rx) in receivers.iter().enumerate() { - let mut updates = std::collections::BTreeSet::new(); - let mut clears = std::collections::BTreeSet::new(); - for _ in 0..3 { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - updates.insert((replica_id, owning_shard)); - } - ShardFramePayload::ReplicaMappingClear { replica_id } => { - clears.insert(replica_id); - } - _ => panic!("shard {idx} expected mapping update or clear"), - } - } - - let expected_updates: std::collections::BTreeSet<_> = - [(3u8, 1u16), (7u8, 2u16)].into_iter().collect(); - let expected_clears: std::collections::BTreeSet<_> = std::iter::once(5u8).collect(); - assert_eq!( - updates, expected_updates, - "shard {idx} did not receive both Update frames" - ); - assert_eq!( - clears, expected_clears, - "shard {idx} did not receive a Clear for the forgotten slot" - ); - } - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn snapshot_skips_untouched_slots() { - let (senders, receivers) = build_senders_with_rx(2); - let coord = Rc::new(ShardZeroCoordinator::<()>::new( - senders, - 2, - CoordinatorConfig::default(), - )); - - // No mappings populated: every slot is `Untouched`. Snapshot must - // be silent so the cluster does not pay 256 try_sends per shard per - // refresh tick when nothing has been delegated yet. 
- coord.broadcast_mapping_snapshot(); - - for (idx, rx) in receivers.iter().enumerate() { - assert!( - rx.try_recv().is_err(), - "shard {idx} should not receive any frame from an empty snapshot" - ); - } - } } diff --git a/core/shard/src/lib.rs b/core/shard/src/lib.rs index 2dc7a4942f..d6efa5074a 100644 --- a/core/shard/src/lib.rs +++ b/core/shard/src/lib.rs @@ -18,6 +18,7 @@ pub mod builder; pub mod config; pub mod coordinator; +pub mod metrics; mod router; pub mod shards_table; @@ -105,14 +106,36 @@ pub fn channel(capacity: usize) -> (Sender, Receiver) { /// Bootstrap uses this to build the per-shard sender `Vec` such that /// `vec[i]` necessarily reaches shard `i`. #[must_use] -pub fn shard_channel( - owner_shard: u16, - capacity: usize, -) -> (TaggedSender, Receiver>) { - let (tx, rx) = channel::>(capacity); +pub fn shard_channel(owner_shard: u16, capacity: usize) -> (TaggedSender, Receiver) { + let (tx, rx) = channel::(capacity); (TaggedSender::new(owner_shard, tx), rx) } +/// Build canonical-ordered `(senders, inboxes)` pair for an N-shard cluster. +/// +/// Each `inboxes[i]` drains exclusively on the runtime owning shard `i`. The +/// returned `senders` Vec satisfies `senders[i].shard_id() == i` by +/// construction; clone it into every shard before spawning so all shards +/// share the same mesh. +/// +/// Receivers are wrapped in `Option` because [`Receiver`] (crossfire +/// `AsyncRx`) is non-cloneable on purpose; bootstrap takes the slot for +/// shard `i` exactly once when spawning the owning thread. 
+#[must_use] +pub fn shard_cluster_channels( + total_shards: u16, + capacity: usize, +) -> (Vec, Vec>>) { + let mut senders = Vec::with_capacity(total_shards as usize); + let mut inboxes = Vec::with_capacity(total_shards as usize); + for shard_id in 0..total_shards { + let (tx, rx) = shard_channel(shard_id, capacity); + senders.push(tx); + inboxes.push(Some(rx)); + } + (senders, inboxes) +} + /// A [`Sender`] annotated with the id of the shard whose paired receiver it /// feeds. /// @@ -123,18 +146,18 @@ pub fn shard_channel( /// [`TaggedSender::new`]) at the channel-creation site; the coordinator and /// [`IggyShard`] ctors then assert `senders[i].shard_id() == i`, turning the /// ordering invariant from a doc comment into a ctor-checked type property. -pub struct TaggedSender { +pub struct TaggedSender { shard_id: u16, - inner: Sender>, + inner: Sender, } -impl TaggedSender { +impl TaggedSender { /// Wrap an already-constructed sender with the id of the shard whose /// paired receiver drains it. Prefer [`shard_channel`] unless an /// existing sender is being re-tagged (e.g., tests that build senders /// manually and know the ordering is correct). #[must_use] - pub const fn new(shard_id: u16, inner: Sender>) -> Self { + pub const fn new(shard_id: u16, inner: Sender) -> Self { Self { shard_id, inner } } @@ -144,12 +167,12 @@ impl TaggedSender { } #[must_use] - pub const fn sender(&self) -> &Sender> { + pub const fn sender(&self) -> &Sender { &self.inner } } -impl Clone for TaggedSender { +impl Clone for TaggedSender { fn clone(&self) -> Self { Self { shard_id: self.shard_id, @@ -158,8 +181,8 @@ impl Clone for TaggedSender { } } -impl std::ops::Deref for TaggedSender { - type Target = Sender>; +impl std::ops::Deref for TaggedSender { + type Target = Sender; fn deref(&self) -> &Self::Target { &self.inner @@ -167,13 +190,13 @@ impl std::ops::Deref for TaggedSender { } /// Assert the canonical ordering `senders[i].shard_id() == i`. 
Called from -/// every ctor that accepts a pre-built `Vec>`. +/// every ctor that accepts a pre-built `Vec`. /// /// # Panics /// /// Panics if any sender in `senders` carries a `shard_id` that does not /// match its index. Bootstrap programming error. -fn assert_sender_ordering(senders: &[TaggedSender]) { +fn assert_sender_ordering(senders: &[TaggedSender]) { for (idx, sender) in senders.iter().enumerate() { let expected = u16::try_from(idx).expect("shard count must fit in u16"); assert_eq!( @@ -185,11 +208,22 @@ fn assert_sender_ordering(senders: &[TaggedSender]) { } } -/// Payload carried by a [`ShardFrame`]. +/// Lifecycle frame variants. +/// +/// Connection setup and cross-shard forwards: every frame the inter-shard +/// channel carries that is NOT a consensus protocol message lives here. +/// Splitting these out from [`ShardFrame::Consensus`] keeps the consensus +/// dispatch path hot and cache-tight while leaving lifecycle traffic on +/// the same single channel (preserving relative ordering between consensus +/// and lifecycle frames at near-zero cost). +/// +/// Trade-off: consensus and lifecycle traffic compete for one bounded +/// inbox. A consensus burst or retransmit storm can fill it exactly when +/// a terminal-drop [`LifecycleFrame::ForwardClientSend`] needs the space; +/// `inbox_capacity` is a single knob and cannot isolate the two frame +/// classes. #[non_exhaustive] -pub enum ShardFramePayload { - /// A consensus protocol message routed between shards. - Consensus(Message), +pub enum LifecycleFrame { /// Shard 0 distributes an inbound replica TCP connection fd to the owning /// shard. The receiving shard wraps the fd in a `TcpStream` and spawns /// writer + reader tasks on its own compio runtime. The `fd` is an @@ -221,12 +255,6 @@ pub enum ShardFramePayload { /// Shard 0 therefore terminates QUIC locally and uses the existing /// `ForwardClientSend` variant for outbound traffic. 
ClientWsConnectionSetup { fd: DupedFd, meta: ClientConnMeta }, - /// Shard 0 broadcasts the owner for a replica to every shard so each - /// bus' `send_to_replica` slow path can route through the correct owner. - ReplicaMappingUpdate { replica_id: u8, owning_shard: u16 }, - /// Shard 0 broadcasts that a replica mapping should be forgotten (e.g. - /// after a connection loss and before the next allocate). - ReplicaMappingClear { replica_id: u8 }, /// A non-owning shard forwards a replica send to the owning shard's /// local bus; the owning shard then takes the fast path. ForwardReplicaSend { @@ -239,83 +267,44 @@ pub enum ShardFramePayload { client_id: u128, msg: Frozen, }, - /// Owning shard notifies shard 0 that a replica connection died. - /// Shard 0 clears the mapping cluster-wide and drives a reconnect. - ConnectionLost { replica_id: u8 }, } -/// Envelope for inter-shard channel messages. -/// -/// Wraps a [`ShardFramePayload`] together with an optional one-shot response -/// channel. Fire-and-forget dispatches leave `response_sender` as `None`; -/// request-response dispatches provide a sender that the message pump will -/// notify once the message has been processed. +/// Inter-shard channel envelope. /// -/// The response type `R` is generic so that higher layers (e.g. HTTP handlers) -/// can carry a response enum while the consensus layer can default to `()`. -pub struct ShardFrame { - pub payload: ShardFramePayload, - pub response_sender: Option>, +/// Concrete enum; no generic. Consensus dispatches are fire-and-forget by +/// VSR design (replies travel as their own wire-level messages: `Reply` +/// to clients, `PrepareOk` to the primary), so no response channel rides +/// in the frame. +#[non_exhaustive] +pub enum ShardFrame { + /// A consensus protocol message (Request / Prepare / `PrepareOk` / + /// view-change family / Commit). Fire-and-forget. Drops on full inbox + /// are recovered by VSR retransmit timers. 
+ Consensus(Message), + /// A connection setup or cross-shard forward frame. Drop recovery + /// depends on the frame class: [`LifecycleFrame::ForwardReplicaSend`] + /// is VSR-covered, connection-setup frames are recovered by the + /// connector's periodic reconnect sweep, but + /// [`LifecycleFrame::ForwardClientSend`] is terminal - no retransmit, + /// the client never receives the reply. + Lifecycle(LifecycleFrame), } -impl ShardFrame { - /// Create a fire-and-forget consensus message frame. +impl ShardFrame { + /// Create a consensus frame. #[must_use] - pub const fn fire_and_forget(message: Message) -> Self { - Self { - payload: ShardFramePayload::Consensus(message), - response_sender: None, - } + pub const fn consensus(message: Message) -> Self { + Self::Consensus(message) } - /// Create a request-response consensus message frame. Returns the frame - /// and a receiver that the caller can await for completion notification. + /// Create a lifecycle frame. #[must_use] - pub fn with_response(message: Message) -> (Self, Receiver) { - let (tx, rx) = channel(1); - ( - Self { - payload: ShardFramePayload::Consensus(message), - response_sender: Some(tx), - }, - rx, - ) - } - - /// Create a fire-and-forget lifecycle frame (connection setup, loss). - #[must_use] - pub const fn lifecycle(payload: ShardFramePayload) -> Self { - Self { - payload, - response_sender: None, - } + pub const fn lifecycle(payload: LifecycleFrame) -> Self { + Self::Lifecycle(payload) } } -/// Broadcast a `ReplicaMappingClear` to every shard (including sender-self). -/// -/// Used by shard 0's `ConnectionLost` handler and by -/// [`coordinator::ShardZeroCoordinator::broadcast_mapping_clear`] so both -/// paths go through the same try-send-and-log logic. `try_send` failures -/// (inbox full or disconnected) are logged at debug: the next mapping -/// broadcast or reconnect sweep will reconcile. 
-pub(crate) fn broadcast_mapping_clear( - senders: &[TaggedSender], - replica_id: u8, -) { - for sender in senders { - let clear = ShardFramePayload::ReplicaMappingClear { replica_id }; - if let Err(e) = sender.try_send(ShardFrame::lifecycle(clear)) { - tracing::debug!( - shard_id = sender.shard_id(), - replica_id, - "mapping clear try_send failed: {e:?}" - ); - } - } -} - -pub struct IggyShard +pub struct IggyShard where B: MessageBus, { @@ -324,8 +313,8 @@ where pub plane: ShardPlane, /// Handle to the local bus. Retained alongside the bus owned by every - /// consensus plane so the router can reach the `ConnectionInstaller` / - /// mapping surface without going through consensus. + /// consensus plane so the router can reach the `ConnectionInstaller` + /// surface without going through consensus. pub bus: B, /// Callback attached to every delegated replica connection installed @@ -344,26 +333,33 @@ where /// [`assert_sender_ordering`] is invoked in the ctor so `senders[i]` /// is guaranteed to feed the shard whose `id == i`. Call sites can /// therefore index by `target_shard` without re-checking. - senders: Vec>, + senders: Vec, + + /// Cluster shard count, cached from `senders.len()` at construction. + /// `senders` is immutable post-ctor, so consensus routing reads this + /// rather than recomputing the `usize -> u32` conversion per frame. + shard_count: u32, /// Receiver end of this shard's inbox. Peer shards (and self) send /// messages here via the corresponding sender. - inbox: Receiver>, + inbox: Receiver, /// Partition namespace -> owning shard lookup. shards_table: T, partition_consensus: PartitionConsensusConfig, - /// Shard 0 coordinator, set at bootstrap on shard 0 only. The router's - /// `ConnectionLost` handler calls [`coordinator::ShardZeroCoordinator::forget_mapping`] - /// so the periodic refresh task stops re-broadcasting the mapping of - /// a replica that has disconnected. 
`None` on non-zero shards and in - /// single-shard tests that bypass the coordinator. - coordinator: Option>>, + /// Shard 0 coordinator, supplied at construction. Holds round-robin + /// state for replica and client delegation. `None` on non-zero shards + /// and in single-shard tests that bypass the coordinator. + coordinator: Option>, + + /// Per-shard observability counters. Cloned at metric increment sites, + /// so cheap (`Arc` clone) regardless of label cardinality. + metrics: crate::metrics::ShardMetrics, } -impl IggyShard +impl IggyShard where B: MessageBus, T: ShardsTable, @@ -371,14 +367,20 @@ where /// Create a new shard with channel links and a shards table. /// /// * `bus` - shard-local bus handle (kept alongside the buses owned - /// by the consensus planes so the router can reach installer / - /// mapping operations directly). + /// by the consensus planes so the router can reach the + /// `ConnectionInstaller` surface directly). /// * `senders` - one [`TaggedSender`] per shard. The ctor asserts /// `senders[i].shard_id() == i`; use [`shard_channel`] at /// construction time so every sender carries the id of the shard /// whose receiver drains it. /// * `inbox` - the receiver that this shard drains in its message pump. /// * `shards_table` - namespace -> shard routing table. + /// * `coordinator` - `Some` on shard 0 (supplied by the builder when + /// `is_shard_zero`), `None` everywhere else. Immutable post-ctor: + /// the coordinator is injected at construction time so an + /// `IggyShard` cannot appear half-wired to a reader. + /// * `metrics` - per-shard observability handle; currently the + /// `frame_drops_total` counter. 
/// /// # Panics /// @@ -395,12 +397,15 @@ where on_client_request: RequestHandler, metadata: IggyMetadata, MJ, S, M>, partitions: IggyPartitions, - senders: Vec>, - inbox: Receiver>, + senders: Vec, + inbox: Receiver, shards_table: T, partition_consensus: PartitionConsensusConfig, + coordinator: Option>, + metrics: crate::metrics::ShardMetrics, ) -> Self { assert_sender_ordering(&senders); + let shard_count = u32::try_from(senders.len()).unwrap_or(u32::MAX); let plane = MuxPlane::new(variadic!(metadata, partitions)); let ShardIdentity { id, name } = identity; Self { @@ -411,29 +416,22 @@ where on_replica_message, on_client_request, senders, + shard_count, inbox, shards_table, partition_consensus, - coordinator: None, + coordinator, + metrics, } } - /// Attach a shard-0 coordinator. Bootstrap calls this on the shard 0 - /// `IggyShard` only; other shards keep `coordinator = None`. The - /// router's `ConnectionLost` handler will call - /// [`coordinator::ShardZeroCoordinator::forget_mapping`] via this - /// reference so the periodic refresh stops re-broadcasting dead - /// replica mappings. - pub fn set_coordinator(&mut self, coord: Rc>) { - self.coordinator = Some(coord); - } - - /// `true` when a [`coordinator::ShardZeroCoordinator`] has been attached - /// via [`set_coordinator`](Self::set_coordinator). Shard 0 bootstrap is - /// expected to flip this on; every other shard keeps it `false`. + /// Return a clone of the shard-0 coordinator handle, if attached. + /// Bootstrap uses this to wire the listener accept callbacks + /// (replica + client) to coordinator-driven fd-delegation instead + /// of installing connections locally on shard 0. #[must_use] - pub const fn has_coordinator(&self) -> bool { - self.coordinator.is_some() + pub fn coordinator(&self) -> Option> { + self.coordinator.clone() } /// Create a shard without inter-shard channels or delegated connections. 
@@ -451,9 +449,9 @@ where shards_table: T, partition_consensus: PartitionConsensusConfig, ) -> Self { - // TODO: previously we used unbounded channel with flume, - // but this is not possible with crossfire without mangling types due to Flavor trait in crossfire. - // This needs to be revisited in the future. + // TODO(hubcio): crossfire's Flavor trait blocks unbounded channels + // with the current type setup; revisit when crossfire grows an + // unbounded variant or we replace it. let (_tx, inbox) = channel(1); let plane = MuxPlane::new(variadic!(metadata, partitions)); let ShardIdentity { id, name } = identity; @@ -466,9 +464,14 @@ where plane, coordinator: None, senders: Vec::new(), + // The simulator delivers inbound messages straight to + // `on_message`, bypassing the inter-shard router, so the + // routing-only `shard_count` is never read on this path. + shard_count: 0, inbox, shards_table, partition_consensus, + metrics: crate::metrics::ShardMetrics::for_shard(id), } } @@ -480,7 +483,7 @@ where /// Local message processing — these methods handle messages that have been /// routed to this shard via the message pump. -impl IggyShard +impl IggyShard where B: MessageBus, { @@ -587,7 +590,11 @@ where /// # Panics /// Panics if a loopback message is not a valid `PrepareOk` message. 
#[allow(clippy::future_not_send)] - pub async fn process_loopback(&self, buf: &mut Vec>) -> usize + pub async fn process_loopback( + &self, + buf: &mut Vec>, + namespace_scratch: &mut Vec, + ) -> usize where B: MessageBus, MJ: JournalHandle, @@ -603,6 +610,10 @@ where > + StreamsFrontend, { debug_assert!(buf.is_empty(), "buf must be empty on entry"); + debug_assert!( + namespace_scratch.is_empty(), + "namespace_scratch must be empty on entry", + ); let mut total = 0; let planes = self.plane.inner(); @@ -619,8 +630,8 @@ where } } - let namespaces: Vec<_> = planes.1.0.namespaces().copied().collect(); - for namespace in namespaces { + namespace_scratch.extend(planes.1.0.namespaces().copied()); + for namespace in namespace_scratch.drain(..) { let partition = planes .1 .0 @@ -642,9 +653,17 @@ where /// Initializes a partition and its dedicated consensus instance on this shard. /// + /// Used only by the `simulator` crate to seed partitions on the + /// in-process replica array. Production boots via the + /// `load_partition` helper in `server-ng` bootstrap, which takes the + /// cluster `self_replica_id` explicitly from `topology` and never + /// folds the local shard id into the consensus replica space. + /// /// # Panics - /// Panics if the shard id does not fit in `u8`, which is currently required - /// by the partition consensus replica id. + /// Panics if `self.id > u8::MAX` (256+). The simulator currently + /// caps replica count at `MAX_REPLICAS = 256`, so the cast always + /// succeeds in the only caller; the constraint is recorded here so + /// any future caller in production code reads it before the cast. 
pub fn init_partition(&mut self, namespace: IggyNamespace) where B: MessageBus + Clone, diff --git a/core/shard/src/metrics.rs b/core/shard/src/metrics.rs new file mode 100644 index 0000000000..330a813ac6 --- /dev/null +++ b/core/shard/src/metrics.rs @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Per-shard frame-drop accounting. +//! +//! [`ShardMetrics`] holds `frame_drops_total{shard_id, variant, reason}`, +//! bumped whenever an inter-shard `try_send` is rejected (`Full` / +//! `Disconnected`) or its target shard id is out of range +//! (`Unroutable`): +//! - [`crate::coordinator::ShardZeroCoordinator`] - fd-transfer delegation. +//! - the cross-shard forward closures built in [`crate::builder`]. +//! - `IggyShard::try_send_to_target` - consensus frames. +//! +//! The counter uses atomic interior mutability, safe to bump from `!Send` +//! compio reactor contexts. Each shard owns its own instance. It is not +//! yet exposed through a scrape endpoint; every drop site also logs via +//! `tracing`, so the counter is a structured complement to those logs +//! until a per-shard exporter lands. +//! +//! TODO(hubcio): register `frame_drops_total` with a per-shard prometheus +//! 
exporter so the counter is scrape-able; until then the `tracing` +//! drop-site logs are the only alertable signal. + +use prometheus_client::encoding::EncodeLabelSet; +use prometheus_client::metrics::counter::Counter; +use prometheus_client::metrics::family::Family; + +/// Label for `frame_drops_total`. +/// +/// `variant` describes the dropped frame class; `reason` is `"full"` or +/// `"disconnected"` per crossfire `TrySendError`, or `"unroutable"` when +/// the target shard id has no sender slot. +#[derive(Clone, Hash, Eq, PartialEq, EncodeLabelSet, Debug)] +pub struct FrameDropLabel { + pub shard_id: u16, + pub variant: &'static str, + pub reason: &'static str, +} + +/// Variant labels used in `frame_drops_total`. Exposed as constants to +/// catch typos at compile time and to keep the cardinality bounded. +/// +/// `FORWARD_CLIENT_SEND` ticks when the cross-shard client-reply forward +/// closure fails. Unlike `CONSENSUS` drops (which VSR retransmit +/// recovers), a `FORWARD_CLIENT_SEND` drop is terminal: the client never +/// receives the reply and request / response semantics break above the +/// bus. Operators should alert on the drop-site `tracing` logs (the +/// counter is not scrape-able yet, see the module doc) and size +/// `inbox_capacity` for the worst-case cross-shard reply burst. +/// `FORWARD_REPLICA_SEND` is the symmetric variant for replica forwards; +/// VSR retransmit covers its loss so it stays informational. +pub mod frame_drop_variant { + pub const CONSENSUS: &str = "consensus"; + pub const LIFECYCLE: &str = "lifecycle"; + pub const FD_TRANSFER: &str = "fd_transfer"; + pub const FORWARD_CLIENT_SEND: &str = "forward_client_send"; + pub const FORWARD_REPLICA_SEND: &str = "forward_replica_send"; +} + +/// Reason labels used in `frame_drops_total`. +/// +/// `UNROUTABLE` ticks when a frame's target shard id has no sender slot +/// (`target >= senders.len()`). 
Unreachable while every shard seeds its +/// `ShardsTable` identically at boot, but `shard_for` returns a stored +/// `u16` so the index is guarded rather than trusted. +pub mod frame_drop_reason { + pub const FULL: &str = "full"; + pub const DISCONNECTED: &str = "disconnected"; + pub const UNROUTABLE: &str = "unroutable"; +} + +/// Per-shard metric handles. +/// +/// Cheap to clone (`Arc` of a `Family` under the hood). Each shard owns +/// one instance produced by [`ShardMetrics::for_shard`]. +#[derive(Clone)] +pub struct ShardMetrics { + shard_id: u16, + frame_drops_total: Family, +} + +impl ShardMetrics { + /// Create a metrics handle bound to `shard_id`. + #[must_use] + pub fn for_shard(shard_id: u16) -> Self { + Self { + shard_id, + frame_drops_total: Family::default(), + } + } + + /// Increment `frame_drops_total{shard_id, variant, reason}` by 1. + pub fn record_frame_drop(&self, variant: &'static str, reason: &'static str) { + self.frame_drops_total + .get_or_create(&FrameDropLabel { + shard_id: self.shard_id, + variant, + reason, + }) + .inc(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn frame_drop_counter_increments_per_label_set() { + let metrics = ShardMetrics::for_shard(0); + metrics.record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL); + metrics.record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL); + metrics.record_frame_drop( + frame_drop_variant::CONSENSUS, + frame_drop_reason::DISCONNECTED, + ); + + let count = |variant, reason| { + metrics + .frame_drops_total + .get_or_create(&FrameDropLabel { + shard_id: 0, + variant, + reason, + }) + .get() + }; + + assert_eq!( + count(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL), + 2, + "two FULL drops land on one label set", + ); + assert_eq!( + count( + frame_drop_variant::CONSENSUS, + frame_drop_reason::DISCONNECTED, + ), + 1, + "a distinct reason gets its own counter", + ); + } +} diff --git a/core/shard/src/router.rs 
b/core/shard/src/router.rs index c09ae2c6f5..baf8d77195 100644 --- a/core/shard/src/router.rs +++ b/core/shard/src/router.rs @@ -15,12 +15,15 @@ // specific language governing permissions and limitations // under the License. +use crate::metrics::{frame_drop_reason, frame_drop_variant}; use crate::shards_table::{ShardsTable, calculate_shard_from_consensus_ns}; -use crate::{IggyShard, Receiver, ShardFrame}; +use crate::{IggyShard, LifecycleFrame, Receiver, ShardFrame}; use crossfire::TrySendError; use futures::FutureExt; use iggy_binary_protocol::{ - ConsensusError, ConsensusHeader, GenericHeader, Message, MessageBag, PrepareHeader, + CommitHeader, ConsensusHeader, DoViewChangeHeader, GenericHeader, Message, MessageBag, + Operation, PrepareHeader, PrepareOkHeader, RequestHeader, StartViewChangeHeader, + StartViewHeader, }; use iggy_common::sharding::IggyNamespace; use journal::{Journal, JournalHandle}; @@ -28,24 +31,100 @@ use message_bus::{ConnectionInstaller, MessageBus}; use metadata::impls::metadata::StreamsFrontend; use metadata::stm::StateMachine; +/// Decompose a [`MessageBag`] into the routing-relevant tuple +/// `(operation, namespace, generic_message)`. +/// +/// Single source of truth used by every dispatch entry point so the +/// operation / namespace extraction never drifts between call sites. 
+fn extract_routing(bag: MessageBag) -> (Operation, u64, Message) { + match bag { + MessageBag::Request(r) => { + let h = *r.header(); + (h.operation, h.namespace, r.into_generic()) + } + MessageBag::Prepare(p) => { + let h = *p.header(); + (h.operation, h.namespace, p.into_generic()) + } + MessageBag::PrepareOk(p) => { + let h = *p.header(); + (h.operation, h.namespace, p.into_generic()) + } + MessageBag::StartViewChange(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + MessageBag::DoViewChange(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + MessageBag::StartView(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + MessageBag::Commit(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + } +} + +/// Borrowing variant of [`extract_routing`] used by +/// [`Router::accept_frame_for_self`]. Reads `operation` and `namespace` +/// through [`Message::try_as_typed`] so the check no longer clones the +/// full `Message` (heap memcpy of `Owned`) on every +/// Consensus frame. Returns `None` when the command is unknown or the +/// typed view fails to validate; the caller treats either as +/// always-accept rather than a routing mismatch. 
+fn extract_routing_ref(message: &Message) -> Option<(Operation, u64)> { + use iggy_binary_protocol::Command2; + match message.header().command { + Command2::Request => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation, typed.header().namespace)) + } + Command2::Prepare => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation, typed.header().namespace)) + } + Command2::PrepareOk => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation, typed.header().namespace)) + } + Command2::StartViewChange => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + Command2::DoViewChange => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + Command2::StartView => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + Command2::Commit => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + _ => None, + } +} + /// Inter-shard dispatch logic. /// /// All messages — whether destined for a local or remote shard — are routed /// through the channel into the target shard's message pump. This ensures /// that every mutation on a shard is serialized through a single point (the /// pump), preventing concurrent access from independent async tasks. -impl IggyShard +impl IggyShard where B: MessageBus + ConnectionInstaller, T: ShardsTable, - R: Send + 'static, { - /// Classify a raw network message and route it to - /// the correct shard's message pump. - /// - /// Decomposes the generic message into its typed form (Request, Prepare, - /// or `PrepareOk`) to access the operation and namespace, then resolves - /// the target shard and enqueues the message via its channel sender. + /// Network-receive entry point. 
Classifies the raw + /// `Message` and routes it to the owning shard via + /// `route_typed`. pub fn dispatch(&self, message: Message) { let bag = match MessageBag::try_from(message) { Ok(bag) => bag, @@ -54,41 +133,38 @@ where return; } }; - let (operation, namespace, generic) = match bag { - MessageBag::Request(r) => { - let h = *r.header(); - (h.operation, h.namespace, r.into_generic()) - } - MessageBag::Prepare(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::PrepareOk(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::StartViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::DoViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::StartView(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::Commit(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - }; - let raw_namespace = namespace; - let partition_namespace = IggyNamespace::from_raw(raw_namespace); - let target = if operation.is_metadata() { - 0 - } else if operation.is_partition() { + let (operation, namespace, generic) = extract_routing(bag); + self.route_typed(operation, namespace, generic); + } + + /// Route a consensus-control message (`StartViewChange`, `DoViewChange`, + /// `StartView`, `Commit`) by hashing its raw `u64` consensus namespace. + /// Every node uses the same hash so the shard owning the consensus + /// group is deterministic across the cluster. + pub(crate) fn route_consensus_control( + &self, + message: Message, + namespace_u64: u64, + operation: Operation, + ) { + let target = calculate_shard_from_consensus_ns(namespace_u64, self.shard_count); + self.try_send_to_target(target, message, operation); + } + + /// Branch on operation class to pick the right routing rule. Used by + /// [`Self::dispatch`]. 
+ fn route_typed( + &self, + operation: Operation, + namespace_u64: u64, + generic: Message, + ) { + if operation.is_metadata() { + self.try_send_to_target(0, generic, operation); + return; + } + if operation.is_partition() { + let partition_namespace = IggyNamespace::from_raw(namespace_u64); let Some(target) = self.shards_table.shard_for(partition_namespace) else { tracing::error!( shard = self.id, @@ -99,144 +175,67 @@ where ); return; }; - target - } else { - // View-change / Commit messages carry an opaque u64 consensus - // namespace (not an `IggyNamespace`). Hash it with the same - // function the partition-plane lookup table uses so the - // shard owning the consensus group is deterministically the - // same across every node. Single-shard deployments trivially - // collapse to 0. - #[allow(clippy::cast_lossless)] - let shard_count = u32::try_from(self.senders.len()).unwrap_or(u32::MAX); - calculate_shard_from_consensus_ns(raw_namespace, shard_count) - }; - // `senders[target]` is a `crossfire::MTx`, which in compio is - // running on an io_uring reactor: a blocking `send` on a full - // inbox would park the reactor thread and stall every other - // connection on the shard. Drop instead - consensus recovers via - // WAL retransmit or a view change. - match self.senders[target as usize].try_send(ShardFrame::fire_and_forget(generic)) { - Ok(()) => {} - Err(TrySendError::Full(_)) => { - tracing::warn!( - shard = self.id, - target, - ?operation, - "dispatch: shard inbox full, message dropped" - ); - } - Err(TrySendError::Disconnected(_)) => { - tracing::warn!( - shard = self.id, - target, - ?operation, - "dispatch: shard inbox closed, message dropped" - ); - } + self.try_send_to_target(target, generic, operation); + return; } + self.route_consensus_control(generic, namespace_u64, operation); } - /// Dispatch a message and return a receiver that resolves when the target - /// shard has finished processing it. 
- /// - /// # Errors - /// Returns `ConsensusError` if the message cannot be routed. - pub fn dispatch_request( + /// Send `message` into `senders[target]`. Honors the `io_uring` reactor + /// constraint: never blocks; drops on `Full` / `Disconnected` and + /// records the drop in `frame_drops_total{variant=consensus}`. VSR + /// retransmit recovers consensus drops. A `target` past the end of + /// `senders` (a stored `u16` from `shard_for`, not a trusted index) + /// is dropped with `reason=unroutable` rather than panicking. + /// Metadata frames always pass `target = 0` here, since `is_metadata` + /// operations are owned by shard 0. + fn try_send_to_target( &self, + target: u16, message: Message, - ) -> Result, ConsensusError> { - let bag = MessageBag::try_from(message)?; - let (operation, namespace, generic) = match bag { - MessageBag::Request(r) => { - let h = *r.header(); - (h.operation, h.namespace, r.into_generic()) - } - MessageBag::Prepare(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::PrepareOk(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::StartViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::DoViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::StartView(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::Commit(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } + operation: Operation, + ) { + let Some(sender) = self.senders.get(target as usize) else { + self.metrics + .record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::UNROUTABLE); + tracing::error!( + shard = self.id, + target, + ?operation, + "dispatch: target shard id out of range, message dropped" + ); + return; }; - let raw_namespace = namespace; - let partition_namespace = 
IggyNamespace::from_raw(raw_namespace); - - // Determine which shard should handle a message given its operation and - // namespace. - // - // - Metadata operations always route to shard 0 (the control plane). - // - Partition operations route to the shard that owns the namespace, - // looked up via the [`ShardsTable`]. - // - Consensus control-plane (`StartViewChange`, `DoViewChange`, - // `StartView`, `Commit`) carries a raw `u64` consensus namespace; - // route it via a deterministic hash function so every node agrees - // on the owning shard without consulting the partitions table. - let target = if operation.is_metadata() { - 0 - } else if operation.is_partition() { - self.shards_table - .shard_for(partition_namespace) - .ok_or_else(|| { - ConsensusError::InvalidField(format!( - "namespace {raw_namespace} is not registered in shards_table" - )) - })? - } else { - #[allow(clippy::cast_lossless)] - let shard_count = u32::try_from(self.senders.len()).unwrap_or(u32::MAX); - calculate_shard_from_consensus_ns(raw_namespace, shard_count) - }; - // Create a frame and send it to the target shard. Same non- - // blocking `try_send` reason as `dispatch` above: blocking on a - // full inbox would park the io_uring reactor. - // - // On `Full` / `Disconnected` we drop the frame and let the - // caller's `rx` observe the disconnect (the `response_sender` - // inside the frame is dropped together with the frame). Callers - // see an early error rather than hanging. 
- let (frame, rx) = ShardFrame::::with_response(generic); - match self.senders[target as usize].try_send(frame) { + match sender.try_send(ShardFrame::consensus(message)) { Ok(()) => {} Err(TrySendError::Full(_)) => { + self.metrics + .record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL); tracing::warn!( shard = self.id, target, ?operation, - "dispatch_request: shard inbox full, message dropped" + "dispatch: shard inbox full, message dropped" ); } Err(TrySendError::Disconnected(_)) => { + self.metrics.record_frame_drop( + frame_drop_variant::CONSENSUS, + frame_drop_reason::DISCONNECTED, + ); tracing::warn!( shard = self.id, target, ?operation, - "dispatch_request: shard inbox closed, message dropped" + "dispatch: shard inbox closed, message dropped" ); } } - Ok(rx) } - /// Drain this shard's inbox and process each frame locally. + /// Drain this shard's inbox and process each frame locally until the + /// `stop` signal fires or the inbox disconnects, then drain any frames + /// still queued so in-flight requests still get a response. #[allow(clippy::future_not_send)] pub async fn run_message_pump(&self, stop: Receiver<()>) where @@ -254,14 +253,17 @@ where > + StreamsFrontend, { let mut loopback_buf = Vec::new(); + let mut namespace_scratch: Vec = Vec::new(); loop { futures::select! { _ = stop.recv().fuse() => break, frame = self.inbox.recv().fuse() => { match frame { Ok(frame) => { - self.process_frame(frame).await; - self.process_loopback(&mut loopback_buf).await; + if self.accept_frame_for_self(&frame) { + self.process_frame(frame).await; + self.process_loopback(&mut loopback_buf, &mut namespace_scratch).await; + } } Err(_) => break, } @@ -271,13 +273,79 @@ where // Drain remaining frames so in-flight requests get a response. 
while let Ok(frame) = self.inbox.try_recv() { - self.process_frame(frame).await; - self.process_loopback(&mut loopback_buf).await; + if self.accept_frame_for_self(&frame) { + self.process_frame(frame).await; + self.process_loopback(&mut loopback_buf, &mut namespace_scratch) + .await; + } + } + } + + /// Sanity check at pump entry: every Consensus frame routed through + /// [`Self::dispatch`] must land on the shard whose `id` matches + /// `senders[target]`. The ctor `assert_sender_ordering` proves + /// `senders[i].shard_id() == i`, but only checks the sender vec - + /// it cannot catch a receiver moved to the wrong runtime. This + /// check closes that gap. Non-Consensus frames, and Consensus frames + /// whose namespace is not (yet) routable, are always accepted; a + /// Consensus frame that resolves to a *different* shard fires a + /// `debug_assert_eq!` in test/debug builds and is logged + dropped + /// in release. Returns `true` when the frame may be processed, + /// `false` when it must be dropped. + fn accept_frame_for_self(&self, frame: &ShardFrame) -> bool { + // Single-shard clusters cannot misroute: every namespace maps to + // shard 0, so the per-frame routing re-derivation is pure waste. + if self.shard_count <= 1 { + return true; + } + let ShardFrame::Consensus(message) = frame else { + return true; + }; + let Some((operation, namespace_u64)) = extract_routing_ref(message) else { + return true; + }; + let expected = if operation.is_metadata() { + 0 + } else if operation.is_partition() { + let ns = IggyNamespace::from_raw(namespace_u64); + let Some(target) = self.shards_table.shard_for(ns) else { + // Namespace absent from the shards_table. Until runtime + // namespace propagation lands this cannot happen (all + // shards seed the table identically); once it does, an + // unpropagated entry is a transient state, not a routing + // bug - accept and log rather than panic or drop. 
+ tracing::error!( + shard = self.id, + stream = ns.stream_id(), + topic = ns.topic_id(), + partition = ns.partition_id(), + "Consensus frame namespace missing from shards_table; accepting frame" + ); + return true; + }; + target + } else { + calculate_shard_from_consensus_ns(namespace_u64, self.shard_count) + }; + if expected == self.id { + return true; } + debug_assert_eq!( + expected, self.id, + "shard {} pump received Consensus frame whose target_shard={expected}; \ + senders/runtime incorrectly bound", + self.id + ); + tracing::error!( + shard = self.id, + expected, + "Consensus frame routed to wrong shard; dropping frame to preserve safety" + ); + false } #[allow(clippy::future_not_send)] - async fn process_frame(&self, frame: ShardFrame) + async fn process_frame(&self, frame: ShardFrame) where B: MessageBus, MJ: JournalHandle, @@ -292,11 +360,21 @@ where Error = iggy_common::IggyError, > + StreamsFrontend, { - match frame.payload { - crate::ShardFramePayload::Consensus(message) => { + match frame { + ShardFrame::Consensus(message) => { self.on_message(message).await; } - crate::ShardFramePayload::ReplicaConnectionSetup { fd, replica_id } => { + ShardFrame::Lifecycle(payload) => self.process_lifecycle(payload).await, + } + } + + #[allow(clippy::future_not_send)] + async fn process_lifecycle(&self, payload: LifecycleFrame) + where + B: MessageBus, + { + match payload { + LifecycleFrame::ReplicaConnectionSetup { fd, replica_id } => { tracing::info!( shard = self.id, replica_id, @@ -306,7 +384,7 @@ where self.bus .install_replica_tcp_fd(fd, replica_id, self.on_replica_message.clone()); } - crate::ShardFramePayload::ClientConnectionSetup { fd, meta } => { + LifecycleFrame::ClientConnectionSetup { fd, meta } => { tracing::info!( shard = self.id, client_id = meta.client_id, @@ -316,7 +394,7 @@ where self.bus .install_client_fd(fd, meta, self.on_client_request.clone()); } - crate::ShardFramePayload::ClientWsConnectionSetup { fd, meta } => { + 
LifecycleFrame::ClientWsConnectionSetup { fd, meta } => { tracing::info!( shard = self.id, client_id = meta.client_id, @@ -326,18 +404,7 @@ where self.bus .install_client_ws_fd(fd, meta, self.on_client_request.clone()); } - crate::ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - self.bus.set_shard_mapping(replica_id, owning_shard); - } - crate::ShardFramePayload::ReplicaMappingClear { replica_id } => { - self.bus.remove_shard_mapping(replica_id); - } - crate::ShardFramePayload::ForwardReplicaSend { replica_id, msg } => { - // Fast path on the owning shard: re-enter `send_to_replica` - // which finds the local `BusSender` in the replicas registry. + LifecycleFrame::ForwardReplicaSend { replica_id, msg } => { if let Err(e) = self.bus.send_to_replica(replica_id, msg).await { tracing::debug!( shard = self.id, @@ -347,7 +414,7 @@ where ); } } - crate::ShardFramePayload::ForwardClientSend { client_id, msg } => { + LifecycleFrame::ForwardClientSend { client_id, msg } => { if let Err(e) = self.bus.send_to_client(client_id, msg).await { tracing::debug!( shard = self.id, @@ -357,35 +424,6 @@ where ); } } - crate::ShardFramePayload::ConnectionLost { replica_id } => { - // Shard 0 is the sole responder; the next reconnect sweep - // will re-dial the peer. Non-zero shards should not see this - // variant. - if self.id == 0 { - tracing::warn!(replica_id, "shard 0 observed replica connection loss"); - // If a coordinator is attached, let it broadcast the - // clear AND drop its tracked mapping so the periodic - // refresh task stops re-broadcasting a dead replica's - // mapping. Tests that bypass the coordinator fall back - // to the direct broadcast. 
- if let Some(coord) = &self.coordinator { - coord.forget_mapping(replica_id); - coord.broadcast_mapping_clear(replica_id); - } else { - crate::broadcast_mapping_clear(&self.senders, replica_id); - } - } else { - tracing::warn!( - shard = self.id, - replica_id, - "non-zero shard received ConnectionLost; dropping" - ); - } - } } - // TODO: once on_message returns an R (e.g. ShardResponse), send it - // back via frame.response_sender. For now the sender is dropped and - // the caller's receiver will observe a disconnect. - drop(frame.response_sender); } } diff --git a/core/shard/src/shards_table.rs b/core/shard/src/shards_table.rs index 0a0a940644..6829f28cdb 100644 --- a/core/shard/src/shards_table.rs +++ b/core/shard/src/shards_table.rs @@ -32,7 +32,11 @@ pub trait ShardsTable { fn shard_for(&self, namespace: IggyNamespace) -> Option; } -/// No-op shards table for single-shard setups. +/// Always-`None` impl. Satisfies the trait for test / simulator paths +/// that never route via the table (e.g. [`crate::IggyShard::without_inbox`]). +/// Do not use in production: the router drops any partition frame whose +/// `shard_for` returns `None`, so a real cluster wired to this impl would +/// silently shed every partition request. 
impl ShardsTable for () { fn shard_for(&self, _namespace: IggyNamespace) -> Option { None diff --git a/core/simulator/src/bus.rs b/core/simulator/src/bus.rs index d185e77be6..5e6769b77a 100644 --- a/core/simulator/src/bus.rs +++ b/core/simulator/src/bus.rs @@ -143,6 +143,10 @@ impl MessageBus for SimOutbox { Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } /// Newtype wrapper for shared [`SimOutbox`] that implements [`MessageBus`] @@ -172,4 +176,16 @@ impl MessageBus for SharedSimOutbox { ) -> Result<(), SendError> { self.0.send_to_replica(replica, data).await } + + fn set_connection_lost_fn(&self, f: message_bus::ConnectionLostFn) { + self.0.set_connection_lost_fn(f); + } + + fn set_replica_forward_fn(&self, f: message_bus::ReplicaForwardFn) { + self.0.set_replica_forward_fn(f); + } + + fn set_client_forward_fn(&self, f: message_bus::ClientForwardFn) { + self.0.set_client_forward_fn(f); + } } diff --git a/core/simulator/src/lib.rs b/core/simulator/src/lib.rs index 51c97ce37d..28d777a5f9 100644 --- a/core/simulator/src/lib.rs +++ b/core/simulator/src/lib.rs @@ -325,8 +325,10 @@ impl Simulator { futures::executor::block_on(replica.on_message(message)); let mut buf = Vec::new(); - futures::executor::block_on(replica.process_loopback(&mut buf)); - let loopback_count = futures::executor::block_on(replica.process_loopback(&mut buf)); + let mut ns_scratch: Vec = Vec::new(); + futures::executor::block_on(replica.process_loopback(&mut buf, &mut ns_scratch)); + let loopback_count = + futures::executor::block_on(replica.process_loopback(&mut buf, &mut ns_scratch)); debug_assert_eq!( loopback_count, 0, "on_ack must not re-enqueue loopback messages"