diff --git a/Cargo.lock b/Cargo.lock index ba141f6fa1..1e6c532e79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11334,6 +11334,7 @@ dependencies = [ "compio", "configs", "consensus", + "crossfire", "ctrlc", "cyper", "cyper-axum", @@ -11342,7 +11343,6 @@ dependencies = [ "err_trail", "error_set", "figlet-rs", - "flume 0.12.0", "fs2", "hash32 1.0.0", "human-repr", @@ -11464,6 +11464,7 @@ dependencies = [ "metadata", "papaya", "partitions", + "prometheus-client", "tracing", ] diff --git a/core/configs/src/server_config/sharding.rs b/core/configs/src/server_config/sharding.rs index 084c984e1c..fcf0396f05 100644 --- a/core/configs/src/server_config/sharding.rs +++ b/core/configs/src/server_config/sharding.rs @@ -21,11 +21,69 @@ use std::str::FromStr; use configs::ConfigEnv; -#[derive(Debug, Deserialize, Serialize, Default, ConfigEnv)] +/// Default capacity of the per-shard inter-shard inbox channel. Sized +/// comfortably above the consensus working set, which is roughly +/// `PIPELINE_PREPARE_QUEUE_MAX (= 8) * replica_count * directions` +/// frames in flight per shard, without allowing a runaway producer to +/// eat unbounded memory. Tunable via `[system.sharding] inbox_capacity` +/// in TOML. +/// +/// The capacity must also absorb the worst-case cross-shard client +/// Reply burst. Unlike consensus frames, client Replies have no VSR +/// retransmit path: a Reply lost on full inbox is gone and the client +/// times out. A reasonable lower bound is +/// `max_inflight_client_requests / num_shards` (assuming requests are +/// distributed evenly across owning shards) plus the consensus +/// headroom above. +/// +/// Consensus frames and client-reply forwards share this one channel, +/// so the two headrooms are not independent: a consensus burst or +/// retransmit storm can fill the inbox with consensus frames exactly +/// when a client Reply needs the space. 
A single `inbox_capacity` knob +/// cannot isolate the two frame classes - size it for the sum of both +/// worst cases occurring together. Watch the drop-site `tracing` logs +/// (and, once a per-shard exporter lands, the `frame_drops_total` +/// `{variant="forward_client_send"}` counter) to detect when the bound +/// is too low in production. +pub const DEFAULT_INBOX_CAPACITY: usize = 1024; + +/// Maximum permitted per-shard inbox depth. The channel is allocated +/// up-front per shard, so a runaway value here OOMs the process at boot. +/// `1 << 20` (~1M frames) is several orders of magnitude above any +/// realistic backpressure target and still fits comfortably in process +/// address space. +pub const INBOX_CAPACITY_MAX: usize = 1 << 20; + +const fn default_inbox_capacity() -> usize { + DEFAULT_INBOX_CAPACITY +} + +#[derive(Debug, Deserialize, Serialize, ConfigEnv)] pub struct ShardingConfig { #[serde(default)] #[config_env(leaf)] pub cpu_allocation: CpuAllocation, + /// Per-shard inter-shard inbox channel capacity. Bounded by design. + /// Drops on full inbox of consensus frames are recovered by VSR + /// retransmit. Drops of cross-shard client Reply frames are terminal: + /// the client never receives the reply (no in-protocol retransmit). + /// Both frame classes share this one channel, so a consensus burst + /// can starve client-reply forwards: size against the worst-case sum + /// of consensus working set + peak client-reply fan-out per shard + /// occurring together; see `DEFAULT_INBOX_CAPACITY` for the + /// rationale. Used by `core/server-ng`; the legacy server uses its + /// own hard-coded inbox sizing. 
+ #[serde(default = "default_inbox_capacity")] + pub inbox_capacity: usize, +} + +impl Default for ShardingConfig { + fn default() -> Self { + Self { + cpu_allocation: CpuAllocation::default(), + inbox_capacity: DEFAULT_INBOX_CAPACITY, + } + } } #[derive(Debug, Clone, PartialEq, Default)] diff --git a/core/configs/src/server_config/validators.rs b/core/configs/src/server_config/validators.rs index 1512961d6e..3e5a8fee84 100644 --- a/core/configs/src/server_config/validators.rs +++ b/core/configs/src/server_config/validators.rs @@ -23,7 +23,7 @@ use super::server::{ DataMaintenanceConfig, MessageSaverConfig, MessagesMaintenanceConfig, TelemetryConfig, }; use super::server::{MemoryPoolConfig, PersonalAccessTokenConfig, ServerConfig}; -use super::sharding::{CpuAllocation, ShardingConfig}; +use super::sharding::{CpuAllocation, INBOX_CAPACITY_MAX, ShardingConfig}; use super::system::SegmentConfig; use super::system::{CompressionConfig, LoggingConfig, PartitionConfig}; use crate::ConfigurationError; @@ -379,6 +379,23 @@ impl Validatable for MemoryPoolConfig { impl Validatable for ShardingConfig { fn validate(&self) -> Result<(), ConfigurationError> { + if self.inbox_capacity == 0 { + eprintln!( + "Invalid sharding configuration: inbox_capacity must be > 0 (crossfire silently \ + rounds 0 to 1, masking config errors)" + ); + return Err(ConfigurationError::InvalidConfigurationValue); + } + if self.inbox_capacity > INBOX_CAPACITY_MAX { + eprintln!( + "Invalid sharding configuration: inbox_capacity {} exceeds the {} cap (each \ + shard preallocates a channel of this size; oversizing here OOMs the process at \ + boot)", + self.inbox_capacity, INBOX_CAPACITY_MAX + ); + return Err(ConfigurationError::InvalidConfigurationValue); + } + let available_cpus = available_parallelism() .map_err(|_| { eprintln!("Failed to detect available CPU cores"); diff --git a/core/consensus/src/metadata_helpers.rs b/core/consensus/src/metadata_helpers.rs index 66885911e7..5904f1049e 100644 --- 
a/core/consensus/src/metadata_helpers.rs +++ b/core/consensus/src/metadata_helpers.rs @@ -353,6 +353,10 @@ mod tests { ) -> Result<(), SendError> { Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } // Register-retry replay: cached IS the register reply; bytes replayed diff --git a/core/consensus/src/plane_helpers.rs b/core/consensus/src/plane_helpers.rs index e2a500cae0..7e0a89cf44 100644 --- a/core/consensus/src/plane_helpers.rs +++ b/core/consensus/src/plane_helpers.rs @@ -472,6 +472,10 @@ mod tests { ) -> Result<(), SendError> { Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } fn prepare_message(op: u64, parent: u128, checksum: u128) -> Message { @@ -675,6 +679,10 @@ mod tests { self.sent.borrow_mut().push((replica, data)); Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } #[test] diff --git a/core/journal/src/file_storage.rs b/core/journal/src/file_storage.rs index 6517f5b908..0de1ac6730 100644 --- a/core/journal/src/file_storage.rs +++ b/core/journal/src/file_storage.rs @@ -31,7 +31,8 @@ pub struct FileStorage { #[allow(clippy::future_not_send)] impl FileStorage { - /// Open or create the file at `path`, setting `write_offset` to current file length. + /// Open or create the file at `path` in read-write mode, setting + /// `write_offset` to current file length. /// /// # Errors /// Returns an I/O error if the file cannot be opened or created. 
diff --git a/core/journal/src/prepare_journal.rs b/core/journal/src/prepare_journal.rs index 283db5b4d4..ac16d87113 100644 --- a/core/journal/src/prepare_journal.rs +++ b/core/journal/src/prepare_journal.rs @@ -44,7 +44,7 @@ const MAX_ENTRY_SIZE: u64 = 64 * 1024 * 1024; /// still contains them, but they become unreachable for recovery). /// /// **NOTE:** This number needs to be chosen in balance between number of -/// entries in [`core::consensus::pipeline_prepare_queue_max`]. Because this number controls +/// entries in [`consensus::PIPELINE_PREPARE_QUEUE_MAX`]. Because this number controls /// how many committed but not yet snapshotted entries that the buffer can /// hold. This may need to be tuned properly. pub(crate) const SLOT_COUNT: usize = 1024; @@ -117,9 +117,47 @@ const fn slot_for_op(op: u64) -> usize { op as usize % SLOT_COUNT } +/// Repair a damaged WAL tail by truncating to `pos`, or surface a loud +/// error when truncation would be unsafe. +/// +/// Truncation is only sound for a torn final append, which writes at most +/// one entry's worth of bytes. If more than `MAX_ENTRY_SIZE` bytes follow +/// `pos`, the damage is mid-file: truncating would silently discard every +/// committed entry after it, so this hard-errors instead of repairing. +#[allow(clippy::future_not_send)] +async fn truncate_or_fail( + storage: &FileStorage, + pos: u64, + reason: &'static str, +) -> Result<(), JournalError> { + let trailing = storage.file_len().saturating_sub(pos); + // Sound only because this WAL appends exactly one entry per `append` + // + fsync, so a torn tail is at most one entry wide. A batched-append + // WAL could leave a torn tail many entries wide, and this + // `> MAX_ENTRY_SIZE` check would misclassify it as mid-file damage. 
+ if trailing > MAX_ENTRY_SIZE { + return Err(JournalError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "mid-file WAL corruption at pos {pos}: {reason}; {trailing} bytes \ + follow (> MAX_ENTRY_SIZE {MAX_ENTRY_SIZE}), refusing to truncate \ + and discard committed entries" + ), + ))); + } + storage.truncate(pos).await?; + // The repair must be crash-durable. `FileStorage::truncate` is a + // bare `set_len`; without this fsync a power loss right after the + // repair re-presents the torn tail on the next boot. Mirrors the + // write-then-fsync the `append` path already does. + storage.fsync().await?; + Ok(()) +} + #[allow(clippy::cast_possible_truncation)] impl PrepareJournal { - /// Open the WAL file, scanning forward to rebuild the in-memory index. + /// Open the WAL file in read-write mode, scanning forward to rebuild + /// the in-memory index. /// /// `snapshot_op` is the highest op that has been durably snapshotted. /// It must be provided so that `append()` can detect slot collisions @@ -133,6 +171,11 @@ impl PrepareJournal { #[allow(clippy::future_not_send)] pub async fn open(path: &Path, snapshot_op: u64) -> Result { let storage = FileStorage::open(path).await?; + Self::scan(storage, snapshot_op).await + } + + #[allow(clippy::future_not_send)] + async fn scan(storage: FileStorage, snapshot_op: u64) -> Result { let file_len = storage.file_len(); let mut headers: Vec> = vec![None; SLOT_COUNT]; let mut offsets: Vec> = vec![None; SLOT_COUNT]; @@ -143,27 +186,40 @@ impl PrepareJournal { while pos + HEADER_SIZE as u64 <= file_len { // Read the 256-byte header header_buf = storage.read_at(pos, header_buf).await?; - let header: PrepareHeader = - *bytemuck::checked::from_bytes::(&header_buf); + // `try_from_bytes` so an invalid bit pattern (e.g. a corrupt + // `command` byte yielding no `Command2` variant) routes through + // `truncate_or_fail` instead of panicking the recovery thread. 
+ let Ok(header_ref) = bytemuck::checked::try_from_bytes::(&header_buf) + else { + truncate_or_fail(&storage, pos, "corrupt header (invalid bit pattern)").await?; + break; + }; + let header: PrepareHeader = *header_ref; // Validate: must be a Prepare command with sane size if header.command != Command2::Prepare || (header.size as usize) < HEADER_SIZE || u64::from(header.size) > MAX_ENTRY_SIZE { - // Corrupt or non-prepare entry, truncate here - storage.truncate(pos).await?; + truncate_or_fail(&storage, pos, "corrupt or non-prepare entry").await?; break; } let entry_size = u64::from(header.size); + // TODO(hubcio): verify `header.checksum` / `header.checksum_body` + // against the entry body during scan and route a mismatch + // through `truncate_or_fail`. Blocked on the writer side: the + // `PrepareHeader` projection in consensus builds prepares with + // `..Default::default()` so the integrity fields are always 0. + // Until a producer computes them, verification here would be + // trivially-passing noise. Without it, a body bit-flip that + // leaves the header valid is replayed silently as corrupt + // state. + // Check if the full entry fits if pos + entry_size > file_len { - // Truncated entry at tail - // This handles the case where crash happened during write and - // only header was written and body was not. so we truncate the file to the start of the entry. - storage.truncate(pos).await?; + truncate_or_fail(&storage, pos, "truncated entry at tail").await?; break; } @@ -185,7 +241,7 @@ impl PrepareJournal { // If there are leftover bytes less than a header, truncate them if pos < storage.file_len() { - storage.truncate(pos).await?; + truncate_or_fail(&storage, pos, "leftover bytes shorter than a header").await?; } Ok(Self { @@ -216,9 +272,13 @@ impl PrepareJournal { /// Advance the snapshot watermark. The caller must ensure `op` is /// monotonically increasing and corresponds to a durable snapshot. 
/// + /// # Errors + /// Returns `JournalError::Io` if a future durability requirement + /// surfaces an I/O failure; currently infallible. + /// /// # Panics /// Panics if `op` is less than the current snapshot watermark. - pub fn set_snapshot_op(&self, op: u64) { + pub fn set_snapshot_op(&self, op: u64) -> Result<(), JournalError> { assert!( op >= self.snapshot_op.get(), "snapshot_op must be monotonically increasing: {} -> {}", @@ -226,6 +286,7 @@ impl PrepareJournal { op ); self.snapshot_op.set(op); + Ok(()) } /// Access the underlying storage (for fsync in tests, etc.). @@ -322,6 +383,20 @@ impl Journal for PrepareJournal { // Advance the snapshot watermark so future appends treat // drained ops as safe to overwrite. + // + // TODO(hubcio): this advances the in-memory watermark before the + // WAL rewrite below (tmp create -> write live entries -> fsync -> + // rename -> fsync parent -> reopen) is durable. Any `?` failure + // between here and `reopen()` leaves `snapshot_op` advanced while + // the WAL file and the in-memory index still hold the un-drained + // ops. The next `append()` that wraps onto one of those slots + // then sees `existing.op <= snapshot_op`, passes the slot + // collision check, and silently evicts a live entry from the + // index instead of panicking. The entry survives on disk but + // becomes unreachable: consensus can no longer answer + // `RetransmitPrepares` for it and a lagging peer stalls until a + // view change. Fix: advance `snapshot_op` only after `reopen()` + // returns `Ok`. 
if end_op > self.snapshot_op.get() { self.snapshot_op.set(end_op); } @@ -588,6 +663,78 @@ mod tests { assert_eq!(entry.header().op, 1); } + #[compio::test] + async fn truncate_or_fail_durably_repairs_torn_tail() { + let dir = tempdir().unwrap(); + let path = dir.path().join("journal.wal"); + + let good_len = { + let journal = PrepareJournal::open(&path, 0).await.unwrap(); + journal.append(make_prepare(1, 64)).await.unwrap(); + journal.append(make_prepare(2, 64)).await.unwrap(); + journal.storage.fsync().await.unwrap(); + journal.storage.file_len() + }; + + // Simulate a writer crash mid-append: junk bytes past the last + // good entry, shorter than MAX_ENTRY_SIZE so it reads as a torn + // tail rather than mid-file corruption. + { + let storage = FileStorage::open(&path).await.unwrap(); + storage.write_append(vec![0xAB_u8; 16]).await.unwrap(); + storage.fsync().await.unwrap(); + } + + // The recovery repair path truncates back to the last good entry. + { + let storage = FileStorage::open(&path).await.unwrap(); + truncate_or_fail(&storage, good_len, "torn tail test") + .await + .unwrap(); + } + + // The repair is durable: a fresh open sees the file ending + // exactly at the last good entry, with no torn tail to re-repair. + assert_eq!(std::fs::metadata(&path).unwrap().len(), good_len); + let journal = PrepareJournal::open(&path, 0).await.unwrap(); + assert_eq!(journal.last_op(), Some(2)); + assert!(journal.header(3).is_none()); + } + + #[compio::test] + async fn open_rejects_mid_file_corruption() { + let dir = tempdir().unwrap(); + let path = dir.path().join("journal.wal"); + + // A corrupt header followed by more than MAX_ENTRY_SIZE of + // trailing bytes is mid-file damage, not a torn final append. A + // read-write open must hard-error instead of silently truncating + // and discarding every committed entry after the corruption. 
+ { + use std::io::Write; + let mut file = std::fs::File::create(&path).unwrap(); + // All-0xFF is not a valid `Command2`/`Operation` bit pattern, + // so `try_from_bytes` rejects the header. + file.write_all(&[0xFF_u8; HEADER_SIZE]).unwrap(); + // Sparse-extend past MAX_ENTRY_SIZE so the corruption at pos 0 + // sits far from EOF without writing 64 MiB. + file.set_len(MAX_ENTRY_SIZE + HEADER_SIZE as u64).unwrap(); + file.sync_all().unwrap(); + } + + let size_before = std::fs::metadata(&path).unwrap().len(); + let err = PrepareJournal::open(&path, 0).await; + assert!( + err.is_err(), + "mid-file corruption must hard-error, not truncate" + ); + let size_after = std::fs::metadata(&path).unwrap().len(); + assert_eq!( + size_before, size_after, + "a rejected mid-file scan must not truncate the WAL" + ); + } + #[compio::test] async fn iter_headers_from() { let dir = tempdir().unwrap(); @@ -637,7 +784,7 @@ mod tests { // Op 3 goes to slot 3 journal.append(make_prepare(3, 32)).await.unwrap(); // Mark op 3 as snapshotted - safe to evict - journal.set_snapshot_op(3); + journal.set_snapshot_op(3).unwrap(); // Op 3 + SLOT_COUNT goes to the same slot, evicting op 3 let wraparound_op = 3 + SLOT_COUNT as u64; journal diff --git a/core/message_bus/src/client_listener/mod.rs b/core/message_bus/src/client_listener/mod.rs index 2d8a9ea6c9..2eea83bd99 100644 --- a/core/message_bus/src/client_listener/mod.rs +++ b/core/message_bus/src/client_listener/mod.rs @@ -25,7 +25,7 @@ //! peer cannot block the accept loop. //! - The `on_accepted` callback is responsible for minting a //! `client_id`. The plain TCP and pre-upgrade WS paths route to the -//! owning shard via `ShardFramePayload::ClientConnectionSetup` and +//! owning shard via `shard::LifecycleFrame::ClientConnectionSetup` and //! `ClientWsConnectionSetup` respectively. TCP-TLS, WSS, and QUIC //! stay shard-0 terminal: their connection state is not serialisable //! 
so the callback installs locally on shard 0 instead of shipping a diff --git a/core/message_bus/src/client_listener/tcp.rs b/core/message_bus/src/client_listener/tcp.rs index e8b3ab8cec..eee380185f 100644 --- a/core/message_bus/src/client_listener/tcp.rs +++ b/core/message_bus/src/client_listener/tcp.rs @@ -44,7 +44,7 @@ use tracing::{debug, error, info}; pub async fn bind(addr: SocketAddr) -> Result<(TcpListener, SocketAddr), IggyError> { // `SO_REUSEPORT` intentionally not set: only shard 0 binds the client // listener. The shard-0 coordinator round-robins accepts to owning - // shards via `ShardFramePayload::ClientConnectionSetup`. + // shards via `shard::LifecycleFrame::ClientConnectionSetup`. let opts = SocketOpts::new().nodelay(true); let listener = TcpListener::bind_with_options(addr, &opts) .await diff --git a/core/message_bus/src/client_listener/tcp_tls.rs b/core/message_bus/src/client_listener/tcp_tls.rs index 7e70044248..756795e8cf 100644 --- a/core/message_bus/src/client_listener/tcp_tls.rs +++ b/core/message_bus/src/client_listener/tcp_tls.rs @@ -59,7 +59,7 @@ use tracing::{debug, error, info}; /// time; the install path re-applies on every accepted stream because /// Linux does not propagate listener options to accepted sockets. /// `SO_KEEPALIVE` is intentionally NOT set (see -/// [`crate::socket_opts`]): SDK clients manage their own keepalive +/// `socket_opts`): SDK clients manage their own keepalive /// policy at the application layer and replica<->replica liveness is /// observed by VSR heartbeats, so the kernel timer would only race the /// app-level signal. diff --git a/core/message_bus/src/client_listener/ws.rs b/core/message_bus/src/client_listener/ws.rs index 90a6aa2668..20775cbb96 100644 --- a/core/message_bus/src/client_listener/ws.rs +++ b/core/message_bus/src/client_listener/ws.rs @@ -20,7 +20,7 @@ //! Runs only on shard 0. The accept loop performs no protocol work //! beyond `TcpListener::accept`: every accepted stream is handed //! 
verbatim to the supplied callback, which dups the fd and ships a -//! `ShardFramePayload::ClientWsConnectionSetup` frame to the +//! `shard::LifecycleFrame::ClientWsConnectionSetup` frame to the //! round-robin-selected target shard. The HTTP-Upgrade handshake runs //! on the owning shard inside //! [`crate::ConnectionInstaller::install_client_ws_fd`]; no @@ -30,7 +30,7 @@ //! well-defined. The WS state machine only materialises after the //! upgrade, on the owning shard, where it can stay non-`Send`. //! -//! `ShardFramePayload::ClientWsConnectionSetup` is defined in +//! `shard::LifecycleFrame::ClientWsConnectionSetup` is defined in //! `core/shard/src/lib.rs`; the rustdoc cannot intra-link across crates //! without pulling `shard` in as a doc-only dep. @@ -47,7 +47,7 @@ use tracing::{debug, error, info}; /// /// Mirrors [`crate::client_listener::tcp::bind`] in shape: `TCP_NODELAY` /// on by default; `SO_KEEPALIVE` intentionally NOT set (see -/// [`crate::socket_opts`]). The receiving shard re-applies socket +/// `socket_opts`). The receiving shard re-applies socket /// options on the dup'd fd via the existing client-install path, so /// kernel-level options propagate end-to-end. /// @@ -58,7 +58,7 @@ use tracing::{debug, error, info}; pub async fn bind(addr: SocketAddr) -> Result<(TcpListener, SocketAddr), IggyError> { // `SO_REUSEPORT` intentionally not set: only shard 0 binds the WS // listener. The shard-0 coordinator round-robins accepts to owning - // shards via `ShardFramePayload::ClientWsConnectionSetup`. + // shards via `shard::LifecycleFrame::ClientWsConnectionSetup`. 
let opts = SocketOpts::new().nodelay(true); let listener = TcpListener::bind_with_options(addr, &opts) .await diff --git a/core/message_bus/src/error.rs b/core/message_bus/src/error.rs index 85ae33b7db..7d5fdd7df5 100644 --- a/core/message_bus/src/error.rs +++ b/core/message_bus/src/error.rs @@ -26,12 +26,26 @@ use thiserror::Error; /// The client- and replica-keyed variants separate three physically /// distinct failure modes so operators can tell apart routing bugs from /// transient disconnects: -/// - `*NotFound` / `*NotConnected` — the key is unknown to this shard's +/// - `*NotFound` / `*NotConnected` - the key is unknown to this shard's /// local registry (genuine peer state, often recoverable by reconnect). -/// - `*RouteMissing` — the bus cannot forward to the owning shard +/// - `*RouteMissing` - the bus cannot forward to the owning shard /// because no forward fn was installed (bootstrap ordering bug). -/// - `*ForwardFailed` — the forward fn rejected the frame (inter-shard +/// - `*ForwardFailed` - the forward fn rejected the frame (inter-shard /// queue full, shutdown, etc.). +/// +/// Recovery asymmetry between forward variants: +/// - `ReplicaForwardFailed` is recoverable in-protocol. The dropped frame +/// is a consensus message (`Prepare` / `PrepareOk` / view-change / +/// `Commit`); VSR's WAL-driven retransmit re-sends it once the inbox +/// drains. +/// - `ClientForwardFailed` is terminal. Client Reply payloads live above +/// the bus and have no retransmit path. A reply lost on full inbox is +/// gone, the client times out, and request / response semantics break. +/// Operators must size `inbox_capacity` for the worst-case cross-shard +/// reply burst and alert on the drop-site `tracing` logs. The +/// `frame_drops_total` `{variant="forward_client_send"}` counter is +/// the structured complement, scrape-able once a per-shard exporter +/// lands. 
#[derive(Debug, Error)] #[non_exhaustive] pub enum SendError { @@ -41,6 +55,11 @@ pub enum SendError { #[error("client {0}: no inter-shard forward fn installed")] ClientRouteMissing(u128), + /// The owning shard's inter-shard inbox rejected the forward frame + /// (full or disconnected). Unlike `ReplicaForwardFailed`, no VSR + /// retransmit covers this drop: the client Reply payload was moved + /// into the frame and is now gone. Callers must treat this as a + /// final failure for the affected request; the client will time out. #[error("client {0}: inter-shard forward failed")] ClientForwardFailed(u128), diff --git a/core/message_bus/src/installer/mod.rs b/core/message_bus/src/installer/mod.rs index 116f887ffe..6af7336578 100644 --- a/core/message_bus/src/installer/mod.rs +++ b/core/message_bus/src/installer/mod.rs @@ -55,7 +55,7 @@ use std::rc::Rc; use tracing::warn; /// Operations a shard needs to perform on its local bus when the router -/// receives an inter-shard connection-setup or mapping frame. +/// receives an inter-shard connection-setup frame. /// /// The production implementation is on `Rc`. The simulator /// does not exercise this path; if it ever does, add a no-op impl on @@ -87,14 +87,6 @@ pub trait ConnectionInstaller { /// subprotocol negotiation: the caller (server-ng) gates command /// access via the LOGIN allowlist. fn install_client_ws_fd(&self, fd: DupedFd, meta: ClientConnMeta, on_request: RequestHandler); - - /// Update the replica -> owning shard mapping used by the `send_to_replica` - /// slow path on non-owning shards. - fn set_shard_mapping(&self, replica: u8, owning_shard: u16); - - /// Forget the replica -> owning shard mapping (e.g. after a connection - /// loss, before the next allocate). 
- fn remove_shard_mapping(&self, replica: u8); } impl ConnectionInstaller for Rc { @@ -140,12 +132,4 @@ impl ConnectionInstaller for Rc { }); self.track_background(handle); } - - fn set_shard_mapping(&self, replica: u8, owning_shard: u16) { - IggyMessageBus::set_shard_mapping(self, replica, owning_shard); - } - - fn remove_shard_mapping(&self, replica: u8) { - IggyMessageBus::remove_shard_mapping(self, replica); - } } diff --git a/core/message_bus/src/installer/replica.rs b/core/message_bus/src/installer/replica.rs index 3c20fa0a3a..6e1d07cdfa 100644 --- a/core/message_bus/src/installer/replica.rs +++ b/core/message_bus/src/installer/replica.rs @@ -82,19 +82,20 @@ pub fn install_replica_conn( async_channel::bounded::>(bus.peer_queue_capacity()); // Writer and reader both observe abnormal close and used to fire - // `notify_connection_lost` twice per disconnect, causing shard 0 to - // broadcast two `ReplicaMappingClear` rounds and churn the mapping. + // `notify_connection_lost` twice per disconnect, double-clearing the + // owner-table slot and double-firing any installed test callback. // Shared one-shot guard: whichever post-loop runs first wins. let notified = Rc::new(Cell::new(false)); // If the registry insert below races with a concurrent install for // the same peer id and loses, both spawned halves must skip their // post-loop cleanup: the loser's `replicas().remove` / // `close_peer_if_token_matches` calls would no-op against the winner's - // generation token (so they can't evict the live entry), but - // `notify_connection_lost` has no token guard and would still broadcast - // a spurious mapping-clear round. `compio::runtime::JoinHandle::drop` - // does not cancel the spawned task, so we have to tell the tasks to - // stand down in-band. 
+ // generation token (so they can't evict the live entry), and + // `notify_connection_lost` stands down whenever a live registry entry + // exists - but the loser also drives `replica_dispatch_loop`, which + // must never hand the winner's replica id to `on_message`. + // `compio::runtime::JoinHandle::drop` does not cancel the spawned + // task, so we have to tell the tasks to stand down in-band. let install_aborted = Rc::new(Cell::new(false)); // Generation token published by the registry on a successful insert. @@ -266,6 +267,12 @@ pub fn install_replica_conn( ) { Ok(token) => { install_token.set(Some(token)); + // Stamp this shard as the owner so every shard's + // `send_to_replica` slow path can route to us immediately. The + // matching CAS-clear runs from + // `IggyMessageBus::notify_connection_lost` on either of the + // post-loop guards firing. + bus.mark_replica_owned(peer_id); } Err(rejected) => { // Tell both halves to stand down: the winner's entry is live and diff --git a/core/message_bus/src/installer/tcp_tls.rs b/core/message_bus/src/installer/tcp_tls.rs index d9468f6d0f..3497aac79a 100644 --- a/core/message_bus/src/installer/tcp_tls.rs +++ b/core/message_bus/src/installer/tcp_tls.rs @@ -41,7 +41,7 @@ use tracing::warn; /// handshake starts, matching [`super::tcp::install_client_tcp`]'s /// plaintext behaviour. Linux does not propagate it from the listener /// to accepted sockets, so toggling here is required. `SO_KEEPALIVE` -/// is intentionally NOT set; see [`crate::socket_opts`]. +/// is intentionally NOT set; see `socket_opts`. 
/// /// TCP-TLS is shard-0 terminal: the rustls connection state machine /// is non-serialisable and tied to the local task; pre-handshake the diff --git a/core/message_bus/src/installer/wss.rs b/core/message_bus/src/installer/wss.rs index 3ba359c7f6..32db26cb4c 100644 --- a/core/message_bus/src/installer/wss.rs +++ b/core/message_bus/src/installer/wss.rs @@ -41,7 +41,7 @@ use tracing::warn; /// /// `TCP_NODELAY` is applied pre-handshake for symmetry with /// [`super::tcp_tls::install_client_tcp_tls`]. `SO_KEEPALIVE` is -/// intentionally NOT set; see [`crate::socket_opts`]. +/// intentionally NOT set; see `socket_opts`. /// /// WSS is shard-0 terminal for the same reasons as the TCP-TLS plane; /// see [`super::tcp_tls::install_client_tcp_tls`] for the rustls diff --git a/core/message_bus/src/lib.rs b/core/message_bus/src/lib.rs index a927bf8c6a..9b1163d341 100644 --- a/core/message_bus/src/lib.rs +++ b/core/message_bus/src/lib.rs @@ -59,7 +59,7 @@ //! - fd-delegation ([`fd_transfer`]) is TCP-only. TLS / QUIC //! connections have no dupable plaintext fd, so shard 0 terminates //! and forwards `Frozen` over the existing -//! inter-shard flume. +//! inter-shard crossfire channel. //! - 0-RTT stays disabled by default on any future QUIC path. Per- //! command opt-in requires a checked-in idempotence audit. //! 
@@ -84,8 +84,7 @@ pub mod framing; pub mod installer; pub mod lifecycle; pub mod replica; -#[doc(hidden)] -pub mod socket_opts; +pub(crate) mod socket_opts; pub mod transports; pub use config::{IOV_MAX_LIMIT, MessageBusConfig, QuicTuning, WebSocketConfig}; @@ -96,7 +95,7 @@ pub use installer::conn_info::{ }; pub use lifecycle::{ BusMessage, BusReceiver, BusSender, ConnectionRegistry, DrainOutcome, FusedShutdown, - RejectedRegistration, ReplicaRegistry, Shutdown, ShutdownToken, + ReplicaRegistry, Shutdown, ShutdownToken, }; pub use transports::tls::TlsServerCredentials; @@ -105,26 +104,125 @@ use configs::server_ng::ServerNgConfig; use iggy_binary_protocol::consensus::MESSAGE_ALIGN; use iggy_binary_protocol::consensus::iobuf::Frozen; use iggy_binary_protocol::{GenericHeader, Message}; -use std::cell::RefCell; -use std::collections::HashMap; +use std::array; +use std::cell::{OnceCell, RefCell}; use std::net::SocketAddr; use std::rc::Rc; +use std::sync::Arc; +use std::sync::atomic::{AtomicU16, Ordering}; use std::time::Duration; -/// Callback for forwarding a consensus message to a remote shard. +/// Maximum number of replicas a single cluster supports. Replica ids are +/// `u8`, so the address space is 0..=255. +pub const MAX_REPLICAS: usize = 256; + +/// Sentinel stored in [`ReplicaOwnerTable`] slots that have no current owner. +/// +/// `u16::MAX` is reserved by the cluster bootstrap so it can never be a real +/// shard id: `bootstrap` rejects any cluster whose shard count does not fit +/// in `u16` or is `>= OWNER_NONE`, returning `InvalidShardsCount` rather +/// than minting a shard id that collides with this sentinel. +pub const OWNER_NONE: u16 = u16::MAX; + +/// Shared atomic owner table mapping `replica_id` to `owning_shard_id`. +/// +/// One Arc-cloned instance is allocated per cluster at bootstrap and +/// shared across every shard's [`IggyMessageBus`]. 
The owning shard +/// stamps its id into a slot when an inbound replica connection passes +/// the registry-insert race; the same shard CAS-clears the slot when +/// its connection dies (`notify_connection_lost`). Non-owning shards +/// only ever read. +/// +/// Lock-free by construction: every operation is a single relaxed-or- +/// stronger atomic on a fixed `[AtomicU16; MAX_REPLICAS]`. Sized for +/// 256 slots upfront, so install / disconnect cost is `O(1)` and no +/// hashing or allocation happens on the hot path. +/// +/// The table is the sole authority for cross-shard replica routing: +/// `send_to_replica`'s slow path and [`IggyMessageBus::owning_shard`] +/// both read it directly. No separate broadcast or reconcile loop is +/// involved. +pub struct ReplicaOwnerTable { + slots: [AtomicU16; MAX_REPLICAS], +} + +impl ReplicaOwnerTable { + /// Build a fresh table with every slot initialised to [`OWNER_NONE`]. + #[must_use] + pub fn new() -> Self { + Self { + slots: array::from_fn(|_| AtomicU16::new(OWNER_NONE)), + } + } + + /// Stamp `shard_id` as the current owner of `replica_id`. Overwrites + /// any previous owner; the caller is responsible for sequencing this + /// against the corresponding registry insert (in + /// [`installer::install_replica_conn`]). + pub fn set(&self, replica_id: u8, shard_id: u16) { + self.slots[usize::from(replica_id)].store(shard_id, Ordering::Release); + } + + /// Clear the slot iff its current owner is `shard_id`. Returns + /// `true` when the CAS succeeded. + /// + /// The CAS prevents a stale post-loop on shard A from clobbering a + /// slot that has since been re-registered on shard B. + pub fn clear_if_owned_by(&self, replica_id: u8, shard_id: u16) -> bool { + self.slots[usize::from(replica_id)] + .compare_exchange(shard_id, OWNER_NONE, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + } + + /// Read the current owner of `replica_id`. Returns `None` when the + /// slot is [`OWNER_NONE`]. 
+    #[must_use]
+    pub fn owner(&self, replica_id: u8) -> Option<u16> {
+        match self.slots[usize::from(replica_id)].load(Ordering::Acquire) {
+            OWNER_NONE => None,
+            owner => Some(owner),
+        }
+    }
+}
+
+impl Default for ReplicaOwnerTable {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Callback for forwarding a consensus replica message to the owning shard.
 ///
 /// Provided by the shard layer at bus construction and installed via
-/// [`IggyMessageBus::set_replica_forward_fn`] / [`IggyMessageBus::set_client_forward_fn`].
-/// Arguments: `(target_shard_id, message)`. Returns `Ok(())` on successful
-/// enqueue into the inter-shard channel, or [`SendError::RoutingFailed`] on
-/// `try_send` failure.
+/// [`IggyMessageBus::set_replica_forward_fn`]. Arguments:
+/// `(replica_id, owning_shard_id, message)`. Returns `Ok(())` on successful
+/// enqueue into the inter-shard channel.
+///
+/// `replica_id` is carried explicitly so the receiver can rebuild
+/// `ShardFrame::Lifecycle`'s `ForwardReplicaSend { replica_id, msg }`
+/// payload; the bus' slow path knows the id but the closure body, which lives
+/// in the shard layer, would otherwise have to re-derive it from the message.
 ///
 /// Asymmetry vs the `Rc` siblings (`AcceptedReplicaFn`,
 /// `AcceptedClientFn`, `ConnectionLostFn`): the forward fn has a single
 /// owner (the bus) and is installed once at bootstrap; `Box` suffices.
 /// The `Rc` siblings are shared with accept loops and connection tasks
 /// that outlive the caller and need independent ownership.
-pub type ShardForwardFn = Box<dyn Fn(u16, Vec<Frozen>) -> Result<(), SendError>>;
+pub type ReplicaForwardFn = Box<dyn Fn(u8, u16, Vec<Frozen>) -> Result<(), SendError>>;
+
+/// Callback for forwarding a client-bound message to the owning shard.
+///
+/// Provided by the shard layer at bus construction and installed via
+/// [`IggyMessageBus::set_client_forward_fn`]. Arguments:
+/// `(client_id, owning_shard_id, message)`. Returns `Ok(())` on successful
+/// enqueue into the inter-shard channel.
+/// +/// `owning_shard_id` is the value returned by [`client_id_owning_shard`]; +/// the slow path passes it pre-resolved so the closure does not redo the +/// shift. `client_id` is carried so the receiver can rebuild the +/// `ShardFrame::Lifecycle`'s `ForwardClientSend { client_id, msg }` +/// payload. +pub type ClientForwardFn = Box) -> Result<(), SendError>>; /// Callback invoked on every successful replica handshake. /// @@ -161,8 +259,14 @@ pub struct AcceptedQuicConn { impl AcceptedQuicConn { /// Bundle a freshly-accepted QUIC connection and its first /// bidirectional stream pair. + /// + /// `pub(crate)` by design: the constructor's signature mentions + /// `compio_quic::Connection`, `SendStream`, and `RecvStream`, all + /// kept off the bus's public `SemVer` surface for the same reason + /// [`Self::into_parts`] is crate-private. The QUIC listener in + /// [`crate::client_listener::quic`] is the only mint site. #[must_use] - pub const fn new( + pub(crate) const fn new( connection: compio_quic::Connection, streams: (compio_quic::SendStream, compio_quic::RecvStream), ) -> Self { @@ -220,7 +324,7 @@ pub type AcceptedQuicClientFn = std::rc::Rc; /// Fires after shard 0's WS listener accepts a raw TCP socket. The /// HTTP-Upgrade handshake has NOT run yet; the callback hands the /// raw stream off to the owning shard via inter-shard fd-shipping -/// (`ShardFramePayload::ClientWsConnectionSetup`). The owning shard +/// (`shard::LifecycleFrame::ClientWsConnectionSetup`). The owning shard /// runs the upgrade locally; no subprotocol is negotiated. The /// shipped fd is plain TCP at ship-time, so fd-delegation (which /// requires a dupable plaintext fd) stays well-defined. @@ -291,6 +395,34 @@ pub type ConnectionLostFn = std::rc::Rc; /// on first poll. Future transports (QUIC, TLS) that would introduce real /// yields must advertise that change in their own doc; consensus code /// relies on the no-yield property to reason about reentrancy. 
+/// +/// # Recovery asymmetry between `send_to_replica` and `send_to_client` +/// +/// `send_to_replica` carries VSR-covered frames (`Prepare` / `PrepareOk` +/// / view-change / `Commit`). A `ReplicaForwardFailed` on full +/// inter-shard inbox is recovered by VSR retransmit and is +/// informational. +/// `send_to_client` carries the final Reply payload to the originating +/// client; it has no in-protocol retransmit. A `ClientForwardFailed` is +/// terminal: the client never receives the reply and times out. +/// Operators must size `inbox_capacity` for the worst-case cross-shard +/// reply burst. Each drop is logged at the drop site via `tracing`; +/// alert on those logs today. The `frame_drops_total` +/// `{variant="forward_client_send"}` counter is the structured +/// complement and becomes scrape-able once a per-shard exporter lands. +/// The symmetric `{variant="forward_replica_send"}` counter stays +/// informational only. +/// +/// # Setter install contract +/// +/// The three setters below do NOT share one install rule: +/// - [`Self::set_connection_lost_fn`] is re-installable (`RefCell` +/// slot); it is a test observation hook some setups swap mid-test. +/// - [`Self::set_replica_forward_fn`] and [`Self::set_client_forward_fn`] +/// are one-shot (`OnceCell` slot) and panic on a second install; +/// they are bootstrap-only wiring invariants. +/// +/// A bus impl must preserve this divergence - see each method. pub trait MessageBus { fn send_to_client( &self, @@ -305,15 +437,50 @@ pub trait MessageBus { ) -> impl Future>; /// Install a notifier the bus will invoke when a delegated replica - /// connection dies abnormally. Used by shard-0 bootstrap to push a - /// `ShardFramePayload::ConnectionLost` into shard 0's inbox so the - /// coordinator can forget the replica's mapping. 
- /// - /// Default impl is a no-op for buses that do not delegate real - /// connections (simulator, test doubles); the production - /// `IggyMessageBus` overrides it to wire into its internal - /// `connection_lost_fn` hook. - fn set_connection_lost_fn(&self, _f: ConnectionLostFn) {} + /// connection dies abnormally. Production wiring leaves this unset + /// because the cluster-shared [`ReplicaOwnerTable`] is CAS-cleared + /// inside `IggyMessageBus::notify_connection_lost` before the + /// callback runs, so cross-shard routing recovers without any + /// follow-up message. Test code installs a callback to assert the + /// notifier path fires exactly once per disconnect. + /// + /// No default: every bus must explicitly opt in or stub. A silent + /// no-op default previously masked a wiring bug where shard 0 never + /// learned a delegated replica died. + /// + /// Re-installing is allowed by design - the backing slot is a + /// `RefCell`, so a second call overwrites the first. This is the + /// deliberate exception to the install rules of + /// [`Self::set_replica_forward_fn`] / [`Self::set_client_forward_fn`]: + /// the notifier is a test observation hook some setups swap + /// mid-test, not a one-shot bootstrap invariant. + fn set_connection_lost_fn(&self, f: ConnectionLostFn); + + /// Install the replica-plane inter-shard forward closure. + /// + /// No default: every bus must explicitly opt in or stub. Mirrors the + /// reason `set_connection_lost_fn` is required - a silent no-op + /// default could mask a wiring bug where cross-shard replica sends + /// drop on the floor without anyone noticing. + /// + /// # Panics + /// + /// Panics on a second install on the same bus. Unlike + /// [`Self::set_connection_lost_fn`], the forward closure is a + /// one-shot bootstrap invariant backed by a `OnceCell`, so a + /// double-install is a wiring bug rather than a supported re-bind. 
+ fn set_replica_forward_fn(&self, f: ReplicaForwardFn); + + /// Install the client-plane inter-shard forward closure. + /// + /// No default: every bus must explicitly opt in or stub. Same + /// rationale as [`Self::set_replica_forward_fn`]. + /// + /// # Panics + /// + /// Panics on a second install on the same bus, same one-shot + /// `OnceCell` invariant as [`Self::set_replica_forward_fn`]. + fn set_client_forward_fn(&self, f: ClientForwardFn); } /// Production message bus backed by real TCP connections. @@ -330,7 +497,10 @@ pub trait MessageBus { /// - `send_to_*` clones the per-peer `Sender` out of the registry, calls /// `try_send` on it, and returns. No `.await` happens in the body. Drops /// on `Full` (returned as [`SendError::Backpressure`]) are recovered by -/// VSR retransmission. +/// VSR retransmission for replica-bound consensus traffic. Client Reply +/// drops returned as [`SendError::ClientForwardFailed`] have no +/// retransmit and are terminal; see the trait-level "Recovery +/// asymmetry" section on [`MessageBus`]. /// - The per-connection writer task batches up to `config.max_batch` messages into /// a single `writev` syscall on the plaintext TCP plane (see /// [`transports::tcp`]). @@ -345,25 +515,30 @@ pub struct IggyMessageBus { replicas: ReplicaRegistry, background_tasks: RefCell>>, config: MessageBusConfig, - /// For each replica, the shard that owns the TCP connection from this - /// shard's perspective. Present on ALL shards (owning and non-owning). - /// On the owning shard, `shard_mapping[replica] == self.shard_id`. - shard_mapping: RefCell>, /// Forwards a replica-bound message to the shard that owns the replica's - /// TCP connection. `None` on single-shard setups and tests. - replica_forward_fn: RefCell>, + /// TCP connection. Empty on single-shard setups and tests. Installed + /// once at bootstrap via [`Self::set_replica_forward_fn`]; the slow + /// path reads through [`OnceCell::get`] without a runtime borrow. 
+ replica_forward_fn: OnceCell, /// Forwards a client-bound message to the shard that owns the client's /// TCP connection (the owning shard is encoded in the top 16 bits of - /// the client id). `None` on single-shard setups and tests. - client_forward_fn: RefCell>, - /// Invoked by a delegated replica connection's writer / reader tasks - /// when they exit abnormally. `None` when running without a shard-0 - /// coordinator (single-shard deployments and tests). + /// the client id). Empty on single-shard setups and tests. Installed + /// once at bootstrap via [`Self::set_client_forward_fn`]; the slow + /// path reads through [`OnceCell::get`] without a runtime borrow. + client_forward_fn: OnceCell, + /// Optional disconnect notifier invoked by + /// `Self::notify_connection_lost` after it CAS-clears the owner + /// table. Production leaves this unset; tests install a callback to + /// observe the path. connection_lost_fn: RefCell>, /// Per-connection metadata exposed to the caller via /// [`Self::client_meta`]. Populated by the install path on /// successful registry insert; removed on connection teardown. client_meta: RefCell>>, + /// Cluster-shared atomic owner table. Stamped by the install path + /// after a successful registry insert; CAS-cleared by + /// `Self::notify_connection_lost`. See [`ReplicaOwnerTable`]. + owner_table: Arc, } impl IggyMessageBus { @@ -408,6 +583,19 @@ impl IggyMessageBus { Self::with_tunables(shard_id, MessageBusConfig::from(cfg)) } + /// Production constructor for clusters: same as [`Self::with_config`] + /// but takes a pre-allocated [`ReplicaOwnerTable`] Arc. Bootstrap + /// allocates one table per cluster and clones the Arc into every + /// shard so all buses see the same atomic slots. 
+ #[must_use] + pub fn with_config_and_owner_table( + shard_id: u16, + cfg: &ServerNgConfig, + owner_table: Arc, + ) -> Self { + Self::with_tunables_and_owner_table(shard_id, MessageBusConfig::from(cfg), owner_table) + } + /// Construct a bus from already-derived runtime tunables. /// /// Used by the public constructors above and by tests that need to @@ -420,6 +608,24 @@ impl IggyMessageBus { /// `config.max_batch > IOV_MAX_LIMIT`. #[must_use] pub fn with_tunables(shard_id: u16, config: MessageBusConfig) -> Self { + Self::with_tunables_and_owner_table(shard_id, config, Arc::new(ReplicaOwnerTable::new())) + } + + /// Construct a bus with explicit runtime tunables and a pre-allocated + /// owner table. Cluster bootstrap uses this so every shard's bus + /// shares the same atomic slots; tests use [`Self::with_tunables`] + /// which allocates a fresh table per bus. + /// + /// # Panics + /// + /// Panics if `config.max_batch == 0` or + /// `config.max_batch > IOV_MAX_LIMIT`. + #[must_use] + pub fn with_tunables_and_owner_table( + shard_id: u16, + config: MessageBusConfig, + owner_table: Arc, + ) -> Self { assert!( config.max_batch > 0 && config.max_batch <= IOV_MAX_LIMIT, "MessageBusConfig::max_batch must be in 1..={IOV_MAX_LIMIT} (writev IOV_MAX/2 cap); got {}", @@ -434,14 +640,36 @@ impl IggyMessageBus { replicas: ReplicaRegistry::new(), background_tasks: RefCell::new(Vec::new()), config, - shard_mapping: RefCell::new(HashMap::new()), - replica_forward_fn: RefCell::new(None), - client_forward_fn: RefCell::new(None), + replica_forward_fn: OnceCell::new(), + client_forward_fn: OnceCell::new(), connection_lost_fn: RefCell::new(None), client_meta: RefCell::new(ahash::AHashMap::new()), + owner_table, } } + /// Clone of the cluster-shared owner table Arc. + #[must_use] + pub fn owner_table(&self) -> Arc { + Arc::clone(&self.owner_table) + } + + /// Stamp this bus' shard id into the owner table for `replica_id`. 
+ /// Called by [`installer::install_replica_conn`] after a successful + /// registry insert. + pub fn mark_replica_owned(&self, replica_id: u8) { + self.owner_table.set(replica_id, self.shard_id); + } + + /// CAS-clear the owner-table slot for `replica_id` if its current + /// owner is this bus' shard id. The CAS prevents a stale post-loop + /// from clobbering a slot that has since been re-registered on a + /// different shard. + pub fn clear_replica_owned(&self, replica_id: u8) -> bool { + self.owner_table + .clear_if_owned_by(replica_id, self.shard_id) + } + /// Look up the per-connection metadata recorded for `client_id`. /// /// Returns `None` if the client never connected on this bus or if @@ -472,6 +700,31 @@ impl IggyMessageBus { /// [`Self::set_connection_lost_fn`] (which takes a `borrow_mut`) /// without tripping the runtime borrow check. pub(crate) fn notify_connection_lost(&self, replica_id: u8) { + // Stand down if a live registry entry exists for this replica. A + // reconnect that round-robined back to this same shard installs a + // fresh entry before this stale post-loop runs; the owner-table + // CAS in `clear_replica_owned` only guards *different*-shard + // re-registration, so a same-shard reinstall leaves the slot + // == self.shard_id and the clear below would clobber the live + // connection's slot, stranding cross-shard `send_to_replica` + // forever. Every caller removes its own registry entry before + // invoking this, so a live entry here is always a newer install + // that now owns the slot. No token needed: the bus is + // single-threaded (compio), so this check, the clear, and a + // competing `install_replica_conn` cannot interleave. + if self.replicas.contains(replica_id) { + return; + } + // CAS-clear first so any reader racing the user callback already + // sees the slot vacated. The CAS no-ops if a different shard + // already owns the slot (post-loop arriving after a + // re-installation on another shard). 
+ self.clear_replica_owned(replica_id); + tracing::warn!( + shard_id = self.shard_id, + replica_id, + "replica connection lost" + ); let cb = self .connection_lost_fn .borrow() @@ -493,11 +746,14 @@ impl IggyMessageBus { /// /// # Panics /// - /// Panics on re-entrant `RefCell::borrow_mut` if called from inside - /// an in-flight forward-closure invocation. All production call sites - /// are bootstrap only (single-threaded compio, no re-entry). - pub fn set_replica_forward_fn(&self, f: ShardForwardFn) { - *self.replica_forward_fn.borrow_mut() = Some(f); + /// Panics if called more than once on the same bus. The bootstrap is + /// the sole caller and runs exactly once per bus; a second install + /// signals a wiring bug we want to surface loudly. + pub fn set_replica_forward_fn(&self, f: ReplicaForwardFn) { + self.replica_forward_fn + .set(f) + .ok() + .expect("replica_forward_fn installed twice (bootstrap invariant)"); } /// Install the client-plane inter-shard forward closure. @@ -510,41 +766,23 @@ impl IggyMessageBus { /// /// # Panics /// - /// Panics on re-entrant `RefCell::borrow_mut` if called from inside - /// an in-flight forward-closure invocation. - pub fn set_client_forward_fn(&self, f: ShardForwardFn) { - *self.client_forward_fn.borrow_mut() = Some(f); - } - - /// Update the shard mapping for a replica. - /// - /// Called when the allocation strategy assigns or reassigns connections. - /// - /// # Panics - /// - /// Panics on re-entrant `RefCell::borrow_mut` if called while - /// [`IggyMessageBus::owning_shard`] (or any other read-side of - /// `shard_mapping`) holds an outstanding read borrow on the same - /// bus instance. - pub fn set_shard_mapping(&self, replica: u8, owning_shard: u16) { - self.shard_mapping - .borrow_mut() - .insert(replica, owning_shard); + /// Panics if called more than once on the same bus. Same rationale as + /// [`Self::set_replica_forward_fn`]. 
+ pub fn set_client_forward_fn(&self, f: ClientForwardFn) { + self.client_forward_fn + .set(f) + .ok() + .expect("client_forward_fn installed twice (bootstrap invariant)"); } - /// Remove all shard mappings for a replica (on deallocation). + /// Get the owning shard for a replica, if any. /// - /// # Panics - /// - /// Same re-entrant-`borrow_mut` constraint as - /// [`IggyMessageBus::set_shard_mapping`]. - pub fn remove_shard_mapping(&self, replica: u8) { - self.shard_mapping.borrow_mut().remove(&replica); - } - - /// Get the owning shard for a replica, if mapped. + /// Reads the cluster-shared [`ReplicaOwnerTable`], which is stamped + /// synchronously by the owning shard's installer and CAS-cleared on + /// disconnect inside `Self::notify_connection_lost`. + #[must_use] pub fn owning_shard(&self, replica: u8) -> Option { - self.shard_mapping.borrow().get(&replica).copied() + self.owner_table.owner(replica) } #[must_use] @@ -713,6 +951,14 @@ impl MessageBus for std::rc::Rc { fn set_connection_lost_fn(&self, f: ConnectionLostFn) { (**self).set_connection_lost_fn(f); } + + fn set_replica_forward_fn(&self, f: ReplicaForwardFn) { + (**self).set_replica_forward_fn(f); + } + + fn set_client_forward_fn(&self, f: ClientForwardFn) { + (**self).set_client_forward_fn(f); + } } #[allow(clippy::future_not_send)] @@ -737,11 +983,12 @@ impl MessageBus for IggyMessageBus { Err(_msg) => Err(SendError::ClientNotFound(client_id)), }; } - let forward = self.client_forward_fn.borrow(); - let forward = forward - .as_ref() + let forward = self + .client_forward_fn + .get() .ok_or(SendError::ClientRouteMissing(client_id))?; - forward(owning_shard, message).map_err(|_| SendError::ClientForwardFailed(client_id)) + forward(client_id, owning_shard, message) + .map_err(|_| SendError::ClientForwardFailed(client_id)) } async fn send_to_replica( @@ -760,22 +1007,43 @@ impl MessageBus for IggyMessageBus { Err(message) => message, }; // Slow path: route via the inter-shard channel to the owning 
shard. + // The cluster-shared `owner_table` is the authoritative routing + // view: `mark_replica_owned` stamps it on install and + // `clear_replica_owned` CAS-clears it on disconnect. let owning_shard = self - .shard_mapping - .borrow() - .get(&replica) - .copied() + .owner_table + .owner(replica) .ok_or(SendError::ReplicaNotConnected(replica))?; - let forward = self.replica_forward_fn.borrow(); - let forward = forward - .as_ref() + // The fast path already failed (no live registry slot), so a + // self-owner here means the disconnect is mid-flight: + // `close_peer_if_token_matches` took the slot but + // `clear_replica_owned` has not yet run. Forwarding now would + // `try_send` a `ForwardReplicaSend` onto our own inbox and the + // pump would re-enter this path - a self-inbox ping-pong on the + // consensus hot path. Treat as not connected, mirroring the + // `send_to_client` self-owner guard. + if owning_shard == self.shard_id { + return Err(SendError::ReplicaNotConnected(replica)); + } + let forward = self + .replica_forward_fn + .get() .ok_or(SendError::ReplicaRouteMissing(replica))?; - forward(owning_shard, message).map_err(|_| SendError::ReplicaForwardFailed(replica)) + forward(replica, owning_shard, message) + .map_err(|_| SendError::ReplicaForwardFailed(replica)) } fn set_connection_lost_fn(&self, f: ConnectionLostFn) { Self::set_connection_lost_fn(self, f); } + + fn set_replica_forward_fn(&self, f: ReplicaForwardFn) { + Self::set_replica_forward_fn(self, f); + } + + fn set_client_forward_fn(&self, f: ClientForwardFn) { + Self::set_client_forward_fn(self, f); + } } /// Extract the owning shard from a client id. 
@@ -823,16 +1091,20 @@ mod tests { let bus = IggyMessageBus::new(5); let captured: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); let captured_clone = captured.clone(); - bus.set_client_forward_fn(Box::new(move |target, _msg| { + let client_id = (7u128 << 112) | 0x2a; + let captured_id: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); + let captured_id_clone = captured_id.clone(); + bus.set_client_forward_fn(Box::new(move |id, target, _msg| { + *captured_id_clone.borrow_mut() = Some(id); *captured_clone.borrow_mut() = Some(target); Ok(()) })); - let client_id = (7u128 << 112) | 0x2a; bus.send_to_client(client_id, dummy_message().into_frozen()) .await .expect("forward_fn should accept"); assert_eq!(*captured.borrow(), Some(7)); + assert_eq!(*captured_id.borrow(), Some(client_id)); } #[compio::test] @@ -863,21 +1135,25 @@ mod tests { #[compio::test] #[allow(clippy::future_not_send)] - async fn send_to_replica_slow_path_uses_shard_mapping_and_forwards() { + async fn send_to_replica_slow_path_uses_owner_table_and_forwards() { let bus = IggyMessageBus::new(0); let captured: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); let captured_clone = captured.clone(); - bus.set_replica_forward_fn(Box::new(move |target, _msg| { + let captured_id: std::rc::Rc>> = std::rc::Rc::new(RefCell::new(None)); + let captured_id_clone = captured_id.clone(); + bus.set_replica_forward_fn(Box::new(move |id, target, _msg| { + *captured_id_clone.borrow_mut() = Some(id); *captured_clone.borrow_mut() = Some(target); Ok(()) })); - bus.set_shard_mapping(5, 3); + bus.owner_table().set(5, 3); bus.send_to_replica(5, dummy_message().into_frozen()) .await .expect("forward ok"); assert_eq!(*captured.borrow(), Some(3)); + assert_eq!(*captured_id.borrow(), Some(5)); } #[compio::test] @@ -891,6 +1167,24 @@ mod tests { assert!(matches!(err, SendError::ReplicaNotConnected(9))); } + /// Disconnect-race window: the registry slot is already gone (fast + /// path misses) but `owner_table` still 
points at this shard because + /// `clear_replica_owned` has not run yet. The slow path must treat a + /// self-owner as not connected rather than forwarding the frame onto + /// its own inbox - mirrors the `send_to_client` self-owner guard. + #[compio::test] + #[allow(clippy::future_not_send)] + async fn send_to_replica_slow_path_self_owner_returns_not_connected() { + let bus = IggyMessageBus::new(3); + bus.owner_table().set(5, 3); + + let err = bus + .send_to_replica(5, dummy_message().into_frozen()) + .await + .unwrap_err(); + assert!(matches!(err, SendError::ReplicaNotConnected(5))); + } + #[compio::test] #[allow(clippy::future_not_send)] async fn client_id_owning_shard_extracts_top_16_bits() { @@ -990,6 +1284,43 @@ mod tests { assert_eq!(counter.get(), 11); } + /// Same-shard reconnect race: a stale post-loop from a dead connection + /// must not CAS-clear the owner-table slot that a live reinstall + /// (round-robined back to this shard) just stamped. The slot is only + /// released once no live registry entry remains. + #[compio::test] + #[allow(clippy::future_not_send)] + async fn notify_connection_lost_stands_down_when_replica_reregistered() { + let bus = IggyMessageBus::new(3); + + // Live reinstall: registry entry present + owner table stamped. + let (tx, _rx) = async_channel::bounded(8); + let writer = compio::runtime::spawn(async {}); + let reader = compio::runtime::spawn(async {}); + let (conn_shutdown, _conn_token) = Shutdown::new(); + bus.replicas() + .insert(7, tx, writer, reader, conn_shutdown) + .expect("insert ok"); + bus.mark_replica_owned(7); + + // Stale predecessor post-loop fires while the reinstall is live. + bus.notify_connection_lost(7); + assert_eq!( + bus.owning_shard(7), + Some(3), + "stale post-loop cleared a re-registered replica's owner slot", + ); + + // Genuine disconnect: no live entry, the slot is released. 
+ bus.replicas().remove(7); + bus.notify_connection_lost(7); + assert_eq!( + bus.owning_shard(7), + None, + "owner slot must clear once no live connection remains", + ); + } + #[compio::test] #[allow(clippy::future_not_send)] async fn shutdown_loop_drains_tasks_added_during_shutdown() { diff --git a/core/message_bus/src/transports/mod.rs b/core/message_bus/src/transports/mod.rs index fab08207c9..a141ba04b6 100644 --- a/core/message_bus/src/transports/mod.rs +++ b/core/message_bus/src/transports/mod.rs @@ -70,7 +70,7 @@ //! - fd-delegation (`F_DUPFD_CLOEXEC`) stays TCP-only and lives outside //! this trait (see [`crate::fd_transfer`]). Other transports terminate //! on shard 0 and forward `Frozen` over the inter-shard -//! flume via [`crate::ShardForwardFn`]. +//! crossfire channel via [`crate::ReplicaForwardFn`] / [`crate::ClientForwardFn`]. //! //! Per-transport impls live in //! `transports/{tcp,tcp_tls,ws,wss,quic}.rs`. diff --git a/core/metadata/src/impls/metadata.rs b/core/metadata/src/impls/metadata.rs index 3e5487b488..047b54745b 100644 --- a/core/metadata/src/impls/metadata.rs +++ b/core/metadata/src/impls/metadata.rs @@ -303,11 +303,18 @@ impl std::fmt::Display for RegisterSubmitError { impl std::error::Error for RegisterSubmitError {} pub struct IggyMetadata { - /// Some on shard0, None on other shards + /// `Some` on shard 0, `None` on other shards. Server-ng bootstrap + /// holds the invariant: only shard 0 owns the metadata consensus + /// replica; every other shard reconstructs `mux_stm` from the + /// snapshot + replayed journal during recovery and then drops both. pub consensus: Option, - /// Some on shard0, None on other shards + /// `Some` on shard 0, `None` on other shards. 
Shard 0 owns the WAL + /// writer (via `PrepareJournal::open`); non-owning shards open the + /// WAL read-only during recovery for replay only and must drop the + /// handle afterwards so a `drain()` rename on shard 0 cannot strand + /// a stale inode visible through their RO fd. pub journal: Option, - /// Some on shard0, None on other shards + /// `Some` on shard 0, `None` on other shards. pub snapshot: Option, /// State machine - lives on all shards pub mux_stm: M, @@ -705,9 +712,13 @@ where .send_to_client(prepare_header.client, reply_buffers) .await { - warn!( - "on_ack: failed to send reply to client={}: {e}", - prepare_header.client + error!( + client = prepare_header.client, + op = prepare_header.op, + request_id = prepare_header.request, + operation = ?prepare_header.operation, + %e, + "client reply forward failed, no retransmit path; client will time out", ); } } diff --git a/core/metadata/src/impls/recovery.rs b/core/metadata/src/impls/recovery.rs index 3d8d0d436c..06987f062d 100644 --- a/core/metadata/src/impls/recovery.rs +++ b/core/metadata/src/impls/recovery.rs @@ -98,6 +98,9 @@ pub struct RecoveredMetadata { /// 4. Replay journal entries from the first post-snapshot op through the state machine /// 5. Return the assembled `RecoveredMetadata` /// +/// Only the owning shard (shard 0) should call this. Peer shards receive +/// a `ReadHandleFactory` bundle from shard 0 and skip WAL access entirely. +/// /// # Errors /// Returns `RecoveryError` if snapshot loading, journal opening, or replay fails. #[allow(clippy::future_not_send)] @@ -110,7 +113,6 @@ where let metadata_dir = data_dir.join(super::METADATA_DIR); std::fs::create_dir_all(&metadata_dir)?; - // 1. Load snapshot if present. let snapshot_path = metadata_dir.join("snapshot.bin"); let snapshot = if snapshot_path.exists() { Some(IggySnapshot::load(&snapshot_path)?) @@ -121,24 +123,17 @@ where .as_ref() .map_or(0, |snapshot| snapshot.sequence_number() + 1); - // 2. 
Restore state machine from snapshot, or start from a fresh empty state. let mux_stm = match snapshot.as_ref() { Some(snapshot) => M::restore_snapshot(snapshot.snapshot())?, None => M::default(), }; - // 3. Open journal, scan the WAL and build index let journal_path = metadata_dir.join("journal.wal"); - let journal = PrepareJournal::open( - &journal_path, - snapshot.as_ref().map_or(0, IggySnapshot::sequence_number), - ) - .await?; - - // 4. Replay journal entries after snapshot. - // Intentional fail-fast for now: if replay hits a bad entry, recovery - // aborts and the operator must repair or truncate the WAL before the - // node can boot again. + let watermark = snapshot.as_ref().map_or(0, IggySnapshot::sequence_number); + let journal = PrepareJournal::open(&journal_path, watermark).await?; + + // Intentional fail-fast: a bad entry aborts recovery and the operator + // must repair or truncate the WAL before the node can boot again. let headers_to_replay = journal.iter_headers_from(replay_from); let mut last_applied_op: Option = None; diff --git a/core/metadata/src/stm/mod.rs b/core/metadata/src/stm/mod.rs index bde07b02e6..9e5eb2e243 100644 --- a/core/metadata/src/stm/mod.rs +++ b/core/metadata/src/stm/mod.rs @@ -23,11 +23,13 @@ pub mod user; use bytes::Bytes; use iggy_common::Either; -use left_right::{Absorb, ReadHandle, WriteHandle}; +use left_right::{Absorb, ReadHandle, ReadHandleFactory, WriteHandle}; use std::cell::Cell; use std::cell::UnsafeCell; use std::sync::Arc; +pub use left_right::ReadHandleFactory as LeftRightFactory; + pub struct WriteCell where T: Absorb, @@ -113,6 +115,30 @@ where let guard = self.read.enter().expect("read handle should be accessible"); f(&*guard) } + + /// Produce a `Send + Sync` factory that can mint additional [`ReadHandle`]s + /// on other threads. Use this to share the read side across compio shard + /// runtimes without sharing the (`!Sync`) [`ReadHandle`] itself. 
+ #[must_use] + pub fn factory(&self) -> ReadHandleFactory { + self.read.factory() + } + + /// Build a reader-only [`LeftRight`] from a factory minted on the owning + /// thread. The new [`ReadHandle`] is materialised here (the caller's + /// thread), keeping the `!Sync` discipline local. `write` stays `None`, + /// so `do_apply` panics if invoked on a reader-only instance. + /// + /// Takes the factory by value so the constructor reads as ownership + /// transfer from the owning shard to the peer-shard reader. + #[must_use] + #[allow(clippy::needless_pass_by_value)] + pub fn from_factory(factory: ReadHandleFactory) -> Self { + Self { + write: None, + read: Arc::new(factory.handle()), + } + } } impl From for LeftRight::Cmd> @@ -256,6 +282,41 @@ macro_rules! define_state { [<$state Inner>]::new().into() } } + + impl $state { + /// Mint a `Send + Sync` [`ReadHandleFactory`] for this state. + /// Allows the read side to be carried across shard threads + /// without sharing the underlying `!Sync` [`ReadHandle`]. + #[must_use] + pub fn factory(&self) -> $crate::stm::LeftRightFactory<[<$state Inner>]> { + self.inner.factory() + } + + /// Construct a reader-only state wrapper from a factory minted + /// by the writer-side shard. The thread that calls this owns + /// the resulting [`ReadHandle`]; calling `apply` on the + /// returned wrapper panics because `write` is `None`. 
+ #[must_use] + pub fn from_factory( + factory: $crate::stm::LeftRightFactory<[<$state Inner>]>, + ) -> Self { + Self { + inner: $crate::stm::LeftRight::from_factory(factory), + } + } + } + + impl $crate::stm::mux::WithFactory for $state { + type Bundle = $crate::stm::LeftRightFactory<[<$state Inner>]>; + + fn factory_bundle(&self) -> Self::Bundle { + self.factory() + } + + fn from_factory_bundle(bundle: Self::Bundle) -> Self { + Self::from_factory(bundle) + } + } } }; } diff --git a/core/metadata/src/stm/mux.rs b/core/metadata/src/stm/mux.rs index 8097cc750d..c9a0d1e6a8 100644 --- a/core/metadata/src/stm/mux.rs +++ b/core/metadata/src/stm/mux.rs @@ -22,6 +22,49 @@ use iggy_common::variadic; use crate::stm::{State, StateMachine}; +/// Cross-thread read-side handoff for a state machine. +/// +/// The owning shard (typically shard 0) materialises a `Bundle` after +/// recovering the WAL and hands one clone to every peer shard. Peer +/// shards then call [`WithFactory::from_factory_bundle`] on their own +/// runtime to construct a reader-mode state machine — no WAL replay, +/// no shared `!Sync` `ReadHandle`, just `Send + Sync` factories per +/// state. +/// +/// The variadic tuple impl below recurses over the state list, so a +/// `MuxStateMachine` ends up +/// with `Bundle = (Users::Bundle, (Streams::Bundle, (ConsumerGroups::Bundle, ())))`. 
+pub trait WithFactory: Sized { + type Bundle: Clone + Send + Sync; + fn factory_bundle(&self) -> Self::Bundle; + fn from_factory_bundle(bundle: Self::Bundle) -> Self; +} + +impl WithFactory for () { + type Bundle = (); + fn factory_bundle(&self) -> Self::Bundle {} + fn from_factory_bundle((): Self::Bundle) -> Self {} +} + +impl WithFactory for variadic!(S, ...Rest) +where + S: WithFactory, + Rest: WithFactory, +{ + type Bundle = (S::Bundle, Rest::Bundle); + + fn factory_bundle(&self) -> Self::Bundle { + (self.0.factory_bundle(), self.1.factory_bundle()) + } + + fn from_factory_bundle(bundle: Self::Bundle) -> Self { + ( + S::from_factory_bundle(bundle.0), + Rest::from_factory_bundle(bundle.1), + ) + } +} + // MuxStateMachine that proxies to an tuple of variadic state machines #[derive(Debug)] pub struct MuxStateMachine @@ -55,6 +98,26 @@ where } } +impl MuxStateMachine +where + T: StateMachine + WithFactory, +{ + /// Mint a factory bundle from the owning state machine so peer shards + /// can rebuild a reader-mode mirror on their own runtimes. + #[must_use] + pub fn factory_bundle(&self) -> T::Bundle { + self.inner.factory_bundle() + } + + /// Reconstruct a reader-mode `MuxStateMachine` from a bundle minted + /// on the writer-side shard. Must be invoked on the destination + /// runtime so the inner `!Sync` `ReadHandle`s are created in place. 
+ #[must_use] + pub fn from_factory_bundle(bundle: T::Bundle) -> Self { + Self::new(T::from_factory_bundle(bundle)) + } +} + impl StateMachine for MuxStateMachine where T: StateMachine, diff --git a/core/partitions/src/iggy_partition.rs b/core/partitions/src/iggy_partition.rs index 4a82aaec8a..f57bd790bd 100644 --- a/core/partitions/src/iggy_partition.rs +++ b/core/partitions/src/iggy_partition.rs @@ -1416,14 +1416,14 @@ where .send_to_client(prepare_header.client, reply_buffers) .await { - warn!( + tracing::error!( target: "iggy.partitions.diag", plane = "partitions", client = prepare_header.client, op = prepare_header.op, namespace_raw, %error, - "failed to send reply to client" + "client reply forward failed, no retransmit path; client will time out", ); } } diff --git a/core/server-ng/Cargo.toml b/core/server-ng/Cargo.toml index d11700a198..3b130fdb4f 100644 --- a/core/server-ng/Cargo.toml +++ b/core/server-ng/Cargo.toml @@ -43,7 +43,6 @@ ignored = [ "err_trail", "error_set", "figlet-rs", - "flume", "fs2", "hash32", "human-repr", @@ -105,6 +104,7 @@ clap = { workspace = true } compio = { workspace = true } configs = { workspace = true } consensus = { workspace = true } +crossfire = { workspace = true } ctrlc = { workspace = true } cyper = { workspace = true } cyper-axum = { workspace = true } @@ -113,7 +113,6 @@ dotenvy = { workspace = true } err_trail = { workspace = true } error_set = { workspace = true } figlet-rs = { workspace = true } -flume = { workspace = true } fs2 = { workspace = true } hash32 = { workspace = true } human-repr = { workspace = true } diff --git a/core/server-ng/config.toml b/core/server-ng/config.toml index 7d8b51123a..77261181c2 100644 --- a/core/server-ng/config.toml +++ b/core/server-ng/config.toml @@ -589,7 +589,8 @@ ports = { tcp = 8091, quic = 8081, http = 3001, websocket = 8093, tcp_replica = # - numa settings: # + "numa:auto": Use all available numa node, cores # + "numa:nodes=0,1;cores=4;no_ht=true": Use NUMA node 0 and 1, each 
nodes use 4 cores, and no hyperthreads -cpu_allocation = "numa:auto" +# TODO(hubcio): revert to "numa:auto" once multi-shard server-ng is stable. +cpu_allocation = 1 [websocket] enabled = true diff --git a/core/server-ng/src/bootstrap.rs b/core/server-ng/src/bootstrap.rs index 816a464f5e..9f61db9efc 100644 --- a/core/server-ng/src/bootstrap.rs +++ b/core/server-ng/src/bootstrap.rs @@ -20,6 +20,7 @@ use crate::config_writer::write_current_config; use crate::server_error::ServerNgError; use configs::server_ng::ServerNgConfig; +use configs::sharding::INBOX_CAPACITY_MAX; use consensus::{LocalPipeline, PartitionsHandle, Sequencer, VsrConsensus}; use iggy_binary_protocol::RequestHeader; use iggy_common::sharding::{IggyNamespace, PartitionLocation, ShardId}; @@ -30,9 +31,7 @@ use iggy_common::{ use journal::Journal; use journal::prepare_journal::PrepareJournal; use message_bus::client_listener::{self, RequestHandler}; -use message_bus::fd_transfer; use message_bus::installer; -use message_bus::installer::ConnectionInstaller; use message_bus::installer::conn_info::{ClientConnMeta, ClientTransportKind}; use message_bus::replica::io as replica_io; use message_bus::replica::listener::{self as replica_listener, MessageHandler}; @@ -42,13 +41,14 @@ use message_bus::transports::tls::{ }; use message_bus::{ AcceptedClientFn, AcceptedQuicClientFn, AcceptedReplicaFn, AcceptedTlsClientFn, - AcceptedWsClientFn, IggyMessageBus, connector, + AcceptedWsClientFn, IggyMessageBus, ReplicaOwnerTable, connector, }; use metadata::IggyMetadata; use metadata::MuxStateMachine; use metadata::impls::metadata::{IggySnapshot, StreamsFrontend}; use metadata::impls::recovery::recover; use metadata::stm::consumer_group::ConsumerGroups; +use metadata::stm::mux::WithFactory; use metadata::stm::snapshot::Snapshot; use metadata::stm::stream::{Partition, Streams}; use metadata::stm::user::Users; @@ -56,38 +56,53 @@ use partitions::{ IggyIndexWriter, IggyPartition, IggyPartitions, MessagesWriter, 
PartitionsConfig, Segment, }; // TODO: decouple bootstrap/storage helpers and logging from the `server` crate. -use server::bootstrap::create_directories; +use server::bootstrap::{create_directories, create_shard_executor}; use server::log::logger::Logging; +use server::shard_allocator::{ShardAllocator, ShardInfo}; use server::streaming::partitions::storage::{load_consumer_group_offsets, load_consumer_offsets}; use server::streaming::segments::storage::create_segment_storage; use shard::builder::IggyShardBuilder; -use shard::shards_table::PapayaShardsTable; +use shard::metrics::ShardMetrics; +use shard::shards_table::{PapayaShardsTable, calculate_shard_assignment}; use shard::{ - CoordinatorConfig, IggyShard, PartitionConsensusConfig, ShardIdentity, channel, shard_channel, + CoordinatorConfig, IggyShard, PartitionConsensusConfig, Receiver as ShardReceiver, ShardFrame, + ShardIdentity, TaggedSender, channel, shard_cluster_channels, }; -use std::cell::{Cell, RefCell}; -use std::future::Future; +use std::cell::RefCell; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; use std::rc::{Rc, Weak}; use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::thread; +use std::time::Duration; use tracing::{error, info, warn}; const CLUSTER_ID: u128 = 1; -const SHARD_ID: u16 = 0; const SHARD_REPLICA_ID: u8 = 0; -const SHARD_NAME: &str = "server-ng-shard-0"; -const SHARD_INBOX_CAPACITY: usize = 1024; +/// Bus shutdown drain timeout. Picked larger than typical TCP RTT × +/// in-flight write-batch to give writers their full last +/// `write_vectored_all` budget before the connection registry kicks in. +const SHARD_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(10); +/// Watchdog poll cadence for the cross-thread shutdown flag. 50ms keeps +/// Ctrl-C latency operator-visible without measurable wakeup overhead. 
+const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(50); type ServerNgMuxStateMachine = MuxStateMachine; + +/// Cross-thread bundle carrying one `ReadHandleFactory` per metadata +/// state. Shard 0 mints one after `recover()` and broadcasts a clone to +/// every peer shard; each peer rebuilds a reader-mode +/// [`ServerNgMuxStateMachine`] on its own runtime, skipping the WAL. +type ServerNgMetadataBundle = ::Bundle; + type ServerNgMetadata = IggyMetadata< VsrConsensus>, PrepareJournal, IggySnapshot, ServerNgMuxStateMachine, >; -type ServerNgShard = IggyShard< +pub type ServerNgShard = IggyShard< Rc, PrepareJournal, IggySnapshot, @@ -97,6 +112,149 @@ type ServerNgShard = IggyShard< type ServerNgShardHandle = Rc>>>; +/// Result of a multi-shard bootstrap. +/// +/// Carries the cross-thread shutdown flag and one OS-thread `JoinHandle` +/// per shard. The caller flips the flag via [`Self::install_ctrlc_handler`] +/// and then drains the cluster via [`Self::join_all`]. +pub struct ClusterHandles { + shutdown_flag: Arc, + shard_threads: Vec<(u16, thread::JoinHandle>)>, +} + +impl ClusterHandles { + /// Install a SIGINT/Ctrl-C handler that flips the shutdown flag on + /// the first signal. A second signal is logged but otherwise + /// ignored so an in-flight WAL fsync or replica drain runs to + /// completion. + /// + /// # Errors + /// + /// Returns the underlying `ctrlc::Error` if the handler cannot be + /// installed (typically because another handler already owns the + /// signal). + pub fn install_ctrlc_handler(&self) -> Result<(), ctrlc::Error> { + let flag = Arc::clone(&self.shutdown_flag); + ctrlc::set_handler(move || { + if flag.swap(true, Ordering::Relaxed) { + // Second Ctrl-C: leave the shutdown machinery to drain. + // Refusing to abort here keeps the WAL fsync / replica + // drain from being interrupted mid-frame. 
+ warn!("second Ctrl-C ignored; cluster is already shutting down"); + } else { + info!("Ctrl-C received; signalling cluster shutdown"); + } + }) + } + + /// Drain every shard thread, returning the first error observed (a + /// thread-level `Result::Err` or a panic). Clean exits log a single + /// info line per shard. + /// + /// # Errors + /// + /// Returns the first per-shard error: either a `Result::Err` returned + /// by `shard_main` or a `ServerNgError::ShardThreadPanicked` carrying + /// the panicking shard's id. + pub fn join_all(self) -> Result<(), ServerNgError> { + let mut first_error: Option = None; + for (shard_id, handle) in self.shard_threads { + match handle.join() { + Ok(Ok(())) => { + info!(shard_id, "shard thread exited cleanly"); + } + Ok(Err(error)) => { + error!(shard_id, error = %error, "shard thread returned error"); + if first_error.is_none() { + first_error = Some(error); + } + } + Err(panic_payload) => { + let message = panic_payload_to_string(&*panic_payload); + error!(shard_id, message = %message, "shard thread panicked"); + if first_error.is_none() { + first_error = + Some(ServerNgError::ShardThreadPanicked { shard_id, message }); + } + } + } + } + first_error.map_or(Ok(()), Err) + } +} + +/// Best-effort extraction of the panic message from a +/// `Box` returned by `JoinHandle::join`. Tries the two +/// payload shapes the standard library guarantees (`&'static str` and +/// `String`) and falls back to a placeholder so the panic still surfaces +/// in the error chain. +fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String { + if let Some(s) = payload.downcast_ref::<&'static str>() { + return (*s).to_string(); + } + if let Some(s) = payload.downcast_ref::() { + return s.clone(); + } + "".to_string() +} + +/// Flips the cross-thread shutdown flag on `Drop` unless disarmed. 
+/// +/// A shard thread that exits via an error `?` or a panic unwind would +/// otherwise leave sibling shards parked forever on `bus.token().wait()`: +/// their watchdogs never observe the flag and the bus has no +/// `Drop`-triggered shutdown. Arming this for the whole thread body makes +/// every non-clean exit drive sibling-shard teardown. Disarmed only on a +/// clean `Ok(())`. +struct ShutdownOnDrop { + flag: Arc, + armed: bool, +} + +impl ShutdownOnDrop { + const fn new(flag: Arc) -> Self { + Self { flag, armed: true } + } + + const fn disarm(&mut self) { + self.armed = false; + } +} + +impl Drop for ShutdownOnDrop { + fn drop(&mut self) { + if self.armed { + self.flag.store(true, Ordering::Relaxed); + } + } +} + +/// Shard-local end of the metadata bundle handoff. +/// +/// Shard 0 owns the WAL writer and runs `recover()` to build the only +/// `WriteHandle`-bearing [`ServerNgMuxStateMachine`]. It then mints a +/// [`ServerNgMetadataBundle`] (a tuple of `Send + Sync` +/// `ReadHandleFactory`s) and pushes one clone per peer onto `bundle_tx`. +/// Every other shard receives the bundle and rebuilds a reader-mode +/// `MuxStateMachine` on its own runtime - no WAL access, no replay, no +/// `RecoverySync` two-phase fence. Phase 2 of the old handshake was +/// only there to keep peer scans away from shard 0's torn-tail repair; +/// with no peer scan that race is structurally gone. +/// +/// The channel is bounded to the peer count so shard 0's `send` never +/// blocks beyond a peer drain. A peer that dies before recv drops its +/// `bundle_rx`, so shard 0's `send` eventually sees a disconnected +/// channel; the cross-thread shutdown flag drives every waiter out of +/// its `recv` loop if shard 0 panics before broadcasting. 
+enum MetadataHandoff { + Owner { + bundle_tx: crossfire::MAsyncTx>, + }, + Waiter { + bundle_rx: crossfire::MAsyncRx>, + }, +} + struct TcpTopology { self_replica_id: u8, replica_count: u8, @@ -123,58 +281,6 @@ struct BoundClientListeners { quic: Option, } -pub trait RunServerNg { - fn run( - &self, - config: &ServerNgConfig, - current_replica_id: Option, - ) -> impl Future>; -} - -impl RunServerNg for Rc { - /// Run the fully bootstrapped `server-ng` shard. - /// - /// # Errors - /// - /// Returns an error if TCP listener bootstrap fails or cluster TCP - /// addresses cannot be resolved from config. - async fn run( - &self, - config: &ServerNgConfig, - current_replica_id: Option, - ) -> Result<(), ServerNgError> { - let topology = resolve_tcp_topology(config, current_replica_id)?; - let (stop_tx, stop_rx) = channel(1); - let message_pump_shard = Self::clone(self); - let message_pump_handle = compio::runtime::spawn(async move { - message_pump_shard.run_message_pump(stop_rx).await; - }); - self.bus.track_background(message_pump_handle); - - let on_replica_message = make_replica_message_handler(self); - let on_client_request = make_client_request_handler(self); - let accepted_replica = make_local_replica_accept_fn(&self.bus, on_replica_message); - let accepted_client = make_local_client_accept_fns(&self.bus, on_client_request); - - info!( - shard = self.id, - partitions = self.plane.partitions().len(), - "server-ng shard initialized" - ); - - if let Err(error) = - start_tcp_runtime(self, config, &topology, accepted_replica, accepted_client).await - { - let _ = stop_tx.try_send(()); - return Err(error); - } - - self.bus.token().wait().await; - let _ = stop_tx.try_send(()); - Ok(()) - } -} - /// Load config, prepare directories, and complete late logging init. 
/// /// # Errors @@ -205,92 +311,474 @@ pub async fn load_config(logging: &mut Logging) -> Result, -) -> Result, ServerNgError> { - let topology = resolve_tcp_topology(config, current_replica_id)?; - let bus = Rc::new(IggyMessageBus::with_config(SHARD_ID, config)); - let recovered = recover::(Path::new(&config.system.path)) +) -> Result { + let allocator = ShardAllocator::new(&config.system.sharding.cpu_allocation) + .map_err(ServerNgError::ShardAllocator)?; + let assignments = allocator + .to_shard_assignments() + .map_err(ServerNgError::ShardAllocator)?; + let shards_count = assignments.len(); + if shards_count == 0 { + return Err(ServerNgError::InvalidShardsCount { count: 0 }); + } + // Shard ids index `ReplicaOwnerTable` slots as `u16`. `OWNER_NONE` + // (`u16::MAX`) is reserved as the empty-slot sentinel, so a cluster + // with `u16::MAX` shards would mint a shard id that collides with + // the sentinel and an owner-table lookup could never tell that shard + // apart from an unowned slot. Reject at boot so the invariant is held + // by the type system above this line, not by hoping the operator + // never configures 65535 cores worth of shards. + let total_shards = match u16::try_from(shards_count) { + Ok(count) if count < message_bus::OWNER_NONE => count, + _ => { + return Err(ServerNgError::InvalidShardsCount { + count: shards_count, + }); + } + }; + + // Re-check the full valid range, not just the zero floor: a caller + // that built the config without running `ShardingConfig::validate` + // would otherwise OOM at boot allocating an oversized inbox channel. 
+ let inbox_capacity = config.system.sharding.inbox_capacity; + if inbox_capacity == 0 || inbox_capacity > INBOX_CAPACITY_MAX { + return Err(ServerNgError::InvalidInboxCapacity { + value: inbox_capacity, + max: INBOX_CAPACITY_MAX, + }); + } + + let (senders, mut inboxes) = shard_cluster_channels(total_shards, inbox_capacity); + let shutdown_flag = Arc::new(AtomicBool::new(false)); + let config = Arc::new(config); + // One owner table per cluster, Arc-cloned into every shard's bus so + // any shard's bus reads the same atomic slots that the owning + // shard's installer / disconnect path writes. + let owner_table = Arc::new(ReplicaOwnerTable::new()); + + // Single-shot bundle handoff (see `MetadataHandoff`): shard 0 sends + // one cloned `ServerNgMetadataBundle` per peer; each peer drains + // exactly one. Bounded to the peer count so shard 0's broadcast + // never blocks past a peer drain, and so a single-shard cluster + // (zero peers, crossfire clamps capacity to 1) can still allocate + // the channel without minting a useless slot. If a peer dies before + // recv, shard 0's `send` eventually sees a disconnected channel; + // the cross-thread shutdown flag drives every waiter out of its + // recv loop if shard 0 panics before broadcasting. 
+ let metadata_peers = shards_count.saturating_sub(1); + let (metadata_bundle_tx, metadata_bundle_rx) = + crossfire::mpmc::bounded_async::(metadata_peers); + + let mut shard_threads: Vec<(u16, thread::JoinHandle>)> = + Vec::with_capacity(shards_count); + for (idx, assignment) in assignments.into_iter().enumerate() { + #[allow(clippy::cast_possible_truncation)] + let shard_id = idx as u16; + let inbox = inboxes[idx] + .take() + .expect("shard_cluster_channels populates every inbox slot exactly once"); + let senders_for_shard = senders.clone(); + let config_for_shard = Arc::clone(&config); + let shutdown_flag_for_shard = Arc::clone(&shutdown_flag); + let owner_table_for_shard = Arc::clone(&owner_table); + let metadata_handoff_for_shard = if shard_id == 0 { + MetadataHandoff::Owner { + bundle_tx: metadata_bundle_tx.clone(), + } + } else { + MetadataHandoff::Waiter { + bundle_rx: metadata_bundle_rx.clone(), + } + }; + + let handle = match thread::Builder::new() + .name(format!("shard-{shard_id}")) + .spawn(move || -> Result<(), ServerNgError> { + run_shard_thread( + shard_id, + total_shards, + current_replica_id, + assignment, + senders_for_shard, + inbox, + config_for_shard, + shutdown_flag_for_shard, + metadata_handoff_for_shard, + owner_table_for_shard, + ) + }) { + Ok(handle) => handle, + Err(source) => { + // Signal every shard already spawned before propagating, so + // their watchdog loops drive `bus.shutdown(...)` and the + // process can exit instead of hanging on stuck OS threads. + shutdown_flag.store(true, Ordering::Relaxed); + // Drop bootstrap's own channel clones before joining + // survivors. Otherwise a peer waiting on `bundle_rx.recv` + // would never observe the sender side disconnecting and + // would hang until the shutdown watchdog kicks the bus. 
+ drop(metadata_bundle_tx); + drop(metadata_bundle_rx); + for (_, survivor) in shard_threads { + let _ = survivor.join(); + } + return Err(ServerNgError::ShardSpawnFailed { shard_id, source }); + } + }; + shard_threads.push((shard_id, handle)); + } + + // Drop bootstrap's own channel clones now that every shard owns its + // half. Keeping them on bootstrap's stack would deadlock a peer + // whose `bundle_rx.recv` only completes once every sender + // disconnects. + drop(metadata_bundle_tx); + drop(metadata_bundle_rx); + + info!( + shards_count, + "server-ng cluster bootstrap dispatched; awaiting shard runtimes" + ); + + Ok(ClusterHandles { + shutdown_flag, + shard_threads, + }) +} + +/// Per-shard OS thread entry. Pins CPU + memory, builds the compio +/// runtime, and `block_on`s `shard_main`. +#[allow(clippy::needless_pass_by_value, clippy::too_many_arguments)] +fn run_shard_thread( + shard_id: u16, + total_shards: u16, + replica_id: Option, + assignment: ShardInfo, + senders: Vec, + inbox: ShardReceiver, + config: Arc, + shutdown_flag: Arc, + metadata_handoff: MetadataHandoff, + owner_table: Arc, +) -> Result<(), ServerNgError> { + // Armed for the whole thread body: a post-spawn error `?` or a panic + // unwind here must flip `shutdown_flag` so sibling watchdogs drive + // their bus shutdown instead of parking forever on `bus.token().wait()`. + let mut shutdown_guard = ShutdownOnDrop::new(Arc::clone(&shutdown_flag)); + + assignment + .bind_cpu() + .map_err(|source| ServerNgError::CpuAffinityFailed { shard_id, source })?; + assignment + .bind_memory() + .map_err(|source| ServerNgError::MemoryAffinityFailed { shard_id, source })?; + + // TODO(hubcio): decouple runtime creation from the `server` crate + // (mirrors the identical TODO in `main.rs`). Reusing legacy here so + // server-ng and the legacy server share one io_uring tuning surface. 
+ let runtime = create_shard_executor() + .map_err(|source| ServerNgError::ShardRuntimeCreateFailed { shard_id, source })?; + + let result = runtime.block_on(async move { + shard_main( + shard_id, + total_shards, + replica_id, + senders, + inbox, + &config, + shutdown_flag, + metadata_handoff, + owner_table, + ) .await - .map_err(ServerNgError::MetadataRecovery)?; - let restored_op = recovered.last_applied_op.unwrap_or_else(|| { - recovered - .snapshot - .as_ref() - .map_or(0, IggySnapshot::sequence_number) }); + if result.is_ok() { + shutdown_guard.disarm(); + } + result +} + +/// Per-shard async lifecycle. Builds the bus, recovers metadata, +/// constructs the `IggyShard` for this shard's slice of partitions, +/// wires listeners on shard 0, and runs the message pump until +/// shutdown. +#[allow(clippy::too_many_arguments, clippy::too_many_lines)] +async fn shard_main( + shard_id: u16, + total_shards: u16, + replica_id: Option, + senders: Vec, + inbox: ShardReceiver, + config: &ServerNgConfig, + shutdown_flag: Arc, + metadata_handoff: MetadataHandoff, + owner_table: Arc, +) -> Result<(), ServerNgError> { + let topology = resolve_tcp_topology(config, replica_id)?; + let bus = Rc::new(IggyMessageBus::with_config_and_owner_table( + shard_id, + config, + owner_table, + )); + + let shutdown_flag_for_handoff = Arc::clone(&shutdown_flag); + spawn_shutdown_watchdog(Rc::clone(&bus), shutdown_flag); + + // Metadata bootstrap is single-writer: shard 0 owns the WAL and the + // only `WriteHandle`-bearing `MuxStateMachine`. Peer shards receive + // a `ReadHandleFactory` bundle on the inter-thread channel and + // rebuild a reader-mode `MuxStateMachine` on their own runtime - no + // WAL access, no replay. Writes still funnel through shard 0's + // metadata VSR; per-commit `publish()` (in `WriteCell::apply`) + // bounds reader staleness to one op. 
+ let data_dir = Path::new(&config.system.path); + let (mux_stm, owner_state) = match metadata_handoff { + MetadataHandoff::Owner { bundle_tx } => { + let recovered = recover::(data_dir) + .await + .map_err(ServerNgError::MetadataRecovery)?; + broadcast_metadata_bundle( + shard_id, + &bundle_tx, + recovered.mux_stm.factory_bundle(), + total_shards.saturating_sub(1), + &shutdown_flag_for_handoff, + ) + .await?; + ( + recovered.mux_stm, + Some(( + recovered.journal, + recovered.snapshot, + recovered.last_applied_op, + )), + ) + } + MetadataHandoff::Waiter { bundle_rx } => { + let bundle = + await_metadata_bundle(shard_id, &bundle_rx, &shutdown_flag_for_handoff).await?; + (ServerNgMuxStateMachine::from_factory_bundle(bundle), None) + } + }; + + // Metadata consensus + journal + snapshot live only on shard 0. + // `IggyShard::tick_metadata` short-circuits when `consensus.is_none()`, + // so peer shards have no caller that reads `journal` or `snapshot`. + let (metadata_consensus, journal_for_metadata, snapshot_for_metadata) = + if let Some((journal, snapshot, last_applied_op)) = owner_state { + let restored_op = last_applied_op + .unwrap_or_else(|| snapshot.as_ref().map_or(0, IggySnapshot::sequence_number)); + let consensus = restore_metadata_consensus( + &journal, + restored_op, + topology.self_replica_id, + topology.replica_count, + Rc::clone(&bus), + ); + (Some(consensus), Some(journal), snapshot) + } else { + (None, None, None) + }; let metadata = ServerNgMetadata::new( - Some(restore_metadata_consensus( - &recovered.journal, - restored_op, - topology.self_replica_id, - topology.replica_count, - Rc::clone(&bus), - )), - Some(recovered.journal), - recovered.snapshot, - recovered.mux_stm, + metadata_consensus, + journal_for_metadata, + snapshot_for_metadata, + mux_stm, Some(PathBuf::from(&config.system.path)), ); - let shard = build_single_shard(config, &topology, metadata, bus).await?; - info!(shard = shard.id, "server-ng bootstrap complete"); - Ok(shard) -} + let 
shard_metrics = ShardMetrics::for_shard(shard_id); + let shard = build_shard_for_thread( + shard_id, + total_shards, + config, + &topology, + metadata, + Rc::clone(&bus), + senders, + inbox, + shard_metrics, + ) + .await?; -fn restore_metadata_consensus( - journal: &PrepareJournal, - restored_op: u64, - self_replica_id: u8, - replica_count: u8, - bus: Rc, -) -> VsrConsensus> { - let mut consensus = VsrConsensus::new( - CLUSTER_ID, - self_replica_id, - replica_count, - 0, - bus, - LocalPipeline::new(), + info!( + shard = shard_id, + partitions = shard.plane.partitions().len(), + "server-ng shard initialized" ); - let last_header = journal - .last_op() - .and_then(|op| usize::try_from(op).ok()) - .and_then(|op| journal.header(op).map(|header| *header)); - if let Some(header) = last_header { - consensus.set_view(header.view); + let (stop_tx, stop_rx) = channel(1); + let pump_shard = Rc::clone(&shard); + let pump_handle = compio::runtime::spawn(async move { + pump_shard.run_message_pump(stop_rx).await; + }); + bus.track_background(pump_handle); + + // Listeners (replica + every client transport) bind on shard 0 only. + // Shard 0's coordinator round-robins inbound TCP/WS connections to + // peer shards via fd-transfer. QUIC and TCP-TLS clients terminate + // locally on shard 0 (their per-connection state is non-portable - + // see `LifecycleFrame::ClientWsConnectionSetup` rustdoc). + // + // No phase-2 listener fence is needed: peer shards no longer scan + // the WAL, so a shard-0 append accepted mid-boot cannot race a + // peer's `truncate_or_fail`. The factory-bundle handoff has already + // installed reader-mode `MuxStateMachine`s on every peer by the + // time shard 0 returns from `broadcast_metadata_bundle`. 
+ if shard_id == 0 { + let coord = shard + .coordinator() + .expect("shard 0 always has a coordinator attached by the builder"); + let on_client_request = make_client_request_handler(&shard); + let accepted_replica = make_delegating_replica_accept_fn(Rc::clone(&coord)); + let accepted_client = make_shard_zero_client_accept_fns(coord, &bus, on_client_request); + + if let Err(error) = + start_tcp_runtime(&shard, config, &topology, accepted_replica, accepted_client).await + { + let _ = stop_tx.try_send(()); + return Err(error); + } } - consensus.init(); - consensus.sequencer().set_sequence(restored_op); - // Known gap: clustered bootstrap does not yet persist a durable commit - // watermark. Until that exists, recovery assumes the not-yet-supported - // case where replicas are not rejoining with divergent commit state. - consensus.restore_commit_state(restored_op, restored_op); - if let Some(header) = last_header { - consensus.set_last_prepare_checksum(header.checksum); + bus.token().wait().await; + let _ = stop_tx.try_send(()); + + info!(shard = shard_id, "server-ng shard exited cleanly"); + Ok(()) +} + +/// Block until shard 0 broadcasts the metadata factory bundle, or the +/// cross-thread shutdown flag flips. The recv future is polled in a +/// `SHUTDOWN_POLL_INTERVAL` loop so a shard 0 that panics before it +/// broadcasts cannot strand peer shards: the shutdown path flips the +/// flag, every waiter observes it on the next tick, and the cluster +/// tears down instead of hanging. 
+async fn await_metadata_bundle( + shard_id: u16, + bundle_rx: &crossfire::MAsyncRx>, + shutdown_flag: &Arc, +) -> Result { + loop { + match compio::time::timeout(SHUTDOWN_POLL_INTERVAL, bundle_rx.recv()).await { + Ok(Ok(bundle)) => return Ok(bundle), + Ok(Err(_)) => return Err(ServerNgError::MetadataHandoffAborted { shard_id }), + Err(_) => { + if shutdown_flag.load(Ordering::Relaxed) { + return Err(ServerNgError::MetadataHandoffAborted { shard_id }); + } + } + } } +} - consensus +/// Push `peers` cloned bundles onto `bundle_tx`, polling each send in a +/// `SHUTDOWN_POLL_INTERVAL` loop so the cross-thread shutdown flag can +/// interrupt a stalled handoff. Symmetric to [`await_metadata_bundle`]: +/// shutdown observed mid-handshake aborts cleanly rather than stalling +/// on a `send` future that can no longer make progress. +async fn broadcast_metadata_bundle( + shard_id: u16, + bundle_tx: &crossfire::MAsyncTx>, + bundle: ServerNgMetadataBundle, + peers: u16, + shutdown_flag: &Arc, +) -> Result<(), ServerNgError> { + for _ in 0..peers { + loop { + match compio::time::timeout(SHUTDOWN_POLL_INTERVAL, bundle_tx.send(bundle.clone())) + .await + { + Ok(Ok(())) => break, + Ok(Err(_)) => { + // Every peer dropped its `bundle_rx` before recv; the + // cluster is tearing itself down. Stop sending and + // let the shutdown path handle teardown. + return Ok(()); + } + Err(_) => { + if shutdown_flag.load(Ordering::Relaxed) { + return Err(ServerNgError::MetadataHandoffAborted { shard_id }); + } + } + } + } + } + Ok(()) } -async fn build_single_shard( +/// Spawn a per-shard polling task that watches the cross-thread shutdown +/// flag and triggers this shard's bus shutdown on transition. The flag +/// is the only Send signal we have; the bus' shutdown machinery is +/// `!Send` (`Rc>` + per-shard `async_channel`), so it must be +/// triggered from within the runtime that owns the bus. 
+#[allow(clippy::needless_pass_by_value)] +fn spawn_shutdown_watchdog(bus: Rc, shutdown_flag: Arc) { + let bus_for_task = Rc::clone(&bus); + let bus_token = bus.token(); + let watchdog = compio::runtime::spawn(async move { + loop { + if shutdown_flag.load(Ordering::Relaxed) { + break; + } + if bus_token.is_triggered() { + // Bus shutdown was driven from elsewhere (e.g. internal + // failure path). Watchdog has nothing left to do. + return; + } + compio::time::sleep(SHUTDOWN_POLL_INTERVAL).await; + } + let _ = bus_for_task.shutdown(SHARD_SHUTDOWN_TIMEOUT).await; + }); + watchdog.detach(); +} + +#[allow(clippy::too_many_arguments, clippy::too_many_lines)] +async fn build_shard_for_thread( + shard_id: u16, + total_shards: u16, config: &ServerNgConfig, topology: &TcpTopology, metadata: ServerNgMetadata, bus: Rc, + senders: Vec, + inbox: ShardReceiver, + metrics: ShardMetrics, ) -> Result, ServerNgError> { - let shard_id = ShardId::new(SHARD_ID); - let partition_count = metadata.mux_stm.streams().read(|inner| { + let shard_local_id = ShardId::new(shard_id); + let total_partitions = metadata.mux_stm.streams().read(|inner| { inner .items .iter() @@ -301,10 +789,20 @@ async fn build_single_shard( .map(|(_, topic)| topic.partitions.len()) .sum::() }) - .sum() + .sum::() }); + + // IggyPartitions holds only the partitions owned by this shard + // (see the filter below at insert time), so the cluster total is + // an N-fold overshoot. `ceil(total / shards) * 2` is a coarse upper + // bound that absorbs hash skew without paying the full multiplier. + // PapayaShardsTable below stays sized to the cluster total because + // every shard routes every namespace. 
+ let owned_partitions_capacity = total_partitions + .div_ceil(usize::from(total_shards).max(1)) + .saturating_mul(2); let mut partitions = IggyPartitions::with_capacity( - shard_id, + shard_local_id, PartitionsConfig { messages_required_to_save: config.system.partition.messages_required_to_save, size_of_messages_required_to_save: config @@ -314,13 +812,13 @@ async fn build_single_shard( enforce_fsync: config.system.partition.enforce_fsync, segment_size: config.system.segment.size, }, - partition_count, + owned_partitions_capacity, ); - let shards_table = PapayaShardsTable::with_capacity(partition_count); + let shards_table = PapayaShardsTable::with_capacity(total_partitions); let (topic_stats, namespaces) = metadata.mux_stm.streams().read(|inner| { let mut topic_stats = Vec::new(); - let mut namespaces = Vec::with_capacity(partition_count); + let mut namespaces = Vec::with_capacity(total_partitions); for (_, stream) in &inner.items { for (topic_id, topic) in &stream.topics { topic_stats.push(topic.stats.clone()); @@ -332,6 +830,12 @@ async fn build_single_shard( (topic_stats, namespaces) }); + // Every shard owns its own `Arc` after recovery (the + // `mux_stm` is per-shard, no sharing). Each shard must zero from the + // snapshot total before re-adding its own per-partition deltas; + // otherwise the non-zero shards inherit the cluster-wide totals and + // decrement them into the parent `StreamStats`, leaving every + // per-shard exporter / query inconsistent. 
for topic_stats in topic_stats { topic_stats.zero_out_all(); } @@ -339,30 +843,40 @@ async fn build_single_shard( for (stream_id, topic_id, topic_stats, partition_metadata) in namespaces { validate_recovered_namespace(config, stream_id, topic_id, partition_metadata.id)?; let namespace = IggyNamespace::new(stream_id, topic_id, partition_metadata.id); - let partition = load_partition( - config, - namespace, - topic_stats, - &partition_metadata, - topology.self_replica_id, - topology.replica_count, - Rc::clone(&bus), - ) - .await?; - let local_idx = partitions.insert(namespace, partition); - shards_table.insert( - namespace, - PartitionLocation::new(shard_id, LocalIdx::new(*local_idx)), - ); + let owning_shard = calculate_shard_assignment(&namespace, u32::from(total_shards)); + if owning_shard == shard_id { + let partition = load_partition( + config, + namespace, + topic_stats, + &partition_metadata, + topology.self_replica_id, + topology.replica_count, + Rc::clone(&bus), + ) + .await?; + let local_idx = partitions.insert(namespace, partition); + shards_table.insert( + namespace, + PartitionLocation::new(ShardId::new(owning_shard), LocalIdx::new(*local_idx)), + ); + } else { + // Non-owning entry: only `shard_id` is consulted by the + // router; `local_idx` is meaningful only on the owning + // shard, so a sentinel `0` is safe here. 
+ shards_table.insert( + namespace, + PartitionLocation::new(ShardId::new(owning_shard), LocalIdx::new(0)), + ); + } } - let (sender, inbox) = shard_channel::<()>(SHARD_ID, SHARD_INBOX_CAPACITY); - let senders = vec![sender]; let shard_handle = Rc::new(RefCell::new(None)); let on_replica_message = make_deferred_replica_message_handler(&shard_handle); let on_client_request = make_deferred_client_request_handler(&shard_handle); + let shard_name = format!("server-ng-shard-{shard_id}"); let built = IggyShardBuilder::new( - ShardIdentity::new(SHARD_ID, SHARD_NAME.to_string()), + ShardIdentity::new(shard_id, shard_name), Rc::clone(&bus), on_replica_message, on_client_request, @@ -373,18 +887,52 @@ async fn build_single_shard( shards_table, PartitionConsensusConfig::new(CLUSTER_ID, topology.replica_count, Rc::clone(&bus)), CoordinatorConfig::default(), - bus.token(), + metrics, ) .build(); - if let Some(refresh_task) = built.refresh_task { - bus.track_background(refresh_task); - } let shard = Rc::new(built.shard); *shard_handle.borrow_mut() = Some(Rc::downgrade(&shard)); Ok(shard) } +fn restore_metadata_consensus( + journal: &PrepareJournal, + restored_op: u64, + self_replica_id: u8, + replica_count: u8, + bus: Rc, +) -> VsrConsensus> { + let mut consensus = VsrConsensus::new( + CLUSTER_ID, + self_replica_id, + replica_count, + 0, + bus, + LocalPipeline::new(), + ); + + let last_header = journal + .last_op() + .and_then(|op| usize::try_from(op).ok()) + .and_then(|op| journal.header(op).map(|header| *header)); + if let Some(header) = last_header { + consensus.set_view(header.view); + } + + consensus.init(); + consensus.sequencer().set_sequence(restored_op); + // Known gap: clustered bootstrap does not yet persist a durable commit + // watermark. Until that exists, recovery assumes the not-yet-supported + // case where replicas are not rejoining with divergent commit state. 
+ consensus.restore_commit_state(restored_op, restored_op); + if let Some(header) = last_header { + consensus.set_last_prepare_checksum(header.checksum); + } + + consensus +} + const fn validate_recovered_namespace( config: &ServerNgConfig, stream_id: usize, @@ -861,16 +1409,20 @@ fn resolve_tcp_topology( let default_quic_addr = resolve_optional_listener_addr(config.quic.enabled, "quic.address", &config.quic.address)?; if !config.cluster.enabled { - if let Some(replica_id) = current_replica_id { - warn!( - replica_id, - "cluster is disabled, ignoring --replica-id for single-node server-ng startup" - ); + if let Some(replica_id) = current_replica_id + && replica_id != SHARD_REPLICA_ID + { + return Err(ServerNgError::ReplicaIdRequiresCluster { + supplied: replica_id, + default: SHARD_REPLICA_ID, + }); } return Ok(TcpTopology { // Keep parity with the current server binary and the integration - // harness: `--replica-id` may be passed unconditionally, but in - // single-node mode there is only replica 0. + // harness: `--replica-id 0` may be passed unconditionally in + // single-node mode; any other id is rejected above so the WAL + // cannot commit under an identity that will later disagree with + // a cluster.nodes[] entry. self_replica_id: SHARD_REPLICA_ID, replica_count: 1, client_listen_addr: default_client_addr, @@ -1209,13 +1761,6 @@ async fn start_manual_runtime( Ok(()) } -fn make_replica_message_handler(shard: &Rc) -> MessageHandler { - let shard = Rc::clone(shard); - Rc::new(move |_replica_id, message| { - shard.dispatch(message); - }) -} - fn make_client_request_handler(shard: &Rc) -> RequestHandler { let shard = Rc::clone(shard); Rc::new(move |client_id, message| { @@ -1271,87 +1816,73 @@ fn upgrade_shard_handle(shard_handle: &ServerNgShardHandle) -> Option, - on_message: MessageHandler, +/// Replica accept callback that ships every inbound connection through +/// the shard-0 coordinator's round-robin fd-delegation. 
The fd lands on +/// the target shard's inbox as a [`shard::LifecycleFrame::ReplicaConnectionSetup`] +/// frame and is installed on that shard's bus. +fn make_delegating_replica_accept_fn( + coord: Rc, ) -> AcceptedReplicaFn { - let bus = Rc::clone(bus); - Rc::new(move |stream, peer_id| { - installer::install_replica_tcp(&bus, peer_id, stream, on_message.clone()); - }) + Rc::new( + move |stream, peer_id| match coord.delegate_replica(stream, peer_id) { + Ok(target) => { + info!(peer_id, target, "replica connection delegated"); + } + Err(error) => { + warn!( + peer_id, + error = ?error, + "delegate_replica failed; dropping inbound replica connection" + ); + } + }, + ) } -fn make_local_client_accept_fns( +/// Shard-0 client accept callbacks. TCP and WS clients are delegated via +/// the coordinator (round-robin to peer shards); QUIC and TCP-TLS install +/// locally on shard 0 because their per-connection state is not portable +/// across shards (`compio_quic` endpoint binds one UDP socket; rustls TLS +/// state ties to the post-handshake reactor). 
+fn make_shard_zero_client_accept_fns( + coord: Rc, bus: &Rc, on_request: RequestHandler, ) -> LocalClientAcceptFns { - let tcp_bus = Rc::clone(bus); - let ws_bus = Rc::clone(bus); let quic_bus = Rc::clone(bus); let tcp_tls_bus = Rc::clone(bus); - let tcp_request = on_request.clone(); - let ws_request = on_request.clone(); let quic_request = on_request.clone(); let tcp_tls_request = on_request; - let counter = Rc::new(Cell::new(1_u128)); - let shard_id = u128::from(bus.shard_id()); - - let tcp_counter = Rc::clone(&counter); - let tcp = Rc::new(move |stream| { - let Some(meta) = client_meta_from_stream( - &stream, - tcp_counter.as_ref(), - shard_id, - ClientTransportKind::Tcp, - ) else { - return; - }; - installer::install_client_tcp(&tcp_bus, meta, stream, tcp_request.clone()); + + let tcp_coord = Rc::clone(&coord); + let tcp = Rc::new(move |stream| match tcp_coord.delegate_client(stream) { + Ok(client_id) => info!(client_id, "TCP client delegated"), + Err(error) => warn!(error = ?error, "delegate_client failed; dropping TCP client"), }); - let ws_counter = Rc::clone(&counter); - let ws = Rc::new(move |stream| { - let Some(meta) = client_meta_from_stream( - &stream, - ws_counter.as_ref(), - shard_id, - ClientTransportKind::Ws, - ) else { - return; - }; - let fd = match fd_transfer::dup_fd(&stream) { - Ok(fd) => fd, - Err(error) => { - warn!( - client_id = meta.client_id, - error = %error, - "dropping accepted websocket client after fd duplication failure" - ); - return; - } - }; - ws_bus.install_client_ws_fd(fd, meta, ws_request.clone()); + let ws_coord = Rc::clone(&coord); + let ws = Rc::new(move |stream| match ws_coord.delegate_ws_client(stream) { + Ok(client_id) => info!(client_id, "WS client delegated"), + Err(error) => warn!(error = ?error, "delegate_ws_client failed; dropping WS client"), }); - let quic_counter = Rc::clone(&counter); + // QUIC and TCP-TLS terminate locally on shard 0 but mint their client + // ids through the coordinator's `client_seq`, the 
same counter the + // delegated TCP/WS path uses. A separate counter here would let a + // shard-0-local id collide with a delegated id that round-robined to + // shard 0 (both encode target shard 0) in shard 0's connection + // registry. + let quic_coord = Rc::clone(&coord); let quic = Rc::new(move |accepted: message_bus::AcceptedQuicConn| { - let meta = mint_client_meta( - quic_counter.as_ref(), - shard_id, - accepted.peer_addr(), - ClientTransportKind::Quic, - ); + let meta = mint_client_meta(&quic_coord, accepted.peer_addr(), ClientTransportKind::Quic); installer::install_client_quic(&quic_bus, meta, accepted, quic_request.clone()); }); - let tcp_tls_counter = Rc::clone(&counter); + let tcp_tls_coord = coord; let tcp_tls = Rc::new(move |stream, tls_config| { - let Some(meta) = client_meta_from_stream( - &stream, - tcp_tls_counter.as_ref(), - shard_id, - ClientTransportKind::TcpTls, - ) else { + let Some(meta) = + client_meta_from_stream(&stream, &tcp_tls_coord, ClientTransportKind::TcpTls) + else { return; }; installer::install_client_tcp_tls( @@ -1373,34 +1904,25 @@ fn make_local_client_accept_fns( fn client_meta_from_stream( stream: &compio::net::TcpStream, - counter: &Cell, - shard_id: u128, + coord: &shard::coordinator::ShardZeroCoordinator, transport: ClientTransportKind, ) -> Option { let peer_addr = match stream.peer_addr() { Ok(peer_addr) => peer_addr, Err(error) => { - let client_id = preview_client_id(counter, shard_id); - warn!(client_id, error = %error, "dropping accepted client with unknown peer address"); + warn!(error = %error, "dropping accepted client with unknown peer address"); return None; } }; - Some(mint_client_meta(counter, shard_id, peer_addr, transport)) -} - -const fn preview_client_id(counter: &Cell, shard_id: u128) -> u128 { - (shard_id << 112) | counter.get() + Some(mint_client_meta(coord, peer_addr, transport)) } fn mint_client_meta( - counter: &Cell, - shard_id: u128, + coord: &shard::coordinator::ShardZeroCoordinator, peer_addr: 
SocketAddr, transport: ClientTransportKind, ) -> ClientConnMeta { - let seq = counter.get(); - counter.set(seq.wrapping_add(1)); - ClientConnMeta::new((shard_id << 112) | seq, peer_addr, transport) + ClientConnMeta::new(coord.mint_shard_zero_client_id(), peer_addr, transport) } async fn start_client_listeners( @@ -1571,3 +2093,109 @@ fn socket_addr_from_parts( })?; Ok(SocketAddr::new(ip, port)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn shutdown_on_drop_armed_flips_flag() { + let flag = Arc::new(AtomicBool::new(false)); + drop(ShutdownOnDrop::new(Arc::clone(&flag))); + assert!( + flag.load(Ordering::Relaxed), + "an armed guard must flip the flag on drop (covers the error `?` \ + and panic-unwind exit paths of run_shard_thread)" + ); + } + + #[test] + fn shutdown_on_drop_disarmed_leaves_flag() { + let flag = Arc::new(AtomicBool::new(false)); + let mut guard = ShutdownOnDrop::new(Arc::clone(&flag)); + guard.disarm(); + drop(guard); + assert!( + !flag.load(Ordering::Relaxed), + "a disarmed guard must not flip the flag (clean `Ok(())` exit)" + ); + } + + #[compio::test] + async fn broadcast_metadata_bundle_returns_immediately_with_no_peers() { + // Single-shard cluster: shard 0 has no peers to fan out to, so + // the handoff must complete without ever calling `send`. + let (bundle_tx, _bundle_rx) = crossfire::mpmc::bounded_async::(0); + let flag = Arc::new(AtomicBool::new(false)); + let mux = ServerNgMuxStateMachine::default(); + broadcast_metadata_bundle(0, &bundle_tx, mux.factory_bundle(), 0, &flag) + .await + .expect("zero peers must not block shard 0"); + } + + #[compio::test] + async fn metadata_bundle_round_trips_through_channel() { + // End-to-end: shard 0 mints a bundle, a peer receives it on + // another runtime, and `from_factory_bundle` constructs a + // reader-mode mux that observes shard 0's writes via the same + // LeftRight pair. 
+ let peers = 1u16; + let (bundle_tx, bundle_rx) = + crossfire::mpmc::bounded_async::(usize::from(peers)); + let flag = Arc::new(AtomicBool::new(false)); + + let owner = ServerNgMuxStateMachine::default(); + let bundle = owner.factory_bundle(); + broadcast_metadata_bundle(0, &bundle_tx, bundle, peers, &flag) + .await + .expect("broadcast must succeed with one peer drained"); + + let received = await_metadata_bundle(1, &bundle_rx, &flag) + .await + .expect("peer must receive the broadcast bundle"); + let _peer_mux = ServerNgMuxStateMachine::from_factory_bundle(received); + } + + #[compio::test] + async fn await_metadata_bundle_aborts_when_owner_drops_without_sending() { + let (bundle_tx, bundle_rx) = crossfire::mpmc::bounded_async::(1); + let flag = Arc::new(AtomicBool::new(false)); + + // Shard 0 dies before broadcasting; the peer must observe the + // disconnect and abort instead of hanging forever. + drop(bundle_tx); + + let err = await_metadata_bundle(1, &bundle_rx, &flag) + .await + .expect_err("a peer whose owner never sends must abort"); + assert!( + matches!(err, ServerNgError::MetadataHandoffAborted { shard_id: 1 }), + "expected MetadataHandoffAborted, got {err:?}" + ); + } + + #[compio::test] + async fn await_metadata_bundle_aborts_on_shutdown_flag() { + let (_bundle_tx, bundle_rx) = crossfire::mpmc::bounded_async::(1); + let flag = Arc::new(AtomicBool::new(false)); + + let waiter = compio::runtime::spawn({ + let flag = Arc::clone(&flag); + async move { await_metadata_bundle(1, &bundle_rx, &flag).await } + }); + + // Owner has not sent yet, but shutdown was requested; the peer + // must exit via the flag poll instead of hanging. 
+ compio::time::sleep(SHUTDOWN_POLL_INTERVAL / 2).await; + flag.store(true, Ordering::Relaxed); + + let err = waiter + .await + .unwrap_or_else(|e| std::panic::resume_unwind(e)) + .expect_err("shutdown flag must abort the bundle wait"); + assert!( + matches!(err, ServerNgError::MetadataHandoffAborted { shard_id: 1 }), + "expected MetadataHandoffAborted on shutdown, got {err:?}" + ); + } +} diff --git a/core/server-ng/src/main.rs b/core/server-ng/src/main.rs index 169f32f65b..e604f4a6ab 100644 --- a/core/server-ng/src/main.rs +++ b/core/server-ng/src/main.rs @@ -23,50 +23,66 @@ mod args; use args::Args; use clap::Parser; -use server_ng::bootstrap::RunServerNg; +use server_ng::bootstrap::{bootstrap, load_config}; use server_ng::server_error::ServerNgError; +use tracing::{error, info}; fn main() -> Result<(), ServerNgError> { - // TODO: decouple runtime creation from the `server` crate and move the shared - // compio executor setup into a lower-level crate/module used by both binaries. - let runtime = match server::bootstrap::create_shard_executor() { + // TODO(hubcio): decouple runtime creation from the `server` crate and + // move the shared compio executor setup into a lower-level crate/module + // used by both binaries. + let bootstrap_runtime = match server::bootstrap::create_shard_executor() { Ok(rt) => rt, Err(e) => { match e.kind() { std::io::ErrorKind::InvalidInput => { - // TODO: decouple io_uring diagnostics from the `server` crate. server::diagnostics::print_invalid_io_uring_args_info(); } std::io::ErrorKind::OutOfMemory => { - // TODO: decouple io_uring diagnostics from the `server` crate. server::diagnostics::print_locked_memory_limit_info(); } std::io::ErrorKind::PermissionDenied => { - // TODO: decouple io_uring diagnostics from the `server` crate. 
server::diagnostics::print_io_uring_permission_info(); } _ => {} } - panic!("Cannot create server-ng executor: {e}"); + panic!("Cannot create server-ng bootstrap executor: {e}"); } }; - runtime.block_on(async { - let args = Args::parse(); - if let Ok(env_path) = std::env::var("IGGY_ENV_PATH") { - let _ = dotenvy::from_path(&env_path); - } else { - let _ = dotenvy::dotenv(); - } - // TODO: decouple logging from the `server` crate and move the shared - // logging bootstrap into a lower-level crate/module. - let mut logging = server::log::logger::Logging::new(); - logging.early_init(); + // Bootstrap on a temporary runtime: parse args, init logging, load + // config, init the memory pool. Then drop the runtime and spawn the + // per-shard runtimes - each shard thread builds its OWN + // `compio::runtime::Runtime` via `create_shard_executor`, pinned to + // its CPU. + let bootstrap_result: Result<(configs::server_ng::ServerNgConfig, Option), ServerNgError> = + bootstrap_runtime.block_on(async { + let args = Args::parse(); + if let Ok(env_path) = std::env::var("IGGY_ENV_PATH") { + let _ = dotenvy::from_path(&env_path); + } else { + let _ = dotenvy::dotenv(); + } + + // TODO: decouple logging from the `server` crate. 
+ let mut logging = server::log::logger::Logging::new(); + logging.early_init(); + + let config = load_config(&mut logging).await?; + iggy_common::MemoryPool::init_pool(&config.system.memory_pool.into_other()); + + Ok((config, args.replica_id)) + }); + let (config, replica_id) = bootstrap_result?; + drop(bootstrap_runtime); - let config = server_ng::bootstrap::load_config(&mut logging).await?; - iggy_common::MemoryPool::init_pool(&config.system.memory_pool.into_other()); + let cluster = bootstrap(config, replica_id)?; + if let Err(error) = cluster.install_ctrlc_handler() { + error!(error = %error, "failed to install Ctrl-C handler; cluster shutdown will not respond to SIGINT"); + } - let shard = server_ng::bootstrap::bootstrap(&config, args.replica_id).await?; - shard.run(&config, args.replica_id).await - }) + info!("server-ng cluster running; waiting on shard threads"); + cluster.join_all()?; + info!("server-ng cluster shutdown complete"); + Ok(()) } diff --git a/core/server-ng/src/server_error.rs b/core/server-ng/src/server_error.rs index 77e17ee1cb..2af78c2586 100644 --- a/core/server-ng/src/server_error.rs +++ b/core/server-ng/src/server_error.rs @@ -20,6 +20,7 @@ use metadata::impls::recovery::RecoveryError; // TODO: decouple logging errors from the `server` crate. 
use server::server_error::LogError; +use server::shard_allocator::ShardingError; use thiserror::Error; #[derive(Debug, Error)] @@ -28,6 +29,42 @@ pub enum ServerNgError { Iggy(Box), #[error("failed to load server-ng config")] Config(#[source] configs::ConfigurationError), + #[error("failed to allocate shards from sharding.cpu_allocation")] + ShardAllocator(#[source] ShardingError), + #[error("failed to bind shard {shard_id} to its CPU set")] + CpuAffinityFailed { + shard_id: u16, + #[source] + source: ShardingError, + }, + #[error("failed to bind shard {shard_id} memory to its NUMA node")] + MemoryAffinityFailed { + shard_id: u16, + #[source] + source: ShardingError, + }, + #[error("failed to spawn OS thread for shard {shard_id}")] + ShardSpawnFailed { + shard_id: u16, + #[source] + source: std::io::Error, + }, + #[error("failed to create io_uring runtime for shard {shard_id}")] + ShardRuntimeCreateFailed { + shard_id: u16, + #[source] + source: std::io::Error, + }, + #[error("shard {shard_id} thread panicked: {message}")] + ShardThreadPanicked { shard_id: u16, message: String }, + #[error( + "computed shards_count = {count} is not a valid cluster size; must be \ + 1..={} (fit in u16 and stay below the OWNER_NONE sentinel)", + message_bus::OWNER_NONE - 1 + )] + InvalidShardsCount { count: usize }, + #[error("system.sharding.inbox_capacity must be in 1..={max}; got {value}")] + InvalidInboxCapacity { value: usize, max: usize }, #[error("failed to serialize current server-ng config")] CurrentConfigSerialize(#[source] toml::ser::Error), #[error("failed to write current server-ng config at {path}")] @@ -40,6 +77,11 @@ pub enum ServerNgError { Logging(#[source] LogError), #[error("failed to recover metadata snapshot and journal")] MetadataRecovery(#[source] RecoveryError), + #[error( + "shard {shard_id} aborted while waiting for shard-0 to broadcast the metadata \ + factory bundle; shard 0 dropped its sender (most likely it failed to recover)" + )] + 
MetadataHandoffAborted { shard_id: u16 }, #[error("failed to parse {context} socket address '{address}'")] SocketAddressParse { context: &'static str, @@ -53,6 +95,12 @@ pub enum ServerNgError { ClusterReplicaCountTooLarge { count: usize }, #[error("cluster mode requires --replica-id to identify the current node")] MissingReplicaId, + #[error( + "--replica-id {supplied} was passed with cluster.enabled=false; the WAL would commit \ + under replica {default} which permanently fixes this node's identity. Either set \ + cluster.enabled=true with a matching nodes[] entry, or drop --replica-id" + )] + ReplicaIdRequiresCluster { supplied: u8, default: u8 }, #[error("cluster node for replica {replica_id} is missing tcp_replica port")] ClusterReplicaPortMissing { replica_id: u8 }, #[error( diff --git a/core/server/config.toml b/core/server/config.toml index 1a58db6c8e..00611bf6ec 100644 --- a/core/server/config.toml +++ b/core/server/config.toml @@ -591,6 +591,17 @@ ports = { tcp = 8091, quic = 8081, http = 3001, websocket = 8093, tcp_replica = # + "numa:nodes=0,1;cores=4;no_ht=true": Use NUMA node 0 and 1, each nodes use 4 cores, and no hyperthreads cpu_allocation = "numa:auto" +# Per-shard inter-shard inbox channel capacity (server-ng only; the legacy +# server uses its own hard-coded sizing). Bounded by design: consensus frame +# drops are recovered by VSR retransmit, but cross-shard client reply drops +# are terminal. Consensus frames and client-reply forwards share this one +# channel, so a consensus burst can starve client replies: size against the +# worst-case consensus working set plus peak client-reply fan-out per shard +# occurring together. Watch the drop-site tracing logs (and the +# frame_drops_total{variant="forward_client_send"} counter once a per-shard +# exporter lands) to detect an under-sized value in production. 
+inbox_capacity = 1024 + [websocket] enabled = true address = "127.0.0.1:8092" diff --git a/core/shard/Cargo.toml b/core/shard/Cargo.toml index 82bdb69f82..985b8098af 100644 --- a/core/shard/Cargo.toml +++ b/core/shard/Cargo.toml @@ -36,6 +36,7 @@ message_bus = { path = "../message_bus" } metadata = { path = "../metadata" } papaya = { workspace = true } partitions = { path = "../partitions" } +prometheus-client = { workspace = true } tracing = { workspace = true } [lints.clippy] diff --git a/core/shard/src/builder.rs b/core/shard/src/builder.rs index b365b99136..d750859431 100644 --- a/core/shard/src/builder.rs +++ b/core/shard/src/builder.rs @@ -16,65 +16,49 @@ // under the License. //! One-stop construction for an [`IggyShard`] plus, on shard 0 only, its -//! associated [`ShardZeroCoordinator`] and periodic mapping-refresh task. +//! associated [`ShardZeroCoordinator`]. //! -//! The coordinator owns the authoritative `replica_id -> owning_shard` -//! snapshot used by the router's `ConnectionLost` fast path and by the -//! refresh task that re-broadcasts the snapshot to recover shards whose -//! inbox was full when an earlier `ReplicaMappingUpdate` was sent. -//! Both the `set_coordinator` call and the refresh-task spawn must happen -//! together: wiring one without the other breaks the drop-recovery story -//! the coordinator's rustdoc promises. +//! The coordinator owns shard-0's round-robin state for replica and +//! client placement. Cluster-wide `replica_id -> owning_shard` routing +//! flows through the cluster-shared [`message_bus::ReplicaOwnerTable`], +//! stamped by the owning shard's installer and CAS-cleared on disconnect. //! -//! Bootstrap code constructs an [`IggyShardBuilder`] per shard and calls -//! [`IggyShardBuilder::build`]. On shard 0 the returned [`BuiltShard`] -//! carries the refresh task's [`JoinHandle`]; bootstrap is responsible for -//! tracking it on the bus' background-task set so graceful shutdown awaits -//! it. 
On non-zero shards the handle is `None` and the coordinator field -//! on the shard stays unset. +//! The coordinator is constructed before the shard and passed to +//! [`IggyShard::new`] as a ctor argument, so an `IggyShard` is fully +//! wired the instant it becomes observable to any reader. -use crate::coordinator::ShardZeroCoordinator; +use crate::coordinator::{ShardZeroCoordinator, classify_try_send_err}; +use crate::metrics::{ShardMetrics, frame_drop_variant}; use crate::{ - CoordinatorConfig, IggyShard, PartitionConsensusConfig, Receiver, ShardFrame, - ShardFramePayload, ShardIdentity, TaggedSender, + CoordinatorConfig, IggyShard, LifecycleFrame, PartitionConsensusConfig, Receiver, ShardFrame, + ShardIdentity, TaggedSender, }; -use compio::runtime::JoinHandle; use consensus::VsrConsensus; use journal::JournalHandle; -use message_bus::MessageBus; use message_bus::client_listener::RequestHandler; -use message_bus::lifecycle::ShutdownToken; use message_bus::replica::listener::MessageHandler; +use message_bus::{MessageBus, SendError}; use metadata::IggyMetadata; use metadata::stm::StateMachine; use partitions::IggyPartitions; use std::rc::Rc; -use tracing::warn; use crate::shards_table::ShardsTable; -/// A freshly constructed [`IggyShard`], paired with the refresh-task -/// [`JoinHandle`] when this is shard 0. Bootstrap tracks the handle on -/// the bus' background tasks so graceful shutdown awaits it. -pub struct BuiltShard +/// A freshly constructed [`IggyShard`]. +pub struct BuiltShard where B: MessageBus, - R: Send + 'static, { - pub shard: IggyShard, - /// `Some` on shard 0, `None` otherwise. Pass to - /// `IggyMessageBus::track_background` (or an equivalent tracker) so - /// the task is awaited on shutdown. - pub refresh_task: Option>, + pub shard: IggyShard, } /// Builder that pairs [`IggyShard`] construction with coordinator wiring /// on shard 0. 
Non-zero shards skip the coordinator entirely; the -/// `coord_config` and `shutdown_token` fields are then ignored. -pub struct IggyShardBuilder +/// `coord_config` field is then ignored. +pub struct IggyShardBuilder where B: MessageBus, - R: Send + 'static, { identity: ShardIdentity, bus: B, @@ -82,26 +66,24 @@ where on_client_request: RequestHandler, metadata: IggyMetadata, MJ, S, M>, partitions: IggyPartitions, - senders: Vec>, - inbox: Receiver>, + senders: Vec, + inbox: Receiver, shards_table: T, partition_consensus: PartitionConsensusConfig, coord_config: CoordinatorConfig, - shutdown_token: ShutdownToken, + metrics: ShardMetrics, } -impl IggyShardBuilder +impl IggyShardBuilder where - B: MessageBus, - R: Send + 'static, + B: MessageBus + Clone + 'static, T: ShardsTable, MJ: JournalHandle, S: Send + 'static, M: StateMachine, { /// Create a builder carrying every input needed by both - /// [`IggyShard::new`] and (for shard 0) `ShardZeroCoordinator::new` - /// plus `spawn_refresh_task`. + /// [`IggyShard::new`] and (for shard 0) `ShardZeroCoordinator::new`. #[allow(clippy::too_many_arguments)] pub fn new( identity: ShardIdentity, @@ -110,12 +92,12 @@ where on_client_request: RequestHandler, metadata: IggyMetadata, MJ, S, M>, partitions: IggyPartitions, - senders: Vec>, - inbox: Receiver>, + senders: Vec, + inbox: Receiver, shards_table: T, partition_consensus: PartitionConsensusConfig, coord_config: CoordinatorConfig, - shutdown_token: ShutdownToken, + metrics: ShardMetrics, ) -> Self { Self { identity, @@ -129,15 +111,14 @@ where shards_table, partition_consensus, coord_config, - shutdown_token, + metrics, } } /// Consume the builder and produce a fully wired [`BuiltShard`]. On - /// shard 0 this constructs the coordinator, attaches it to the shard, - /// and spawns the periodic refresh task. On any other shard the - /// coordinator-specific fields are dropped and `refresh_task` is - /// `None`. 
+ /// shard 0 this constructs the coordinator and passes it to the shard + /// ctor. On any other shard the coordinator-specific fields are + /// dropped. /// /// # Panics /// @@ -145,49 +126,84 @@ where /// (`senders[i].shard_id() != i`) or if `senders.len()` does not /// fit in `u16`. Both are bootstrap programming errors and the /// `u16` overflow check fires on every shard, not only shard 0. - pub fn build(self) -> BuiltShard { + pub fn build(self) -> BuiltShard { let is_shard_zero = self.identity.id == 0; let total_shards = u16::try_from(self.senders.len()) .expect("cluster shard count must fit in u16 (bootstrap invariant)"); - // Wire the bus' connection-lost notifier to push a ConnectionLost - // frame into shard 0's inbox. Every shard's bus needs this so that - // a dead replica connection anywhere surfaces at the coordinator. - // Shard 0's sender is cloned once here; the closure captures that - // clone and runs on the owning shard's reader/writer tasks. - let shard_zero_sender = self.senders[0].sender().clone(); - let connection_lost_fn: message_bus::ConnectionLostFn = Rc::new(move |replica_id: u8| { - let frame = ShardFrame:: { - payload: ShardFramePayload::ConnectionLost { replica_id }, - response_sender: None, - }; - if let Err(e) = shard_zero_sender.try_send(frame) { - // Inbox full: the coordinator's periodic refresh task - // re-broadcasts the authoritative snapshot and the bus - // retries on next dial, so a single drop is recoverable. - // Warn so operators see repeated drops if they occur. - warn!( - replica_id, - "shard-0 inbox rejected ConnectionLost frame: {e}" - ); - } - }); - self.bus.set_connection_lost_fn(connection_lost_fn); + // Install the cross-shard forward closures. 
The bus' slow path + // (`send_to_replica` / `send_to_client`) calls these when the + // destination connection lives on a different shard; the closure + // wraps the message into a `LifecycleFrame::Forward{Replica, + // Client}Send` and `try_send`s it to the owning shard's inbox. + // The receiving shard's router unwraps the frame and re-enters + // the bus' fast path locally. The `senders` Vec is canonically + // ordered (`senders[i].shard_id() == i`). + debug_assert_eq!( + self.senders[0].shard_id(), + 0, + "shard-0 sender slot misordered: senders[0] carries shard_id={}", + self.senders[0].shard_id() + ); + let senders_for_replica: Rc> = Rc::new(self.senders.clone()); + let senders_for_client = Rc::clone(&senders_for_replica); + let metrics_for_replica = self.metrics.clone(); + let metrics_for_client = self.metrics.clone(); + self.bus + .set_replica_forward_fn(Box::new(move |replica_id, owning_shard, msg| { + senders_for_replica.get(usize::from(owning_shard)).map_or( + Err(SendError::ReplicaForwardFailed(replica_id)), + |sender| { + let frame = ShardFrame::lifecycle(LifecycleFrame::ForwardReplicaSend { + replica_id, + msg, + }); + sender.try_send(frame).map_err(|err| { + metrics_for_replica.record_frame_drop( + frame_drop_variant::FORWARD_REPLICA_SEND, + classify_try_send_err(&err), + ); + SendError::ReplicaForwardFailed(replica_id) + }) + }, + ) + })); + self.bus + .set_client_forward_fn(Box::new(move |client_id, owning_shard, msg| { + senders_for_client.get(usize::from(owning_shard)).map_or( + Err(SendError::ClientForwardFailed(client_id)), + |sender| { + let frame = ShardFrame::lifecycle(LifecycleFrame::ForwardClientSend { + client_id, + msg, + }); + sender.try_send(frame).map_err(|err| { + metrics_for_client.record_frame_drop( + frame_drop_variant::FORWARD_CLIENT_SEND, + classify_try_send_err(&err), + ); + SendError::ClientForwardFailed(client_id) + }) + }, + ) + })); - let coord_inputs = if is_shard_zero { - let coord_senders: Vec> = self.senders.clone(); - 
Some(( + // On shard 0: build the coordinator BEFORE the shard, so the + // coordinator can be passed to `IggyShard::new` as a ctor arg. + let coordinator = if is_shard_zero { + let coord_senders: Vec = self.senders.clone(); + Some(Rc::new(ShardZeroCoordinator::new( Rc::new(coord_senders), total_shards, - self.coord_config, - self.shutdown_token, - )) + self.coord_config.clone(), + self.metrics.clone(), + ))) } else { None }; - let mut shard = IggyShard::new( + let shard = IggyShard::new( self.identity, self.bus, self.on_replica_message, @@ -198,20 +214,10 @@ where self.inbox, self.shards_table, self.partition_consensus, + coordinator, + self.metrics, ); - let refresh_task = if let Some((senders_rc, total_shards, cfg, token)) = coord_inputs { - let refresh_period = cfg.refresh_period; - let coord = Rc::new(ShardZeroCoordinator::new(senders_rc, total_shards, cfg)); - shard.set_coordinator(Rc::clone(&coord)); - Some(coord.spawn_refresh_task(token, refresh_period)) - } else { - None - }; - - BuiltShard { - shard, - refresh_task, - } + BuiltShard { shard } } } diff --git a/core/shard/src/config.rs b/core/shard/src/config.rs index 5165828342..27535c2e9a 100644 --- a/core/shard/src/config.rs +++ b/core/shard/src/config.rs @@ -23,19 +23,9 @@ //! from `ServerConfig` lands. Kept in-crate for now to avoid churning //! the configs crate ahead of that wiring. -use std::time::Duration; - /// Tunables for [`crate::coordinator::ShardZeroCoordinator`]. #[derive(Debug, Clone)] pub struct CoordinatorConfig { - /// Cadence at which shard 0 re-broadcasts its authoritative - /// `replica_id -> owning_shard` snapshot. A shard whose inbox was - /// full when the original `ReplicaMappingUpdate` was sent recovers - /// its mapping on the next tick rather than staying silently stale. - /// Must be comfortably shorter than the VSR view-change timeout so - /// a missed mapping cannot trigger a spurious view change. 
- pub refresh_period: Duration, - /// When `total_shards > 1`, exclude shard 0 from replica placement. /// Shard 0 already hosts the coordinator, the metadata writer, and /// both listeners; replicas are long-lived steady flows, so offload @@ -52,7 +42,6 @@ pub struct CoordinatorConfig { impl Default for CoordinatorConfig { fn default() -> Self { Self { - refresh_period: Duration::from_secs(10), skip_shard_zero_for_replicas: true, skip_shard_zero_for_clients: false, } diff --git a/core/shard/src/coordinator.rs b/core/shard/src/coordinator.rs index bcf69f6a21..cdbd1df03f 100644 --- a/core/shard/src/coordinator.rs +++ b/core/shard/src/coordinator.rs @@ -23,12 +23,16 @@ //! //! 1. picks the next target shard via round-robin, //! 2. duplicates the TCP fd, -//! 3. sends a `ShardFramePayload::{Replica,Client}ConnectionSetup` frame to +//! 3. sends a `LifecycleFrame::{Replica,Client}ConnectionSetup` frame to //! the target shard's inbox, //! 4. drops its own `TcpStream` so only the target shard's wrapped fd -//! keeps the socket alive, -//! 5. broadcasts the resulting replica -> owning shard mapping so every -//! bus' `send_to_replica` slow path can route to the owner. +//! keeps the socket alive. +//! +//! The cluster-shared [`message_bus::ReplicaOwnerTable`] is the +//! authoritative `replica_id -> owning_shard` view; the owning shard +//! stamps and CAS-clears its slot synchronously through +//! `mark_replica_owned` / `clear_replica_owned`, so the coordinator does +//! not need a separate mapping cache or broadcast subsystem. //! //! Client ids encode the owning shard in their top 16 bits, so clients do //! not need a mapping table; any shard can route a client reply from the @@ -40,74 +44,39 @@ //! dropped connection. 
use crate::config::CoordinatorConfig; -use crate::{ShardFrame, ShardFramePayload, TaggedSender, assert_sender_ordering}; +use crate::metrics::{frame_drop_reason, frame_drop_variant}; +use crate::{LifecycleFrame, ShardFrame, TaggedSender, assert_sender_ordering}; use compio::net::TcpStream; -use compio::runtime::JoinHandle; use message_bus::installer::conn_info::{ClientConnMeta, ClientTransportKind}; -use message_bus::lifecycle::ShutdownToken; use message_bus::{SendError, fd_transfer}; -use std::cell::{Cell, RefCell}; +use std::cell::Cell; use std::rc::Rc; -use std::time::Duration; -use tracing::{debug, warn}; +use tracing::warn; /// Coordinator owned by shard 0 only. /// /// Wrapped in `Rc` by the bootstrap and shared with the replica listener, /// the connector, and the client listener so each of those paths can /// delegate immediately. -pub struct ShardZeroCoordinator { +pub struct ShardZeroCoordinator { /// Inter-shard channel senders, indexed by shard id. /// /// Each [`TaggedSender`] carries the id of the shard whose paired - /// receiver drains it, and the ctor asserts `senders[i].shard_id() == i`. - /// The previous plain-`Sender` form was a silent-misroute hazard: a Vec - /// with correct length but permuted ordering had no way to be caught. - senders: Rc>>, + /// receiver drains it; the ctor asserts `senders[i].shard_id() == i` + /// so a permuted Vec is caught at boot rather than misrouting frames. + senders: Rc>, + /// Per-shard observability handle. Cloned at construction; bumps the + /// `frame_drops_total{variant=fd_transfer}` counter when a `try_send` + /// rejection occurs in the lifecycle path. + metrics: crate::metrics::ShardMetrics, total_shards: u16, cfg: CoordinatorConfig, replica_rr: Cell, client_rr: Cell, client_seq: Cell, - /// Authoritative `replica_id -> owning_shard` view as understood by the - /// coordinator. 
Populated on successful `delegate_replica`, transitioned - /// to [`MappingSlot::Cleared`] by [`forget_mapping`](Self::forget_mapping) - /// when a replica dies. The periodic refresh task re-broadcasts this - /// snapshot so shards whose inbox was full when the original - /// `ReplicaMappingUpdate` or `ReplicaMappingClear` was sent recover on - /// the next tick rather than staying silently stale until the replica - /// reconnects. - /// - /// Three-state slot is what makes refresh complete: `Untouched` slots - /// emit nothing (the cluster never saw that replica id), `Active` slots - /// re-emit `ReplicaMappingUpdate`, and `Cleared` slots re-emit - /// `ReplicaMappingClear`. A two-state `Option` would silently lose - /// clears against shards whose inbox was full at clear-time. - /// - /// Flat `[MappingSlot; 256]` to mirror `ReplicaRegistry` and avoid a - /// hash lookup on the refresh hot path. - mappings: RefCell<[MappingSlot; 256]>, } -/// Per-replica mapping state tracked by the coordinator. -/// -/// `Cleared` is structurally distinct from `Untouched` so the refresh task -/// can re-broadcast `ReplicaMappingClear` for replicas the cluster did once -/// see but is no longer routing to. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] -enum MappingSlot { - /// Coordinator has never delegated this replica id. - #[default] - Untouched, - /// Replica is connected and routed via `owning_shard`. - Active(u16), - /// Replica was once active and is now disconnected. Refresh re-broadcasts - /// a `ReplicaMappingClear` for this slot until a fresh `delegate_replica` - /// flips it back to `Active`. - Cleared, -} - -impl ShardZeroCoordinator { +impl ShardZeroCoordinator { /// # Panics /// /// Panics if `senders.len() != total_shards`, if `total_shards < 1`, or @@ -116,9 +85,10 @@ impl ShardZeroCoordinator { /// invariant from a doc comment to a ctor assertion. 
#[must_use] pub fn new( - senders: Rc>>, + senders: Rc>, total_shards: u16, cfg: CoordinatorConfig, + metrics: crate::metrics::ShardMetrics, ) -> Self { assert_eq!( senders.len(), @@ -129,12 +99,12 @@ impl ShardZeroCoordinator { assert_sender_ordering(&senders); Self { senders, + metrics, total_shards, cfg, replica_rr: Cell::new(0), client_rr: Cell::new(0), client_seq: Cell::new(1), - mappings: RefCell::new([MappingSlot::Untouched; 256]), } } @@ -170,11 +140,28 @@ impl ShardZeroCoordinator { (u128::from(target_shard) << 112) | seq } + /// Mint a client id for a connection that terminates locally on shard 0 + /// (QUIC, TCP-TLS) instead of being round-robin delegated. + /// + /// Shares the coordinator's `client_seq` counter with the delegated + /// TCP/WS path. Both a shard-0-local connection and a delegated + /// connection that round-robined to shard 0 encode `target_shard = 0` + /// in the top 16 bits; drawing from separate counters would let them + /// mint the same id and collide in shard 0's single connection + /// registry. One counter keeps every shard-0 id distinct. + #[must_use] + pub fn mint_shard_zero_client_id(&self) -> u128 { + self.mint_client_id(0) + } + /// Ship a replica TCP connection to the next round-robin target shard. /// - /// On success broadcasts a `ReplicaMappingUpdate` to every shard and - /// returns `Ok(target_shard)`. On inter-shard channel failure closes - /// the duplicated fd and returns `Err(SendError::RoutingFailed)`. + /// Returns `Ok(target_shard)` on a successful `try_send`. The owning + /// shard records itself in the cluster-shared + /// [`message_bus::ReplicaOwnerTable`] once its installer accepts the + /// fd, so no extra broadcast or mapping cache is needed here. On + /// inter-shard channel failure closes the duplicated fd and returns + /// `Err(SendError::RoutingFailed)`. 
/// /// # Errors /// @@ -184,11 +171,13 @@ impl ShardZeroCoordinator { let target = self.next_replica_target(); let fd = fd_transfer::dup_fd(&stream).map_err(SendError::DupFailed)?; - let setup = ShardFramePayload::ReplicaConnectionSetup { fd, replica_id }; + let setup = LifecycleFrame::ReplicaConnectionSetup { fd, replica_id }; if let Err(e) = self.senders[target as usize].try_send(ShardFrame::lifecycle(setup)) { // The frame (and the `DupedFd` inside) is returned in `e` and // dropped at end-of-block, which closes the dup. No explicit // `close_fd` needed. + self.metrics + .record_frame_drop(frame_drop_variant::FD_TRANSFER, classify_try_send_err(&e)); warn!( replica_id, target, "delegate_replica try_send failed: {e:?}" @@ -200,35 +189,9 @@ impl ShardZeroCoordinator { // the socket open. drop(stream); - // Record the mapping on the coordinator's authoritative snapshot - // BEFORE broadcasting: if `broadcast_mapping_update` drops a frame - // on a full inbox, the periodic refresh task reads this snapshot - // and re-broadcasts. - self.mappings.borrow_mut()[usize::from(replica_id)] = MappingSlot::Active(target); - self.broadcast_mapping_update(replica_id, target); - Ok(target) } - fn broadcast_mapping_update(&self, replica_id: u8, owning_shard: u16) { - // Broadcast to every shard (including shard 0 and the owner - // itself). Drops on full inbox are tolerable: the owning shard - // already holds the fd and the periodic refresh task reconciles - // any missed mapping on its next tick. - for sender in self.senders.iter() { - let update = ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - }; - if let Err(e) = sender.try_send(ShardFrame::lifecycle(update)) { - debug!( - shard_id = sender.shard_id(), - replica_id, "mapping update try_send failed: {e:?}" - ); - } - } - } - /// Ship a client TCP connection to the next round-robin target shard. /// /// On success returns the minted client id. 
On failure closes the @@ -247,9 +210,11 @@ impl ShardZeroCoordinator { let fd = fd_transfer::dup_fd(&stream).map_err(SendError::DupFailed)?; let meta = ClientConnMeta::new(client_id, peer_addr, ClientTransportKind::Tcp); - let setup = ShardFramePayload::ClientConnectionSetup { fd, meta }; + let setup = LifecycleFrame::ClientConnectionSetup { fd, meta }; if let Err(e) = self.senders[target as usize].try_send(ShardFrame::lifecycle(setup)) { // The returned frame owns the `DupedFd` and closes it on drop. + self.metrics + .record_frame_drop(frame_drop_variant::FD_TRANSFER, classify_try_send_err(&e)); warn!(client_id, target, "delegate_client try_send failed: {e:?}"); return Err(SendError::RoutingFailed(target)); } @@ -262,7 +227,7 @@ impl ShardZeroCoordinator { /// round-robin target shard. /// /// Identical wire path to [`Self::delegate_client`] but ships - /// [`ShardFramePayload::ClientWsConnectionSetup`] so the receiving + /// [`LifecycleFrame::ClientWsConnectionSetup`] so the receiving /// shard runs `compio_ws::accept_async_with_config` before /// installing the connection. The fd at ship-time is plain TCP; /// the WS state machine only materialises post-upgrade on the @@ -280,8 +245,10 @@ impl ShardZeroCoordinator { let fd = fd_transfer::dup_fd(&stream).map_err(SendError::DupFailed)?; let meta = ClientConnMeta::new(client_id, peer_addr, ClientTransportKind::Ws); - let setup = ShardFramePayload::ClientWsConnectionSetup { fd, meta }; + let setup = LifecycleFrame::ClientWsConnectionSetup { fd, meta }; if let Err(e) = self.senders[target as usize].try_send(ShardFrame::lifecycle(setup)) { + self.metrics + .record_frame_drop(frame_drop_variant::FD_TRANSFER, classify_try_send_err(&e)); warn!( client_id, target, "delegate_ws_client try_send failed: {e:?}" @@ -293,84 +260,21 @@ impl ShardZeroCoordinator { Ok(client_id) } - /// Broadcast a `ReplicaMappingClear` to every shard. Used by the - /// `ConnectionLost` handler before the next `delegate_replica` runs. 
- pub fn broadcast_mapping_clear(&self, replica_id: u8) { - crate::broadcast_mapping_clear(&self.senders, replica_id); - } - - /// Transition the coordinator's authoritative entry for `replica_id` - /// to [`MappingSlot::Cleared`] so the periodic refresh task re-broadcasts - /// `ReplicaMappingClear` until a fresh delegation revives the slot. A - /// shard whose inbox was full at the original clear-time recovers on the - /// next refresh tick rather than retaining the stale `Active(owner)` - /// view forever. - /// - /// Call from shard 0's `ConnectionLost` handler paired with - /// [`broadcast_mapping_clear`](Self::broadcast_mapping_clear). - pub fn forget_mapping(&self, replica_id: u8) { - self.mappings.borrow_mut()[usize::from(replica_id)] = MappingSlot::Cleared; - } - - /// Re-broadcast every mapping the coordinator is tracking. - /// - /// `delegate_replica` and `forget_mapping` broadcast their respective - /// `ReplicaMappingUpdate` / `ReplicaMappingClear` as best-effort - /// `try_send`: a shard whose inbox was full at the moment of the - /// original broadcast permanently lost the change. Without periodic - /// refresh, an `Update` miss leaves `send_to_replica` returning - /// `ReplicaNotConnected` until the peer reconnects, and a `Clear` miss - /// leaves a stale `owning_shard` view that misroutes traffic to a shard - /// that no longer holds the connection. This refresh closes both gaps. - pub fn broadcast_mapping_snapshot(&self) { - // Stage entries in scratch `Vec`s so the `RefCell` borrow is - // released before any `try_send` runs, matching the borrow - // discipline of the other mapping paths. Heap allocation is - // acceptable: this snapshot runs on a periodic refresh tick, - // not on the per-message hot path. 
- let mut updates: Vec<(u8, u16)> = Vec::new(); - let mut clears: Vec = Vec::new(); - for (idx, slot) in self.mappings.borrow().iter().enumerate() { - #[allow(clippy::cast_possible_truncation)] - let replica_id = idx as u8; - match *slot { - MappingSlot::Active(owner) => updates.push((replica_id, owner)), - MappingSlot::Cleared => clears.push(replica_id), - MappingSlot::Untouched => {} - } - } - for (replica_id, owning_shard) in updates { - self.broadcast_mapping_update(replica_id, owning_shard); - } - for replica_id in clears { - crate::broadcast_mapping_clear(&self.senders, replica_id); - } - } - - /// Spawn a compio task that calls [`Self::broadcast_mapping_snapshot`] - /// every `period` until `token` fires. Bootstrap is expected to track - /// the returned handle on the bus's background tasks so graceful - /// shutdown awaits it. - pub fn spawn_refresh_task( - self: &Rc, - token: ShutdownToken, - period: Duration, - ) -> JoinHandle<()> { - let coord = Rc::clone(self); - compio::runtime::spawn(async move { - while token.sleep_or_shutdown(period).await { - coord.broadcast_mapping_snapshot(); - } - debug!("coordinator mapping refresh task exiting"); - }) - } - #[must_use] pub const fn total_shards(&self) -> u16 { self.total_shards } } +/// Map `crossfire::TrySendError` to the `frame_drop_reason` label set so +/// every drop site picks the same string. +pub(crate) const fn classify_try_send_err(err: &crossfire::TrySendError) -> &'static str { + match err { + crossfire::TrySendError::Full(_) => frame_drop_reason::FULL, + crossfire::TrySendError::Disconnected(_) => frame_drop_reason::DISCONNECTED, + } +} + /// Advance `counter` and return the next target shard. 
/// /// When `skip_zero` is true and `total_shards > 1`, wraps over @@ -394,7 +298,7 @@ mod tests { fn build_senders(total: u16) -> Rc> { let mut senders = Vec::with_capacity(total as usize); for shard_id in 0..total { - let (tx, _rx) = crate::shard_channel::<()>(shard_id, 16); + let (tx, _rx) = crate::shard_channel(shard_id, 16); senders.push(tx); } Rc::new(senders) @@ -406,7 +310,7 @@ mod tests { let mut senders = Vec::with_capacity(total as usize); let mut receivers = Vec::with_capacity(total as usize); for shard_id in 0..total { - let (tx, rx) = crate::shard_channel::<()>(shard_id, 16); + let (tx, rx) = crate::shard_channel(shard_id, 16); senders.push(tx); receivers.push(rx); } @@ -418,19 +322,29 @@ mod tests { fn ctor_rejects_permuted_sender_vec() { // Build senders in correct order, then swap two entries so the // indexed position no longer matches the tagged shard id. - let (tx0, _rx0) = crate::shard_channel::<()>(0, 16); - let (tx1, _rx1) = crate::shard_channel::<()>(1, 16); - let (tx2, _rx2) = crate::shard_channel::<()>(2, 16); - let (tx3, _rx3) = crate::shard_channel::<()>(3, 16); + let (tx0, _rx0) = crate::shard_channel(0, 16); + let (tx1, _rx1) = crate::shard_channel(1, 16); + let (tx2, _rx2) = crate::shard_channel(2, 16); + let (tx3, _rx3) = crate::shard_channel(3, 16); let permuted = Rc::new(vec![tx0, tx2, tx1, tx3]); - let _coord = ShardZeroCoordinator::<()>::new(permuted, 4, CoordinatorConfig::default()); + let _coord = ShardZeroCoordinator::new( + permuted, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); } #[test] fn replica_rr_default_skips_shard_zero() { // Default config: replicas skip shard 0, clients include it. let senders = build_senders(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); // Replicas wrap over [1, 4). 
assert_eq!(coord.next_replica_target(), 1); @@ -453,9 +367,9 @@ mod tests { let cfg = CoordinatorConfig { skip_shard_zero_for_replicas: false, skip_shard_zero_for_clients: false, - ..CoordinatorConfig::default() }; - let coord = ShardZeroCoordinator::<()>::new(senders, 4, cfg); + let coord = + ShardZeroCoordinator::new(senders, 4, cfg, crate::metrics::ShardMetrics::for_shard(0)); assert_eq!(coord.next_replica_target(), 0); assert_eq!(coord.next_replica_target(), 1); @@ -473,9 +387,9 @@ mod tests { let cfg = CoordinatorConfig { skip_shard_zero_for_replicas: true, skip_shard_zero_for_clients: true, - ..CoordinatorConfig::default() }; - let coord = ShardZeroCoordinator::<()>::new(senders, 4, cfg); + let coord = + ShardZeroCoordinator::new(senders, 4, cfg, crate::metrics::ShardMetrics::for_shard(0)); for _ in 0..8 { let r = coord.next_replica_target(); @@ -491,9 +405,9 @@ mod tests { let cfg = CoordinatorConfig { skip_shard_zero_for_replicas: true, skip_shard_zero_for_clients: true, - ..CoordinatorConfig::default() }; - let coord = ShardZeroCoordinator::<()>::new(senders, 1, cfg); + let coord = + ShardZeroCoordinator::new(senders, 1, cfg, crate::metrics::ShardMetrics::for_shard(0)); for _ in 0..4 { assert_eq!(coord.next_replica_target(), 0); assert_eq!(coord.next_client_target(), 0); @@ -503,18 +417,55 @@ mod tests { #[test] fn mint_client_id_encodes_target_shard() { let senders = build_senders(8); - let coord = ShardZeroCoordinator::<()>::new(senders, 8, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 8, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); let id = coord.mint_client_id(5); assert_eq!((id >> 112) as u16, 5); assert_eq!(id & ((1u128 << 112) - 1), 1, "first seq is 1"); } + #[test] + fn shard_zero_local_and_delegated_ids_never_collide() { + let senders = build_senders(4); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + 
crate::metrics::ShardMetrics::for_shard(0), + ); + + // Interleave shard-0-local mints (QUIC / TCP-TLS path) with + // delegated mints that round-robin to shard 0. Both encode target + // shard 0 in the top 16 bits; the shared counter keeps every id + // distinct so they cannot overwrite each other in shard 0's + // connection registry. + let mut ids = std::collections::HashSet::new(); + for _ in 0..8 { + assert!(ids.insert(coord.mint_shard_zero_client_id())); + assert!(ids.insert(coord.mint_client_id(0))); + } + assert_eq!( + ids.len(), + 16, + "every minted shard-0 client id must be unique" + ); + } + #[compio::test] #[allow(clippy::future_not_send)] - async fn delegate_replica_sends_setup_then_broadcasts_mapping() { + async fn delegate_replica_ships_setup_to_target_shard() { let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); // Loopback TCP pair so delegate_replica has a real fd to dup. let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); @@ -531,29 +482,25 @@ mod tests { // Target shard should observe ReplicaConnectionSetup. let setup_frame = receivers[target as usize].recv().await.unwrap(); - match setup_frame.payload { - ShardFramePayload::ReplicaConnectionSetup { fd, replica_id } => { + match setup_frame { + ShardFrame::Lifecycle(LifecycleFrame::ReplicaConnectionSetup { fd, replica_id }) => { assert_eq!(replica_id, 7); - // Drop closes the dup'd fd via `DupedFd::Drop`. drop(fd); } _ => panic!("expected ReplicaConnectionSetup on target shard"), } - // Every shard (including shard 0 and the target itself) should - // observe the ReplicaMappingUpdate broadcast. 
+ // Non-target shards must not receive any frame: delegate_replica + // no longer broadcasts a mapping update, the cluster-shared + // owner table covers cross-shard routing. for (idx, rx) in receivers.iter().enumerate() { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - assert_eq!(replica_id, 7, "shard {idx} mapping update replica id"); - assert_eq!(owning_shard, target, "shard {idx} mapping update owner"); - } - _ => panic!("shard {idx} expected ReplicaMappingUpdate"), + if idx == target as usize { + continue; } + assert!( + rx.try_recv().is_err(), + "shard {idx} unexpectedly received a frame after delegate_replica", + ); } } @@ -561,7 +508,12 @@ mod tests { #[allow(clippy::future_not_send)] async fn delegate_client_ships_setup_with_meta_transport_tcp() { let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); @@ -577,8 +529,8 @@ mod tests { ); let setup_frame = receivers[target as usize].recv().await.unwrap(); - match setup_frame.payload { - ShardFramePayload::ClientConnectionSetup { fd, meta } => { + match setup_frame { + ShardFrame::Lifecycle(LifecycleFrame::ClientConnectionSetup { fd, meta }) => { assert_eq!(meta.client_id, client_id); assert!(matches!(meta.transport, ClientTransportKind::Tcp)); drop(fd); @@ -591,7 +543,12 @@ mod tests { #[allow(clippy::future_not_send)] async fn delegate_ws_client_ships_setup_with_meta_transport_ws() { let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); + let coord = ShardZeroCoordinator::new( + senders, + 4, + 
CoordinatorConfig::default(), + crate::metrics::ShardMetrics::for_shard(0), + ); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); @@ -607,8 +564,8 @@ mod tests { ); let setup_frame = receivers[target as usize].recv().await.unwrap(); - match setup_frame.payload { - ShardFramePayload::ClientWsConnectionSetup { fd, meta } => { + match setup_frame { + ShardFrame::Lifecycle(LifecycleFrame::ClientWsConnectionSetup { fd, meta }) => { assert_eq!(meta.client_id, client_id); assert!( matches!(meta.transport, ClientTransportKind::Ws), @@ -620,156 +577,4 @@ mod tests { _ => panic!("expected ClientWsConnectionSetup variant"), } } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn broadcast_mapping_clear_reaches_every_shard() { - let (senders, receivers) = build_senders_with_rx(4); - let coord = ShardZeroCoordinator::<()>::new(senders, 4, CoordinatorConfig::default()); - - coord.broadcast_mapping_clear(9); - - for (idx, rx) in receivers.iter().enumerate() { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingClear { replica_id } => { - assert_eq!(replica_id, 9, "shard {idx} mapping clear replica id"); - } - _ => panic!("shard {idx} expected ReplicaMappingClear"), - } - } - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn snapshot_rebroadcasts_every_tracked_mapping() { - let (senders, receivers) = build_senders_with_rx(4); - let coord = Rc::new(ShardZeroCoordinator::<()>::new( - senders, - 4, - CoordinatorConfig::default(), - )); - - // Seed the coordinator's tracked mappings directly (delegate_replica - // needs a real TCP fd; the snapshot path is orthogonal to dup). - coord.mappings.borrow_mut()[3] = MappingSlot::Active(1); - coord.mappings.borrow_mut()[7] = MappingSlot::Active(2); - - coord.broadcast_mapping_snapshot(); - - // Each shard must observe an update for each seeded mapping. 
The - // order across replica ids is unspecified (HashMap->Vec iteration - // on the collected snapshot), so collect and compare as a set. - for (idx, rx) in receivers.iter().enumerate() { - let mut observed = std::collections::BTreeSet::new(); - for _ in 0..2 { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - observed.insert((replica_id, owning_shard)); - } - _ => panic!("shard {idx} expected ReplicaMappingUpdate"), - } - } - let expected: std::collections::BTreeSet<_> = - [(3u8, 1u16), (7u8, 2u16)].into_iter().collect(); - assert_eq!( - observed, expected, - "shard {idx} did not receive the full snapshot" - ); - } - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn forget_mapping_transitions_slot_to_cleared_state() { - let (senders, _receivers) = build_senders_with_rx(2); - let coord = ShardZeroCoordinator::<()>::new(senders, 2, CoordinatorConfig::default()); - - coord.mappings.borrow_mut()[4] = MappingSlot::Active(1); - coord.mappings.borrow_mut()[5] = MappingSlot::Active(1); - coord.forget_mapping(4); - - assert_eq!(coord.mappings.borrow()[4], MappingSlot::Cleared); - assert_eq!(coord.mappings.borrow()[5], MappingSlot::Active(1)); - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn snapshot_includes_clear_for_forgotten_slots() { - let (senders, receivers) = build_senders_with_rx(3); - let coord = Rc::new(ShardZeroCoordinator::<()>::new( - senders, - 3, - CoordinatorConfig::default(), - )); - - // Two live mappings + one slot that was previously delegated and is - // now forgotten. Refresh must rebroadcast a Clear for the forgotten - // slot so a shard that missed the original Clear (full inbox) can - // reconcile its stale Active view on the next tick. 
- coord.mappings.borrow_mut()[3] = MappingSlot::Active(1); - coord.mappings.borrow_mut()[7] = MappingSlot::Active(2); - coord.mappings.borrow_mut()[5] = MappingSlot::Cleared; - - coord.broadcast_mapping_snapshot(); - - for (idx, rx) in receivers.iter().enumerate() { - let mut updates = std::collections::BTreeSet::new(); - let mut clears = std::collections::BTreeSet::new(); - for _ in 0..3 { - let frame = rx.recv().await.unwrap(); - match frame.payload { - ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - updates.insert((replica_id, owning_shard)); - } - ShardFramePayload::ReplicaMappingClear { replica_id } => { - clears.insert(replica_id); - } - _ => panic!("shard {idx} expected mapping update or clear"), - } - } - - let expected_updates: std::collections::BTreeSet<_> = - [(3u8, 1u16), (7u8, 2u16)].into_iter().collect(); - let expected_clears: std::collections::BTreeSet<_> = std::iter::once(5u8).collect(); - assert_eq!( - updates, expected_updates, - "shard {idx} did not receive both Update frames" - ); - assert_eq!( - clears, expected_clears, - "shard {idx} did not receive a Clear for the forgotten slot" - ); - } - } - - #[compio::test] - #[allow(clippy::future_not_send)] - async fn snapshot_skips_untouched_slots() { - let (senders, receivers) = build_senders_with_rx(2); - let coord = Rc::new(ShardZeroCoordinator::<()>::new( - senders, - 2, - CoordinatorConfig::default(), - )); - - // No mappings populated: every slot is `Untouched`. Snapshot must - // be silent so the cluster does not pay 256 try_sends per shard per - // refresh tick when nothing has been delegated yet. 
- coord.broadcast_mapping_snapshot(); - - for (idx, rx) in receivers.iter().enumerate() { - assert!( - rx.try_recv().is_err(), - "shard {idx} should not receive any frame from an empty snapshot" - ); - } - } } diff --git a/core/shard/src/lib.rs b/core/shard/src/lib.rs index 2dc7a4942f..d6efa5074a 100644 --- a/core/shard/src/lib.rs +++ b/core/shard/src/lib.rs @@ -18,6 +18,7 @@ pub mod builder; pub mod config; pub mod coordinator; +pub mod metrics; mod router; pub mod shards_table; @@ -105,14 +106,36 @@ pub fn channel(capacity: usize) -> (Sender, Receiver) { /// Bootstrap uses this to build the per-shard sender `Vec` such that /// `vec[i]` necessarily reaches shard `i`. #[must_use] -pub fn shard_channel( - owner_shard: u16, - capacity: usize, -) -> (TaggedSender, Receiver>) { - let (tx, rx) = channel::>(capacity); +pub fn shard_channel(owner_shard: u16, capacity: usize) -> (TaggedSender, Receiver) { + let (tx, rx) = channel::(capacity); (TaggedSender::new(owner_shard, tx), rx) } +/// Build canonical-ordered `(senders, inboxes)` pair for an N-shard cluster. +/// +/// Each `inboxes[i]` drains exclusively on the runtime owning shard `i`. The +/// returned `senders` Vec satisfies `senders[i].shard_id() == i` by +/// construction; clone it into every shard before spawning so all shards +/// share the same mesh. +/// +/// Receivers are wrapped in `Option` because [`Receiver`] (crossfire +/// `AsyncRx`) is non-cloneable on purpose; bootstrap takes the slot for +/// shard `i` exactly once when spawning the owning thread. 
+#[must_use] +pub fn shard_cluster_channels( + total_shards: u16, + capacity: usize, +) -> (Vec, Vec>>) { + let mut senders = Vec::with_capacity(total_shards as usize); + let mut inboxes = Vec::with_capacity(total_shards as usize); + for shard_id in 0..total_shards { + let (tx, rx) = shard_channel(shard_id, capacity); + senders.push(tx); + inboxes.push(Some(rx)); + } + (senders, inboxes) +} + /// A [`Sender`] annotated with the id of the shard whose paired receiver it /// feeds. /// @@ -123,18 +146,18 @@ pub fn shard_channel( /// [`TaggedSender::new`]) at the channel-creation site; the coordinator and /// [`IggyShard`] ctors then assert `senders[i].shard_id() == i`, turning the /// ordering invariant from a doc comment into a ctor-checked type property. -pub struct TaggedSender { +pub struct TaggedSender { shard_id: u16, - inner: Sender>, + inner: Sender, } -impl TaggedSender { +impl TaggedSender { /// Wrap an already-constructed sender with the id of the shard whose /// paired receiver drains it. Prefer [`shard_channel`] unless an /// existing sender is being re-tagged (e.g., tests that build senders /// manually and know the ordering is correct). #[must_use] - pub const fn new(shard_id: u16, inner: Sender>) -> Self { + pub const fn new(shard_id: u16, inner: Sender) -> Self { Self { shard_id, inner } } @@ -144,12 +167,12 @@ impl TaggedSender { } #[must_use] - pub const fn sender(&self) -> &Sender> { + pub const fn sender(&self) -> &Sender { &self.inner } } -impl Clone for TaggedSender { +impl Clone for TaggedSender { fn clone(&self) -> Self { Self { shard_id: self.shard_id, @@ -158,8 +181,8 @@ impl Clone for TaggedSender { } } -impl std::ops::Deref for TaggedSender { - type Target = Sender>; +impl std::ops::Deref for TaggedSender { + type Target = Sender; fn deref(&self) -> &Self::Target { &self.inner @@ -167,13 +190,13 @@ impl std::ops::Deref for TaggedSender { } /// Assert the canonical ordering `senders[i].shard_id() == i`. 
Called from -/// every ctor that accepts a pre-built `Vec>`. +/// every ctor that accepts a pre-built `Vec`. /// /// # Panics /// /// Panics if any sender in `senders` carries a `shard_id` that does not /// match its index. Bootstrap programming error. -fn assert_sender_ordering(senders: &[TaggedSender]) { +fn assert_sender_ordering(senders: &[TaggedSender]) { for (idx, sender) in senders.iter().enumerate() { let expected = u16::try_from(idx).expect("shard count must fit in u16"); assert_eq!( @@ -185,11 +208,22 @@ fn assert_sender_ordering(senders: &[TaggedSender]) { } } -/// Payload carried by a [`ShardFrame`]. +/// Lifecycle frame variants. +/// +/// Connection setup and cross-shard forwards: every frame the inter-shard +/// channel carries that is NOT a consensus protocol message lives here. +/// Splitting these out from [`ShardFrame::Consensus`] keeps the consensus +/// dispatch path hot and cache-tight while leaving lifecycle traffic on +/// the same single channel (preserving relative ordering between consensus +/// and lifecycle frames at near-zero cost). +/// +/// Trade-off: consensus and lifecycle traffic compete for one bounded +/// inbox. A consensus burst or retransmit storm can fill it exactly when +/// a terminal-drop [`LifecycleFrame::ForwardClientSend`] needs the space; +/// `inbox_capacity` is a single knob and cannot isolate the two frame +/// classes. #[non_exhaustive] -pub enum ShardFramePayload { - /// A consensus protocol message routed between shards. - Consensus(Message), +pub enum LifecycleFrame { /// Shard 0 distributes an inbound replica TCP connection fd to the owning /// shard. The receiving shard wraps the fd in a `TcpStream` and spawns /// writer + reader tasks on its own compio runtime. The `fd` is an @@ -221,12 +255,6 @@ pub enum ShardFramePayload { /// Shard 0 therefore terminates QUIC locally and uses the existing /// `ForwardClientSend` variant for outbound traffic. 
ClientWsConnectionSetup { fd: DupedFd, meta: ClientConnMeta }, - /// Shard 0 broadcasts the owner for a replica to every shard so each - /// bus' `send_to_replica` slow path can route through the correct owner. - ReplicaMappingUpdate { replica_id: u8, owning_shard: u16 }, - /// Shard 0 broadcasts that a replica mapping should be forgotten (e.g. - /// after a connection loss and before the next allocate). - ReplicaMappingClear { replica_id: u8 }, /// A non-owning shard forwards a replica send to the owning shard's /// local bus; the owning shard then takes the fast path. ForwardReplicaSend { @@ -239,83 +267,44 @@ pub enum ShardFramePayload { client_id: u128, msg: Frozen, }, - /// Owning shard notifies shard 0 that a replica connection died. - /// Shard 0 clears the mapping cluster-wide and drives a reconnect. - ConnectionLost { replica_id: u8 }, } -/// Envelope for inter-shard channel messages. -/// -/// Wraps a [`ShardFramePayload`] together with an optional one-shot response -/// channel. Fire-and-forget dispatches leave `response_sender` as `None`; -/// request-response dispatches provide a sender that the message pump will -/// notify once the message has been processed. +/// Inter-shard channel envelope. /// -/// The response type `R` is generic so that higher layers (e.g. HTTP handlers) -/// can carry a response enum while the consensus layer can default to `()`. -pub struct ShardFrame { - pub payload: ShardFramePayload, - pub response_sender: Option>, +/// Concrete enum; no generic. Consensus dispatches are fire-and-forget by +/// VSR design (replies travel as their own wire-level messages: `Reply` +/// to clients, `PrepareOk` to the primary), so no response channel rides +/// in the frame. +#[non_exhaustive] +pub enum ShardFrame { + /// A consensus protocol message (Request / Prepare / `PrepareOk` / + /// view-change family / Commit). Fire-and-forget. Drops on full inbox + /// are recovered by VSR retransmit timers. 
+ Consensus(Message), + /// A connection setup or cross-shard forward frame. Drop recovery + /// depends on the frame class: [`LifecycleFrame::ForwardReplicaSend`] + /// is VSR-covered, connection-setup frames are recovered by the + /// connector's periodic reconnect sweep, but + /// [`LifecycleFrame::ForwardClientSend`] is terminal - no retransmit, + /// the client never receives the reply. + Lifecycle(LifecycleFrame), } -impl ShardFrame { - /// Create a fire-and-forget consensus message frame. +impl ShardFrame { + /// Create a consensus frame. #[must_use] - pub const fn fire_and_forget(message: Message) -> Self { - Self { - payload: ShardFramePayload::Consensus(message), - response_sender: None, - } + pub const fn consensus(message: Message) -> Self { + Self::Consensus(message) } - /// Create a request-response consensus message frame. Returns the frame - /// and a receiver that the caller can await for completion notification. + /// Create a lifecycle frame. #[must_use] - pub fn with_response(message: Message) -> (Self, Receiver) { - let (tx, rx) = channel(1); - ( - Self { - payload: ShardFramePayload::Consensus(message), - response_sender: Some(tx), - }, - rx, - ) - } - - /// Create a fire-and-forget lifecycle frame (connection setup, loss). - #[must_use] - pub const fn lifecycle(payload: ShardFramePayload) -> Self { - Self { - payload, - response_sender: None, - } + pub const fn lifecycle(payload: LifecycleFrame) -> Self { + Self::Lifecycle(payload) } } -/// Broadcast a `ReplicaMappingClear` to every shard (including sender-self). -/// -/// Used by shard 0's `ConnectionLost` handler and by -/// [`coordinator::ShardZeroCoordinator::broadcast_mapping_clear`] so both -/// paths go through the same try-send-and-log logic. `try_send` failures -/// (inbox full or disconnected) are logged at debug: the next mapping -/// broadcast or reconnect sweep will reconcile. 
-pub(crate) fn broadcast_mapping_clear( - senders: &[TaggedSender], - replica_id: u8, -) { - for sender in senders { - let clear = ShardFramePayload::ReplicaMappingClear { replica_id }; - if let Err(e) = sender.try_send(ShardFrame::lifecycle(clear)) { - tracing::debug!( - shard_id = sender.shard_id(), - replica_id, - "mapping clear try_send failed: {e:?}" - ); - } - } -} - -pub struct IggyShard +pub struct IggyShard where B: MessageBus, { @@ -324,8 +313,8 @@ where pub plane: ShardPlane, /// Handle to the local bus. Retained alongside the bus owned by every - /// consensus plane so the router can reach the `ConnectionInstaller` / - /// mapping surface without going through consensus. + /// consensus plane so the router can reach the `ConnectionInstaller` + /// surface without going through consensus. pub bus: B, /// Callback attached to every delegated replica connection installed @@ -344,26 +333,33 @@ where /// [`assert_sender_ordering`] is invoked in the ctor so `senders[i]` /// is guaranteed to feed the shard whose `id == i`. Call sites can /// therefore index by `target_shard` without re-checking. - senders: Vec>, + senders: Vec, + + /// Cluster shard count, cached from `senders.len()` at construction. + /// `senders` is immutable post-ctor, so consensus routing reads this + /// rather than recomputing the `usize -> u32` conversion per frame. + shard_count: u32, /// Receiver end of this shard's inbox. Peer shards (and self) send /// messages here via the corresponding sender. - inbox: Receiver>, + inbox: Receiver, /// Partition namespace -> owning shard lookup. shards_table: T, partition_consensus: PartitionConsensusConfig, - /// Shard 0 coordinator, set at bootstrap on shard 0 only. The router's - /// `ConnectionLost` handler calls [`coordinator::ShardZeroCoordinator::forget_mapping`] - /// so the periodic refresh task stops re-broadcasting the mapping of - /// a replica that has disconnected. 
`None` on non-zero shards and in - /// single-shard tests that bypass the coordinator. - coordinator: Option>>, + /// Shard 0 coordinator, supplied at construction. Holds round-robin + /// state for replica and client delegation. `None` on non-zero shards + /// and in single-shard tests that bypass the coordinator. + coordinator: Option>, + + /// Per-shard observability counters. Cloned at metric increment sites, + /// so cheap (`Arc` clone) regardless of label cardinality. + metrics: crate::metrics::ShardMetrics, } -impl IggyShard +impl IggyShard where B: MessageBus, T: ShardsTable, @@ -371,14 +367,20 @@ where /// Create a new shard with channel links and a shards table. /// /// * `bus` - shard-local bus handle (kept alongside the buses owned - /// by the consensus planes so the router can reach installer / - /// mapping operations directly). + /// by the consensus planes so the router can reach the + /// `ConnectionInstaller` surface directly). /// * `senders` - one [`TaggedSender`] per shard. The ctor asserts /// `senders[i].shard_id() == i`; use [`shard_channel`] at /// construction time so every sender carries the id of the shard /// whose receiver drains it. /// * `inbox` - the receiver that this shard drains in its message pump. /// * `shards_table` - namespace -> shard routing table. + /// * `coordinator` - `Some` on shard 0 (supplied by the builder when + /// `is_shard_zero`), `None` everywhere else. Immutable post-ctor: + /// the coordinator is injected at construction time so an + /// `IggyShard` cannot appear half-wired to a reader. + /// * `metrics` - per-shard observability handle; currently the + /// `frame_drops_total` counter. 
/// /// # Panics /// @@ -395,12 +397,15 @@ where on_client_request: RequestHandler, metadata: IggyMetadata, MJ, S, M>, partitions: IggyPartitions, - senders: Vec>, - inbox: Receiver>, + senders: Vec, + inbox: Receiver, shards_table: T, partition_consensus: PartitionConsensusConfig, + coordinator: Option>, + metrics: crate::metrics::ShardMetrics, ) -> Self { assert_sender_ordering(&senders); + let shard_count = u32::try_from(senders.len()).unwrap_or(u32::MAX); let plane = MuxPlane::new(variadic!(metadata, partitions)); let ShardIdentity { id, name } = identity; Self { @@ -411,29 +416,22 @@ where on_replica_message, on_client_request, senders, + shard_count, inbox, shards_table, partition_consensus, - coordinator: None, + coordinator, + metrics, } } - /// Attach a shard-0 coordinator. Bootstrap calls this on the shard 0 - /// `IggyShard` only; other shards keep `coordinator = None`. The - /// router's `ConnectionLost` handler will call - /// [`coordinator::ShardZeroCoordinator::forget_mapping`] via this - /// reference so the periodic refresh stops re-broadcasting dead - /// replica mappings. - pub fn set_coordinator(&mut self, coord: Rc>) { - self.coordinator = Some(coord); - } - - /// `true` when a [`coordinator::ShardZeroCoordinator`] has been attached - /// via [`set_coordinator`](Self::set_coordinator). Shard 0 bootstrap is - /// expected to flip this on; every other shard keeps it `false`. + /// Return a clone of the shard-0 coordinator handle, if attached. + /// Bootstrap uses this to wire the listener accept callbacks + /// (replica + client) to coordinator-driven fd-delegation instead + /// of installing connections locally on shard 0. #[must_use] - pub const fn has_coordinator(&self) -> bool { - self.coordinator.is_some() + pub fn coordinator(&self) -> Option> { + self.coordinator.clone() } /// Create a shard without inter-shard channels or delegated connections. 
@@ -451,9 +449,9 @@ where shards_table: T, partition_consensus: PartitionConsensusConfig, ) -> Self { - // TODO: previously we used unbounded channel with flume, - // but this is not possible with crossfire without mangling types due to Flavor trait in crossfire. - // This needs to be revisited in the future. + // TODO(hubcio): crossfire's Flavor trait blocks unbounded channels + // with the current type setup; revisit when crossfire grows an + // unbounded variant or we replace it. let (_tx, inbox) = channel(1); let plane = MuxPlane::new(variadic!(metadata, partitions)); let ShardIdentity { id, name } = identity; @@ -466,9 +464,14 @@ where plane, coordinator: None, senders: Vec::new(), + // The simulator delivers inbound messages straight to + // `on_message`, bypassing the inter-shard router, so the + // routing-only `shard_count` is never read on this path. + shard_count: 0, inbox, shards_table, partition_consensus, + metrics: crate::metrics::ShardMetrics::for_shard(id), } } @@ -480,7 +483,7 @@ where /// Local message processing — these methods handle messages that have been /// routed to this shard via the message pump. -impl IggyShard +impl IggyShard where B: MessageBus, { @@ -587,7 +590,11 @@ where /// # Panics /// Panics if a loopback message is not a valid `PrepareOk` message. 
#[allow(clippy::future_not_send)] - pub async fn process_loopback(&self, buf: &mut Vec>) -> usize + pub async fn process_loopback( + &self, + buf: &mut Vec>, + namespace_scratch: &mut Vec, + ) -> usize where B: MessageBus, MJ: JournalHandle, @@ -603,6 +610,10 @@ where > + StreamsFrontend, { debug_assert!(buf.is_empty(), "buf must be empty on entry"); + debug_assert!( + namespace_scratch.is_empty(), + "namespace_scratch must be empty on entry", + ); let mut total = 0; let planes = self.plane.inner(); @@ -619,8 +630,8 @@ where } } - let namespaces: Vec<_> = planes.1.0.namespaces().copied().collect(); - for namespace in namespaces { + namespace_scratch.extend(planes.1.0.namespaces().copied()); + for namespace in namespace_scratch.drain(..) { let partition = planes .1 .0 @@ -642,9 +653,17 @@ where /// Initializes a partition and its dedicated consensus instance on this shard. /// + /// Used only by the `simulator` crate to seed partitions on the + /// in-process replica array. Production boots via the + /// `load_partition` helper in `server-ng` bootstrap, which takes the + /// cluster `self_replica_id` explicitly from `topology` and never + /// folds the local shard id into the consensus replica space. + /// /// # Panics - /// Panics if the shard id does not fit in `u8`, which is currently required - /// by the partition consensus replica id. + /// Panics if `self.id > u8::MAX` (256+). The simulator currently + /// caps replica count at `MAX_REPLICAS = 256`, so the cast always + /// succeeds in the only caller; the constraint is recorded here so + /// any future caller in production code reads it before the cast. 
pub fn init_partition(&mut self, namespace: IggyNamespace) where B: MessageBus + Clone, diff --git a/core/shard/src/metrics.rs b/core/shard/src/metrics.rs new file mode 100644 index 0000000000..330a813ac6 --- /dev/null +++ b/core/shard/src/metrics.rs @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Per-shard frame-drop accounting. +//! +//! [`ShardMetrics`] holds `frame_drops_total{shard_id, variant, reason}`, +//! bumped whenever an inter-shard `try_send` is rejected (`Full` / +//! `Disconnected`) or its target shard id is out of range +//! (`Unroutable`): +//! - [`crate::coordinator::ShardZeroCoordinator`] - fd-transfer delegation. +//! - the cross-shard forward closures built in [`crate::builder`]. +//! - `IggyShard::try_send_to_target` - consensus frames. +//! +//! The counter uses atomic interior mutability, safe to bump from `!Send` +//! compio reactor contexts. Each shard owns its own instance. It is not +//! yet exposed through a scrape endpoint; every drop site also logs via +//! `tracing`, so the counter is a structured complement to those logs +//! until a per-shard exporter lands. +//! +//! TODO(hubcio): register `frame_drops_total` with a per-shard prometheus +//! 
exporter so the counter is scrape-able; until then the `tracing` +//! drop-site logs are the only alertable signal. + +use prometheus_client::encoding::EncodeLabelSet; +use prometheus_client::metrics::counter::Counter; +use prometheus_client::metrics::family::Family; + +/// Label for `frame_drops_total`. +/// +/// `variant` describes the dropped frame class; `reason` is `"full"` or +/// `"disconnected"` per crossfire `TrySendError`, or `"unroutable"` when +/// the target shard id has no sender slot. +#[derive(Clone, Hash, Eq, PartialEq, EncodeLabelSet, Debug)] +pub struct FrameDropLabel { + pub shard_id: u16, + pub variant: &'static str, + pub reason: &'static str, +} + +/// Variant labels used in `frame_drops_total`. Exposed as constants to +/// catch typos at compile time and to keep the cardinality bounded. +/// +/// `FORWARD_CLIENT_SEND` ticks when the cross-shard client-reply forward +/// closure fails. Unlike `CONSENSUS` drops (which VSR retransmit +/// recovers), a `FORWARD_CLIENT_SEND` drop is terminal: the client never +/// receives the reply and request / response semantics break above the +/// bus. Operators should alert on the drop-site `tracing` logs (the +/// counter is not scrape-able yet, see the module doc) and size +/// `inbox_capacity` for the worst-case cross-shard reply burst. +/// `FORWARD_REPLICA_SEND` is the symmetric variant for replica forwards; +/// VSR retransmit covers its loss so it stays informational. +pub mod frame_drop_variant { + pub const CONSENSUS: &str = "consensus"; + pub const LIFECYCLE: &str = "lifecycle"; + pub const FD_TRANSFER: &str = "fd_transfer"; + pub const FORWARD_CLIENT_SEND: &str = "forward_client_send"; + pub const FORWARD_REPLICA_SEND: &str = "forward_replica_send"; +} + +/// Reason labels used in `frame_drops_total`. +/// +/// `UNROUTABLE` ticks when a frame's target shard id has no sender slot +/// (`target >= senders.len()`). 
Unreachable while every shard seeds its +/// `ShardsTable` identically at boot, but `shard_for` returns a stored +/// `u16` so the index is guarded rather than trusted. +pub mod frame_drop_reason { + pub const FULL: &str = "full"; + pub const DISCONNECTED: &str = "disconnected"; + pub const UNROUTABLE: &str = "unroutable"; +} + +/// Per-shard metric handles. +/// +/// Cheap to clone (`Arc` of a `Family` under the hood). Each shard owns +/// one instance produced by [`ShardMetrics::for_shard`]. +#[derive(Clone)] +pub struct ShardMetrics { + shard_id: u16, + frame_drops_total: Family, +} + +impl ShardMetrics { + /// Create a metrics handle bound to `shard_id`. + #[must_use] + pub fn for_shard(shard_id: u16) -> Self { + Self { + shard_id, + frame_drops_total: Family::default(), + } + } + + /// Increment `frame_drops_total{shard_id, variant, reason}` by 1. + pub fn record_frame_drop(&self, variant: &'static str, reason: &'static str) { + self.frame_drops_total + .get_or_create(&FrameDropLabel { + shard_id: self.shard_id, + variant, + reason, + }) + .inc(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn frame_drop_counter_increments_per_label_set() { + let metrics = ShardMetrics::for_shard(0); + metrics.record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL); + metrics.record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL); + metrics.record_frame_drop( + frame_drop_variant::CONSENSUS, + frame_drop_reason::DISCONNECTED, + ); + + let count = |variant, reason| { + metrics + .frame_drops_total + .get_or_create(&FrameDropLabel { + shard_id: 0, + variant, + reason, + }) + .get() + }; + + assert_eq!( + count(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL), + 2, + "two FULL drops land on one label set", + ); + assert_eq!( + count( + frame_drop_variant::CONSENSUS, + frame_drop_reason::DISCONNECTED, + ), + 1, + "a distinct reason gets its own counter", + ); + } +} diff --git a/core/shard/src/router.rs 
b/core/shard/src/router.rs index c09ae2c6f5..baf8d77195 100644 --- a/core/shard/src/router.rs +++ b/core/shard/src/router.rs @@ -15,12 +15,15 @@ // specific language governing permissions and limitations // under the License. +use crate::metrics::{frame_drop_reason, frame_drop_variant}; use crate::shards_table::{ShardsTable, calculate_shard_from_consensus_ns}; -use crate::{IggyShard, Receiver, ShardFrame}; +use crate::{IggyShard, LifecycleFrame, Receiver, ShardFrame}; use crossfire::TrySendError; use futures::FutureExt; use iggy_binary_protocol::{ - ConsensusError, ConsensusHeader, GenericHeader, Message, MessageBag, PrepareHeader, + CommitHeader, ConsensusHeader, DoViewChangeHeader, GenericHeader, Message, MessageBag, + Operation, PrepareHeader, PrepareOkHeader, RequestHeader, StartViewChangeHeader, + StartViewHeader, }; use iggy_common::sharding::IggyNamespace; use journal::{Journal, JournalHandle}; @@ -28,24 +31,100 @@ use message_bus::{ConnectionInstaller, MessageBus}; use metadata::impls::metadata::StreamsFrontend; use metadata::stm::StateMachine; +/// Decompose a [`MessageBag`] into the routing-relevant tuple +/// `(operation, namespace, generic_message)`. +/// +/// Single source of truth used by every dispatch entry point so the +/// operation / namespace extraction never drifts between call sites. 
+fn extract_routing(bag: MessageBag) -> (Operation, u64, Message) { + match bag { + MessageBag::Request(r) => { + let h = *r.header(); + (h.operation, h.namespace, r.into_generic()) + } + MessageBag::Prepare(p) => { + let h = *p.header(); + (h.operation, h.namespace, p.into_generic()) + } + MessageBag::PrepareOk(p) => { + let h = *p.header(); + (h.operation, h.namespace, p.into_generic()) + } + MessageBag::StartViewChange(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + MessageBag::DoViewChange(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + MessageBag::StartView(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + MessageBag::Commit(m) => { + let h = *m.header(); + (h.operation(), h.namespace, m.into_generic()) + } + } +} + +/// Borrowing variant of [`extract_routing`] used by +/// [`Router::accept_frame_for_self`]. Reads `operation` and `namespace` +/// through [`Message::try_as_typed`] so the check no longer clones the +/// full `Message` (heap memcpy of `Owned`) on every +/// Consensus frame. Returns `None` when the command is unknown or the +/// typed view fails to validate; the caller treats either as +/// always-accept rather than a routing mismatch. 
+fn extract_routing_ref(message: &Message) -> Option<(Operation, u64)> { + use iggy_binary_protocol::Command2; + match message.header().command { + Command2::Request => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation, typed.header().namespace)) + } + Command2::Prepare => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation, typed.header().namespace)) + } + Command2::PrepareOk => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation, typed.header().namespace)) + } + Command2::StartViewChange => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + Command2::DoViewChange => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + Command2::StartView => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + Command2::Commit => { + let typed = message.try_as_typed::().ok()?; + Some((typed.header().operation(), typed.header().namespace)) + } + _ => None, + } +} + /// Inter-shard dispatch logic. /// /// All messages — whether destined for a local or remote shard — are routed /// through the channel into the target shard's message pump. This ensures /// that every mutation on a shard is serialized through a single point (the /// pump), preventing concurrent access from independent async tasks. -impl IggyShard +impl IggyShard where B: MessageBus + ConnectionInstaller, T: ShardsTable, - R: Send + 'static, { - /// Classify a raw network message and route it to - /// the correct shard's message pump. - /// - /// Decomposes the generic message into its typed form (Request, Prepare, - /// or `PrepareOk`) to access the operation and namespace, then resolves - /// the target shard and enqueues the message via its channel sender. + /// Network-receive entry point. 
Classifies the raw + /// `Message` and routes it to the owning shard via + /// `route_typed`. pub fn dispatch(&self, message: Message) { let bag = match MessageBag::try_from(message) { Ok(bag) => bag, @@ -54,41 +133,38 @@ where return; } }; - let (operation, namespace, generic) = match bag { - MessageBag::Request(r) => { - let h = *r.header(); - (h.operation, h.namespace, r.into_generic()) - } - MessageBag::Prepare(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::PrepareOk(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::StartViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::DoViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::StartView(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::Commit(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - }; - let raw_namespace = namespace; - let partition_namespace = IggyNamespace::from_raw(raw_namespace); - let target = if operation.is_metadata() { - 0 - } else if operation.is_partition() { + let (operation, namespace, generic) = extract_routing(bag); + self.route_typed(operation, namespace, generic); + } + + /// Route a consensus-control message (`StartViewChange`, `DoViewChange`, + /// `StartView`, `Commit`) by hashing its raw `u64` consensus namespace. + /// Every node uses the same hash so the shard owning the consensus + /// group is deterministic across the cluster. + pub(crate) fn route_consensus_control( + &self, + message: Message, + namespace_u64: u64, + operation: Operation, + ) { + let target = calculate_shard_from_consensus_ns(namespace_u64, self.shard_count); + self.try_send_to_target(target, message, operation); + } + + /// Branch on operation class to pick the right routing rule. Used by + /// [`Self::dispatch`]. 
+ fn route_typed( + &self, + operation: Operation, + namespace_u64: u64, + generic: Message, + ) { + if operation.is_metadata() { + self.try_send_to_target(0, generic, operation); + return; + } + if operation.is_partition() { + let partition_namespace = IggyNamespace::from_raw(namespace_u64); let Some(target) = self.shards_table.shard_for(partition_namespace) else { tracing::error!( shard = self.id, @@ -99,144 +175,67 @@ where ); return; }; - target - } else { - // View-change / Commit messages carry an opaque u64 consensus - // namespace (not an `IggyNamespace`). Hash it with the same - // function the partition-plane lookup table uses so the - // shard owning the consensus group is deterministically the - // same across every node. Single-shard deployments trivially - // collapse to 0. - #[allow(clippy::cast_lossless)] - let shard_count = u32::try_from(self.senders.len()).unwrap_or(u32::MAX); - calculate_shard_from_consensus_ns(raw_namespace, shard_count) - }; - // `senders[target]` is a `crossfire::MTx`, which in compio is - // running on an io_uring reactor: a blocking `send` on a full - // inbox would park the reactor thread and stall every other - // connection on the shard. Drop instead - consensus recovers via - // WAL retransmit or a view change. - match self.senders[target as usize].try_send(ShardFrame::fire_and_forget(generic)) { - Ok(()) => {} - Err(TrySendError::Full(_)) => { - tracing::warn!( - shard = self.id, - target, - ?operation, - "dispatch: shard inbox full, message dropped" - ); - } - Err(TrySendError::Disconnected(_)) => { - tracing::warn!( - shard = self.id, - target, - ?operation, - "dispatch: shard inbox closed, message dropped" - ); - } + self.try_send_to_target(target, generic, operation); + return; } + self.route_consensus_control(generic, namespace_u64, operation); } - /// Dispatch a message and return a receiver that resolves when the target - /// shard has finished processing it. 
- /// - /// # Errors - /// Returns `ConsensusError` if the message cannot be routed. - pub fn dispatch_request( + /// Send `message` into `senders[target]`. Honors the `io_uring` reactor + /// constraint: never blocks; drops on `Full` / `Disconnected` and + /// records the drop in `frame_drops_total{variant=consensus}`. VSR + /// retransmit recovers consensus drops. A `target` past the end of + /// `senders` (a stored `u16` from `shard_for`, not a trusted index) + /// is dropped with `reason=unroutable` rather than panicking. + /// Metadata frames always pass `target = 0` here, since `is_metadata` + /// operations are owned by shard 0. + fn try_send_to_target( &self, + target: u16, message: Message, - ) -> Result, ConsensusError> { - let bag = MessageBag::try_from(message)?; - let (operation, namespace, generic) = match bag { - MessageBag::Request(r) => { - let h = *r.header(); - (h.operation, h.namespace, r.into_generic()) - } - MessageBag::Prepare(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::PrepareOk(p) => { - let h = *p.header(); - (h.operation, h.namespace, p.into_generic()) - } - MessageBag::StartViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::DoViewChange(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::StartView(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } - MessageBag::Commit(m) => { - let h = *m.header(); - (h.operation(), h.namespace, m.into_generic()) - } + operation: Operation, + ) { + let Some(sender) = self.senders.get(target as usize) else { + self.metrics + .record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::UNROUTABLE); + tracing::error!( + shard = self.id, + target, + ?operation, + "dispatch: target shard id out of range, message dropped" + ); + return; }; - let raw_namespace = namespace; - let partition_namespace = 
IggyNamespace::from_raw(raw_namespace); - - // Determine which shard should handle a message given its operation and - // namespace. - // - // - Metadata operations always route to shard 0 (the control plane). - // - Partition operations route to the shard that owns the namespace, - // looked up via the [`ShardsTable`]. - // - Consensus control-plane (`StartViewChange`, `DoViewChange`, - // `StartView`, `Commit`) carries a raw `u64` consensus namespace; - // route it via a deterministic hash function so every node agrees - // on the owning shard without consulting the partitions table. - let target = if operation.is_metadata() { - 0 - } else if operation.is_partition() { - self.shards_table - .shard_for(partition_namespace) - .ok_or_else(|| { - ConsensusError::InvalidField(format!( - "namespace {raw_namespace} is not registered in shards_table" - )) - })? - } else { - #[allow(clippy::cast_lossless)] - let shard_count = u32::try_from(self.senders.len()).unwrap_or(u32::MAX); - calculate_shard_from_consensus_ns(raw_namespace, shard_count) - }; - // Create a frame and send it to the target shard. Same non- - // blocking `try_send` reason as `dispatch` above: blocking on a - // full inbox would park the io_uring reactor. - // - // On `Full` / `Disconnected` we drop the frame and let the - // caller's `rx` observe the disconnect (the `response_sender` - // inside the frame is dropped together with the frame). Callers - // see an early error rather than hanging. 
- let (frame, rx) = ShardFrame::::with_response(generic); - match self.senders[target as usize].try_send(frame) { + match sender.try_send(ShardFrame::consensus(message)) { Ok(()) => {} Err(TrySendError::Full(_)) => { + self.metrics + .record_frame_drop(frame_drop_variant::CONSENSUS, frame_drop_reason::FULL); tracing::warn!( shard = self.id, target, ?operation, - "dispatch_request: shard inbox full, message dropped" + "dispatch: shard inbox full, message dropped" ); } Err(TrySendError::Disconnected(_)) => { + self.metrics.record_frame_drop( + frame_drop_variant::CONSENSUS, + frame_drop_reason::DISCONNECTED, + ); tracing::warn!( shard = self.id, target, ?operation, - "dispatch_request: shard inbox closed, message dropped" + "dispatch: shard inbox closed, message dropped" ); } } - Ok(rx) } - /// Drain this shard's inbox and process each frame locally. + /// Drain this shard's inbox and process each frame locally until the + /// `stop` signal fires or the inbox disconnects, then drain any frames + /// still queued so in-flight requests still get a response. #[allow(clippy::future_not_send)] pub async fn run_message_pump(&self, stop: Receiver<()>) where @@ -254,14 +253,17 @@ where > + StreamsFrontend, { let mut loopback_buf = Vec::new(); + let mut namespace_scratch: Vec = Vec::new(); loop { futures::select! { _ = stop.recv().fuse() => break, frame = self.inbox.recv().fuse() => { match frame { Ok(frame) => { - self.process_frame(frame).await; - self.process_loopback(&mut loopback_buf).await; + if self.accept_frame_for_self(&frame) { + self.process_frame(frame).await; + self.process_loopback(&mut loopback_buf, &mut namespace_scratch).await; + } } Err(_) => break, } @@ -271,13 +273,79 @@ where // Drain remaining frames so in-flight requests get a response. 
while let Ok(frame) = self.inbox.try_recv() { - self.process_frame(frame).await; - self.process_loopback(&mut loopback_buf).await; + if self.accept_frame_for_self(&frame) { + self.process_frame(frame).await; + self.process_loopback(&mut loopback_buf, &mut namespace_scratch) + .await; + } + } + } + + /// Sanity check at pump entry: every Consensus frame routed through + /// [`Self::dispatch`] must land on the shard whose `id` matches + /// `senders[target]`. The ctor `assert_sender_ordering` proves + /// `senders[i].shard_id() == i`, but only checks the sender vec - + /// it cannot catch a receiver moved to the wrong runtime. This + /// check closes that gap. Non-Consensus frames, and Consensus frames + /// whose namespace is not (yet) routable, are always accepted; a + /// Consensus frame that resolves to a *different* shard fires a + /// `debug_assert_eq!` in test/debug builds and is logged + dropped + /// in release. Returns `true` when the frame may be processed, + /// `false` when it must be dropped. + fn accept_frame_for_self(&self, frame: &ShardFrame) -> bool { + // Single-shard clusters cannot misroute: every namespace maps to + // shard 0, so the per-frame routing re-derivation is pure waste. + if self.shard_count <= 1 { + return true; + } + let ShardFrame::Consensus(message) = frame else { + return true; + }; + let Some((operation, namespace_u64)) = extract_routing_ref(message) else { + return true; + }; + let expected = if operation.is_metadata() { + 0 + } else if operation.is_partition() { + let ns = IggyNamespace::from_raw(namespace_u64); + let Some(target) = self.shards_table.shard_for(ns) else { + // Namespace absent from the shards_table. Until runtime + // namespace propagation lands this cannot happen (all + // shards seed the table identically); once it does, an + // unpropagated entry is a transient state, not a routing + // bug - accept and log rather than panic or drop. 
+ tracing::error!( + shard = self.id, + stream = ns.stream_id(), + topic = ns.topic_id(), + partition = ns.partition_id(), + "Consensus frame namespace missing from shards_table; accepting frame" + ); + return true; + }; + target + } else { + calculate_shard_from_consensus_ns(namespace_u64, self.shard_count) + }; + if expected == self.id { + return true; } + debug_assert_eq!( + expected, self.id, + "shard {} pump received Consensus frame whose target_shard={expected}; \ + senders/runtime incorrectly bound", + self.id + ); + tracing::error!( + shard = self.id, + expected, + "Consensus frame routed to wrong shard; dropping frame to preserve safety" + ); + false } #[allow(clippy::future_not_send)] - async fn process_frame(&self, frame: ShardFrame) + async fn process_frame(&self, frame: ShardFrame) where B: MessageBus, MJ: JournalHandle, @@ -292,11 +360,21 @@ where Error = iggy_common::IggyError, > + StreamsFrontend, { - match frame.payload { - crate::ShardFramePayload::Consensus(message) => { + match frame { + ShardFrame::Consensus(message) => { self.on_message(message).await; } - crate::ShardFramePayload::ReplicaConnectionSetup { fd, replica_id } => { + ShardFrame::Lifecycle(payload) => self.process_lifecycle(payload).await, + } + } + + #[allow(clippy::future_not_send)] + async fn process_lifecycle(&self, payload: LifecycleFrame) + where + B: MessageBus, + { + match payload { + LifecycleFrame::ReplicaConnectionSetup { fd, replica_id } => { tracing::info!( shard = self.id, replica_id, @@ -306,7 +384,7 @@ where self.bus .install_replica_tcp_fd(fd, replica_id, self.on_replica_message.clone()); } - crate::ShardFramePayload::ClientConnectionSetup { fd, meta } => { + LifecycleFrame::ClientConnectionSetup { fd, meta } => { tracing::info!( shard = self.id, client_id = meta.client_id, @@ -316,7 +394,7 @@ where self.bus .install_client_fd(fd, meta, self.on_client_request.clone()); } - crate::ShardFramePayload::ClientWsConnectionSetup { fd, meta } => { + 
LifecycleFrame::ClientWsConnectionSetup { fd, meta } => { tracing::info!( shard = self.id, client_id = meta.client_id, @@ -326,18 +404,7 @@ where self.bus .install_client_ws_fd(fd, meta, self.on_client_request.clone()); } - crate::ShardFramePayload::ReplicaMappingUpdate { - replica_id, - owning_shard, - } => { - self.bus.set_shard_mapping(replica_id, owning_shard); - } - crate::ShardFramePayload::ReplicaMappingClear { replica_id } => { - self.bus.remove_shard_mapping(replica_id); - } - crate::ShardFramePayload::ForwardReplicaSend { replica_id, msg } => { - // Fast path on the owning shard: re-enter `send_to_replica` - // which finds the local `BusSender` in the replicas registry. + LifecycleFrame::ForwardReplicaSend { replica_id, msg } => { if let Err(e) = self.bus.send_to_replica(replica_id, msg).await { tracing::debug!( shard = self.id, @@ -347,7 +414,7 @@ where ); } } - crate::ShardFramePayload::ForwardClientSend { client_id, msg } => { + LifecycleFrame::ForwardClientSend { client_id, msg } => { if let Err(e) = self.bus.send_to_client(client_id, msg).await { tracing::debug!( shard = self.id, @@ -357,35 +424,6 @@ where ); } } - crate::ShardFramePayload::ConnectionLost { replica_id } => { - // Shard 0 is the sole responder; the next reconnect sweep - // will re-dial the peer. Non-zero shards should not see this - // variant. - if self.id == 0 { - tracing::warn!(replica_id, "shard 0 observed replica connection loss"); - // If a coordinator is attached, let it broadcast the - // clear AND drop its tracked mapping so the periodic - // refresh task stops re-broadcasting a dead replica's - // mapping. Tests that bypass the coordinator fall back - // to the direct broadcast. 
- if let Some(coord) = &self.coordinator { - coord.forget_mapping(replica_id); - coord.broadcast_mapping_clear(replica_id); - } else { - crate::broadcast_mapping_clear(&self.senders, replica_id); - } - } else { - tracing::warn!( - shard = self.id, - replica_id, - "non-zero shard received ConnectionLost; dropping" - ); - } - } } - // TODO: once on_message returns an R (e.g. ShardResponse), send it - // back via frame.response_sender. For now the sender is dropped and - // the caller's receiver will observe a disconnect. - drop(frame.response_sender); } } diff --git a/core/shard/src/shards_table.rs b/core/shard/src/shards_table.rs index 0a0a940644..6829f28cdb 100644 --- a/core/shard/src/shards_table.rs +++ b/core/shard/src/shards_table.rs @@ -32,7 +32,11 @@ pub trait ShardsTable { fn shard_for(&self, namespace: IggyNamespace) -> Option; } -/// No-op shards table for single-shard setups. +/// Always-`None` impl. Satisfies the trait for test / simulator paths +/// that never route via the table (e.g. [`crate::IggyShard::without_inbox`]). +/// Do not use in production: the router drops any partition frame whose +/// `shard_for` returns `None`, so a real cluster wired to this impl would +/// silently shed every partition request. 
impl ShardsTable for () { fn shard_for(&self, _namespace: IggyNamespace) -> Option { None diff --git a/core/simulator/src/bus.rs b/core/simulator/src/bus.rs index d185e77be6..5e6769b77a 100644 --- a/core/simulator/src/bus.rs +++ b/core/simulator/src/bus.rs @@ -143,6 +143,10 @@ impl MessageBus for SimOutbox { Ok(()) } + + fn set_connection_lost_fn(&self, _f: message_bus::ConnectionLostFn) {} + fn set_replica_forward_fn(&self, _f: message_bus::ReplicaForwardFn) {} + fn set_client_forward_fn(&self, _f: message_bus::ClientForwardFn) {} } /// Newtype wrapper for shared [`SimOutbox`] that implements [`MessageBus`] @@ -172,4 +176,16 @@ impl MessageBus for SharedSimOutbox { ) -> Result<(), SendError> { self.0.send_to_replica(replica, data).await } + + fn set_connection_lost_fn(&self, f: message_bus::ConnectionLostFn) { + self.0.set_connection_lost_fn(f); + } + + fn set_replica_forward_fn(&self, f: message_bus::ReplicaForwardFn) { + self.0.set_replica_forward_fn(f); + } + + fn set_client_forward_fn(&self, f: message_bus::ClientForwardFn) { + self.0.set_client_forward_fn(f); + } } diff --git a/core/simulator/src/lib.rs b/core/simulator/src/lib.rs index 51c97ce37d..28d777a5f9 100644 --- a/core/simulator/src/lib.rs +++ b/core/simulator/src/lib.rs @@ -325,8 +325,10 @@ impl Simulator { futures::executor::block_on(replica.on_message(message)); let mut buf = Vec::new(); - futures::executor::block_on(replica.process_loopback(&mut buf)); - let loopback_count = futures::executor::block_on(replica.process_loopback(&mut buf)); + let mut ns_scratch: Vec = Vec::new(); + futures::executor::block_on(replica.process_loopback(&mut buf, &mut ns_scratch)); + let loopback_count = + futures::executor::block_on(replica.process_loopback(&mut buf, &mut ns_scratch)); debug_assert_eq!( loopback_count, 0, "on_ack must not re-enqueue loopback messages"