Skip to content

Commit 692d239

Browse files
committed
fix(consensus): implement standard-aligned blocksync mode (TM-B5)
Fixes the TM-B5 chaos test scenario where a node misses commits during a network partition and gets stuck cycling rounds indefinitely at the wrong height. Changes: - Add ConsensusMode state machine (Blocksync/Consensus) to ChainActor - Guard Tendermint handlers to reject consensus messages during sync: - TendermintNewHeight: deferred during blocksync - TendermintPropose/Proposal: rejected during blocksync - TendermintVote: future height votes still trigger sync detection - TendermintTimeout: ignored to prevent round cycling at wrong height - Governance/Evidence: allowed (needed during sync) - Lower sync detection thresholds for faster catch-up: - HEALTH_THRESHOLD: 10 → 1 (detect single-block lag) - MIN_VOTES_FOR_SYNC: 3 → 1 (trigger on first future vote) - DEBOUNCE_DURATION: 500ms → 100ms - SYNC_COOLDOWN: 5s → 2s - Forward NewRound announcements from NetworkActor to ChainActor for additional sync detection - Enhance SyncCompleted handler to enter Consensus mode and reset future_height_tracker - Clear stale pending timeouts in TendermintDriver on Resume
1 parent b226e2b commit 692d239

File tree

7 files changed

+334
-33
lines changed

7 files changed

+334
-33
lines changed

app/src/actors_v2/chain/actor.rs

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,23 @@ impl Default for FutureHeightTracker {
114114
}
115115
}
116116

117+
/// Operating mode of the node with respect to Tendermint consensus.
///
/// A node that has fallen behind the network (for example during a network
/// partition or after a restart) has to catch up in Blocksync mode instead of
/// taking part in consensus at the wrong height. This avoids stuck-consensus
/// situations such as TM-B5, where a node misses commits and then cycles
/// rounds indefinitely.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ConsensusMode {
    /// Catching up to the network height: consensus messages are rejected
    /// and sync blocks are accepted. This is the default mode.
    #[default]
    Blocksync,
    /// At the network tip: the node participates in Tendermint consensus and
    /// can propose and vote.
    Consensus,
}
133+
117134
/// Simplified ChainActor - core blockchain functionality (Clone-enabled for async handlers)
118135
#[derive(Clone)]
119136
pub struct ChainActor {
@@ -200,6 +217,11 @@ pub struct ChainActor {
200217
/// When votes for heights beyond our current height arrive, this helps determine
201218
/// if we've fallen behind and need to trigger catch-up sync.
202219
pub(crate) future_height_tracker: Arc<RwLock<FutureHeightTracker>>,
220+
221+
/// Consensus mode state machine.
222+
/// Determines whether to process consensus messages (Consensus mode) or
223+
/// defer them while catching up (Blocksync mode).
224+
pub(crate) consensus_mode: Arc<RwLock<ConsensusMode>>,
203225
}
204226

205227
impl ChainActor {
@@ -250,6 +272,8 @@ impl ChainActor {
250272
tendermint_sync_validator: None,
251273
// Future height vote tracking for sync detection
252274
future_height_tracker: Arc::new(RwLock::new(FutureHeightTracker::new())),
275+
// Start in Blocksync mode - transition to Consensus after sync completes
276+
consensus_mode: Arc::new(RwLock::new(ConsensusMode::Blocksync)),
253277
}
254278
}
255279

@@ -352,6 +376,96 @@ impl ChainActor {
352376
self.engine_actor = Some(addr);
353377
}
354378

379+
// =========================================================================
380+
// Consensus Mode Transitions (TM-B5 Fix)
381+
// =========================================================================
382+
383+
/// Transition to Consensus mode after sync completion.
384+
///
385+
/// Called when node reaches network tip and should participate in consensus.
386+
/// This resumes the TendermintDriver and resets the future height tracker.
387+
///
388+
/// # Arguments
389+
/// * `height` - The height at which to resume consensus
390+
/// * `correlation_id` - Tracing correlation ID
391+
pub async fn enter_consensus_mode(&self, height: u64, correlation_id: Uuid) {
392+
let mut mode = self.consensus_mode.write().await;
393+
if *mode == ConsensusMode::Blocksync {
394+
*mode = ConsensusMode::Consensus;
395+
info!(
396+
correlation_id = %correlation_id,
397+
height = height,
398+
"Entered Consensus mode - resuming participation"
399+
);
400+
401+
// Reset future height tracker since we're now caught up
402+
self.future_height_tracker.write().await.reset();
403+
404+
// Resume TendermintDriver
405+
if let Some(ref driver) = self.tendermint_driver {
406+
driver.do_send(crate::actors_v2::tendermint_driver::TendermintDriverMessage::Resume {
407+
height,
408+
});
409+
}
410+
} else {
411+
debug!(
412+
correlation_id = %correlation_id,
413+
"Already in Consensus mode"
414+
);
415+
}
416+
}
417+
418+
/// Transition to Blocksync mode when node falls behind.
419+
///
420+
/// Called when node detects it's behind network height (via future height
421+
/// votes, NewRound announcements, or sync health checks). This pauses
422+
/// consensus to prevent the node from cycling rounds at the wrong height.
423+
///
424+
/// # Arguments
425+
/// * `correlation_id` - Tracing correlation ID
426+
pub async fn enter_blocksync_mode(&self, correlation_id: Uuid) {
427+
let mut mode = self.consensus_mode.write().await;
428+
if *mode == ConsensusMode::Consensus {
429+
*mode = ConsensusMode::Blocksync;
430+
info!(
431+
correlation_id = %correlation_id,
432+
"Entered Blocksync mode - pausing consensus participation"
433+
);
434+
435+
// Pause TendermintDriver
436+
if let Some(ref driver) = self.tendermint_driver {
437+
driver.do_send(crate::actors_v2::tendermint_driver::TendermintDriverMessage::Pause);
438+
}
439+
} else {
440+
debug!(
441+
correlation_id = %correlation_id,
442+
"Already in Blocksync mode"
443+
);
444+
}
445+
}
446+
447+
/// Check if currently in Consensus mode.
448+
pub async fn is_consensus_mode(&self) -> bool {
449+
*self.consensus_mode.read().await == ConsensusMode::Consensus
450+
}
451+
452+
/// Get current consensus mode.
453+
pub async fn get_consensus_mode(&self) -> ConsensusMode {
454+
*self.consensus_mode.read().await
455+
}
456+
457+
/// Get current Tendermint height from state machine.
458+
/// Returns 0 if Tendermint is not configured.
459+
pub async fn get_tendermint_height(&self) -> Result<u64, ChainError> {
460+
if let Some(ref tm_state) = self.tendermint_state {
461+
let state = tm_state.read().await;
462+
Ok(state.height)
463+
} else {
464+
// Not configured - return storage height as fallback
465+
Ok(self.state.get_height().await)
466+
}
467+
}
468+
355469
/// Record activity and update metrics
356470
pub(crate) fn record_activity(&mut self) {
357471
self.last_activity = Instant::now();
@@ -860,7 +974,9 @@ impl ChainActor {
860974
}
861975
};
862976

863-
const HEALTH_THRESHOLD: u64 = 10;
977+
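// TM-B5 Fix: Lowered from 10 to 1 so that a lagging node is detected quickly.
978+
// NOTE(review): the check below uses a strict `>`, so with a threshold of 1 it fires only once the node is 2 or more blocks behind; a node exactly 1 block behind is not flagged. Use `>=` (or a threshold of 0) if single-block lag must be detected.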
979+
const HEALTH_THRESHOLD: u64 = 1;
864980

865981
if network_height > storage_height + HEALTH_THRESHOLD {
866982
warn!(

0 commit comments

Comments
 (0)