Skip to content

Commit 210c4b9

Browse files
committed
fix(sync): add QueryingNetworkHeight state to fix late-join regression
The previous fix (517189d) for the sync deadlock when target_height=0 caused a regression for late-joining nodes. Nodes would immediately complete sync at height 0 before actually querying the network, then reject all incoming blocks due to missing parents. This fix adds a proper QueryingNetworkHeight state to the sync state machine that: 1. Transitions from DiscoveringPeers when peers found but target unknown 2. Polls ChainActor every 2s to detect blocks received via gossipsub 3. If higher chain discovered: sets target_height and transitions to RequestingBlocks 4. After 10s timeout with no higher chain: transitions to Synced (handles fresh network / first node case) State machine flow: - Before: DiscoveringPeers → (peers + target=0) → Synced (WRONG) - After: DiscoveringPeers → (peers + target=0) → QueryingNetworkHeight ↓ (timeout, no blocks) → Synced ✓ (gossip block arrives) → RequestingBlocks ✓ Also updates sync completion check to only complete when target_height > 0 and we've reached target, since target_height=0 case is now handled by the QueryingNetworkHeight state.
1 parent 517189d commit 210c4b9

File tree

1 file changed

+112
-27
lines changed

1 file changed

+112
-27
lines changed

app/src/actors_v2/network/sync_actor.rs

Lines changed: 112 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ pub enum SyncState {
2323
Stopped,
2424
Starting,
2525
DiscoveringPeers,
26+
/// Querying connected peers for their chain height before deciding sync strategy.
27+
/// This state ensures we don't prematurely conclude "already synced" before
28+
/// actually discovering what height the network is at.
29+
QueryingNetworkHeight,
2630
RequestingBlocks,
2731
ProcessingBlocks,
2832
Synced,
@@ -149,6 +153,7 @@ impl SyncActorState {
149153
}
150154
}
151155
SyncState::Starting
156+
| SyncState::QueryingNetworkHeight
152157
| SyncState::RequestingBlocks
153158
| SyncState::ProcessingBlocks
154159
| SyncState::Error(_) => true,
@@ -336,6 +341,75 @@ impl Actor for SyncActor {
336341
}
337342
});
338343

344+
// Network height query: Poll for network height when in QueryingNetworkHeight state
345+
ctx.run_interval(Duration::from_secs(2), |act, _ctx| {
346+
let state = std::sync::Arc::clone(&act.state);
347+
let chain_actor = act.chain_actor.clone();
348+
349+
tokio::spawn(async move {
350+
// Check if we're in QueryingNetworkHeight state
351+
let (should_query, current_height, time_in_state) = {
352+
let s = state.read().unwrap();
353+
let in_querying_state = s.is_running
354+
&& s.sync_state == SyncState::QueryingNetworkHeight;
355+
let elapsed = s.state_entered_at.elapsed().unwrap_or(Duration::ZERO);
356+
(in_querying_state, s.current_height, elapsed)
357+
};
358+
359+
if !should_query {
360+
return;
361+
}
362+
363+
// Query ChainActor for chain status (to see if gossipsub has delivered any blocks)
364+
if let Some(chain_actor) = chain_actor {
365+
match chain_actor
366+
.send(crate::actors_v2::chain::messages::ChainMessage::GetChainStatus)
367+
.await
368+
{
369+
Ok(Ok(crate::actors_v2::chain::messages::ChainResponse::ChainStatus(status))) => {
370+
let network_height = status.height;
371+
372+
// If we received blocks via gossipsub, we now know network height
373+
if network_height > current_height {
374+
let mut s = state.write().unwrap();
375+
s.target_height = network_height;
376+
tracing::info!(
377+
current_height = s.current_height,
378+
discovered_height = network_height,
379+
"Network height discovered via gossipsub - transitioning to RequestingBlocks"
380+
);
381+
s.transition_to_state(SyncState::RequestingBlocks);
382+
return;
383+
}
384+
}
385+
_ => {
386+
// Error or unexpected response - log and continue
387+
tracing::debug!("Failed to query ChainActor for chain status during height discovery");
388+
}
389+
}
390+
}
391+
392+
// Timeout after 10 seconds of querying: If no higher chain discovered, we're synced
393+
// This handles the case where we're starting a fresh network or are the first node
394+
const NETWORK_HEIGHT_QUERY_TIMEOUT: Duration = Duration::from_secs(10);
395+
396+
if time_in_state > NETWORK_HEIGHT_QUERY_TIMEOUT {
397+
let mut s = state.write().unwrap();
398+
let height = s.current_height;
399+
400+
tracing::info!(
401+
current_height = height,
402+
query_duration_secs = time_in_state.as_secs(),
403+
"Network height query timeout - no higher chain discovered, completing sync"
404+
);
405+
406+
s.transition_to_state(SyncState::Synced);
407+
s.is_running = false;
408+
s.metrics.record_sync_complete(height);
409+
}
410+
});
411+
});
412+
339413
// Sync loop: Periodic block requesting during active sync
340414
ctx.run_interval(Duration::from_secs(2), |act, ctx| {
341415
let state = std::sync::Arc::clone(&act.state);
@@ -411,37 +485,32 @@ impl Actor for SyncActor {
411485
const SYNC_THRESHOLD: u64 = 2;
412486

413487
// Sync is complete when:
414-
// 1. Sync is running and in active sync state
488+
// 1. Sync is running and in active sync state (RequestingBlocks or ProcessingBlocks)
415489
// 2. No pending requests or blocks
416-
// 3. Either: target_height == 0 (no peer has higher chain, already synced)
417-
// OR: we've reached target_height within threshold
490+
// 3. We have a known target (target_height > 0) AND we've reached it
491+
// NOTE: target_height == 0 case is now handled by QueryingNetworkHeight state
418492
let in_sync_state = s.is_running
419493
&& (s.sync_state == SyncState::RequestingBlocks
420494
|| s.sync_state == SyncState::ProcessingBlocks);
421495

422496
let no_pending_work = s.active_requests.is_empty() && s.block_queue.is_empty();
423497

424-
let reached_target = s.target_height == 0
425-
|| s.current_height + SYNC_THRESHOLD >= s.target_height;
498+
// Only complete when we have a known target and reached it
499+
// (target_height == 0 means height not yet discovered - handled by QueryingNetworkHeight)
500+
let reached_target = s.target_height > 0
501+
&& s.current_height + SYNC_THRESHOLD >= s.target_height;
426502

427503
let is_complete = in_sync_state && no_pending_work && reached_target;
428504

429505
if is_complete {
430506
let current = s.current_height;
431507
let target = s.target_height;
432508

433-
if target == 0 {
434-
tracing::info!(
435-
current_height = current,
436-
"Sync complete - no higher chain discovered (already synced)"
437-
);
438-
} else {
439-
tracing::info!(
440-
current_height = current,
441-
target_height = target,
442-
"Sync complete - reached target height"
443-
);
444-
}
509+
tracing::info!(
510+
current_height = current,
511+
target_height = target,
512+
"Sync complete - reached target height"
513+
);
445514

446515
s.transition_to_state(SyncState::Synced);
447516
s.is_running = false;
@@ -563,22 +632,30 @@ impl Handler<SyncMessage> for SyncActor {
563632

564633
// Transition based on peer availability and sync status
565634
if !s.sync_peers.is_empty() {
566-
// Check if we're already synced (target_height == 0 means no peer has a higher chain)
567635
const SYNC_THRESHOLD: u64 = 2;
568-
let already_synced = s.target_height == 0
569-
|| s.current_height + SYNC_THRESHOLD >= s.target_height;
570636

571-
if already_synced {
637+
if s.target_height == 0 {
638+
// target_height=0 means we haven't queried the network yet
639+
// Transition to QueryingNetworkHeight to actually discover network height
640+
tracing::info!(
641+
current_height = s.current_height,
642+
peer_count = s.sync_peers.len(),
643+
"Peers found - querying network height before deciding sync strategy"
644+
);
645+
s.transition_to_state(SyncState::QueryingNetworkHeight);
646+
} else if s.current_height + SYNC_THRESHOLD >= s.target_height {
647+
// We have a known target and we're already within threshold
572648
let height = s.current_height;
573649
tracing::info!(
574650
current_height = height,
575651
target_height = s.target_height,
576-
"Already synced (no higher chain discovered) - completing sync"
652+
"Already synced (within threshold of known target)"
577653
);
578654
s.transition_to_state(SyncState::Synced);
579655
s.is_running = false;
580656
s.metrics.record_sync_complete(height);
581657
} else {
658+
// We have a known target and we're behind it
582659
s.transition_to_state(SyncState::RequestingBlocks);
583660
tracing::info!(
584661
current_height = s.current_height,
@@ -991,23 +1068,31 @@ impl Handler<SyncMessage> for SyncActor {
9911068

9921069
// Transition based on sync status
9931070
if s.is_running && s.sync_state == SyncState::DiscoveringPeers {
994-
// Check if we're already synced (target_height == 0 means no peer has a higher chain)
9951071
const SYNC_THRESHOLD: u64 = 2;
996-
let already_synced = s.target_height == 0
997-
|| s.current_height + SYNC_THRESHOLD >= s.target_height;
9981072

999-
if already_synced {
1073+
if s.target_height == 0 {
1074+
// target_height=0 means we haven't queried the network yet
1075+
// Transition to QueryingNetworkHeight to discover actual network height
1076+
tracing::info!(
1077+
current_height = s.current_height,
1078+
peer_count = s.sync_peers.len(),
1079+
"First peers discovered - querying network height"
1080+
);
1081+
s.transition_to_state(SyncState::QueryingNetworkHeight);
1082+
} else if s.current_height + SYNC_THRESHOLD >= s.target_height {
1083+
// We have a known target and we're within threshold
10001084
let height = s.current_height;
10011085
tracing::info!(
10021086
current_height = height,
10031087
target_height = s.target_height,
10041088
peer_count = s.sync_peers.len(),
1005-
"Already synced (no higher chain discovered) - completing sync"
1089+
"Already synced (within threshold of known target)"
10061090
);
10071091
s.transition_to_state(SyncState::Synced);
10081092
s.is_running = false;
10091093
s.metrics.record_sync_complete(height);
10101094
} else {
1095+
// We have a known target and we're behind it
10111096
s.transition_to_state(SyncState::RequestingBlocks);
10121097
tracing::info!(
10131098
current_height = s.current_height,

0 commit comments

Comments
 (0)