Skip to content

Commit 54b3429

Browse files
altonennathanwhit
authored andcommitted
Don't start evicting peers right after SyncingEngine is started (paritytech#14216)
* Don't start evicting peers right after `SyncingEngine` is started Parachain collators may need to wait to receive a relaychain block before they can start producing blocks which can cause `SyncingEngine` to incorrectly evict them. When `SyncingEngine` is started, wait 2 minutes before the eviction is activated to give collators a chance to produce a block. * fix doc * Use `continue` instead of `break` * Trigger CI --------- Co-authored-by: parity-processbot <>
1 parent 10e95e7 commit 54b3429

File tree

1 file changed

+41
-3
lines changed

1 file changed

+41
-3
lines changed

client/network/sync/src/engine.rs

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,23 @@ const TICK_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(1100)
7676
/// Maximum number of known block hashes to keep for a peer.
7777
const MAX_KNOWN_BLOCKS: usize = 1024; // ~32kb per peer + LruHashSet overhead
7878

79-
/// If the block announces stream to peer has been inactive for two minutes meaning local node
79+
/// If the block announces stream to peer has been inactive for 30 seconds meaning local node
8080
/// has not sent or received block announcements to/from the peer, report the node for inactivity,
8181
/// disconnect it and attempt to establish connection to some other peer.
8282
const INACTIVITY_EVICT_THRESHOLD: Duration = Duration::from_secs(30);
8383

84+
/// When `SyncingEngine` is started, wait two minutes before actually staring to count peers as
85+
/// evicted.
86+
///
87+
/// Parachain collator may incorrectly get evicted because it's waiting to receive a number of
88+
/// relaychain blocks before it can start creating parachain blocks. During this wait,
89+
/// `SyncingEngine` still counts it as active and as the peer is not sending blocks, it may get
90+
/// evicted if a block is not received within the first 30 secons since the peer connected.
91+
///
92+
/// To prevent this from happening, define a threshold for how long `SyncingEngine` should wait
93+
/// before it starts evicting peers.
94+
const INITIAL_EVICTION_WAIT_PERIOD: Duration = Duration::from_secs(2 * 60);
95+
8496
mod rep {
8597
use sc_peerset::ReputationChange as Rep;
8698
/// Peer has different genesis.
@@ -243,6 +255,12 @@ pub struct SyncingEngine<B: BlockT, Client> {
243255

244256
/// Prometheus metrics.
245257
metrics: Option<Metrics>,
258+
259+
/// When the syncing was started.
260+
///
261+
/// Stored as an `Option<Instant>` so once the initial wait has passed, `SyncingEngine`
262+
/// can reset the peer timers and continue with the normal eviction process.
263+
syncing_started: Option<Instant>,
246264
}
247265

248266
impl<B: BlockT, Client> SyncingEngine<B, Client>
@@ -389,6 +407,7 @@ where
389407
default_peers_set_num_light,
390408
event_streams: Vec::new(),
391409
tick_timeout: Delay::new(TICK_TIMEOUT),
410+
syncing_started: None,
392411
metrics: if let Some(r) = metrics_registry {
393412
match Metrics::register(r, is_major_syncing.clone()) {
394413
Ok(metrics) => Some(metrics),
@@ -607,6 +626,8 @@ where
607626
}
608627

609628
pub async fn run(mut self) {
629+
self.syncing_started = Some(Instant::now());
630+
610631
loop {
611632
futures::future::poll_fn(|cx| self.poll(cx)).await;
612633
}
@@ -619,6 +640,25 @@ where
619640

620641
while let Poll::Ready(()) = self.tick_timeout.poll_unpin(cx) {
621642
self.report_metrics();
643+
self.tick_timeout.reset(TICK_TIMEOUT);
644+
645+
// if `SyncingEngine` has just started, don't evict seemingly inactive peers right away
646+
// as they may not have produced blocks not because they've disconnected but because
647+
// they're still waiting to receive enough relaychain blocks to start producing blocks.
648+
if let Some(started) = self.syncing_started {
649+
if started.elapsed() < INITIAL_EVICTION_WAIT_PERIOD {
650+
continue
651+
}
652+
653+
// reset the peer activity timers so they don't expire right away after
654+
// the initial wait is done.
655+
for info in self.peers.values_mut() {
656+
info.last_notification_received = Instant::now();
657+
info.last_notification_sent = Instant::now();
658+
}
659+
660+
self.syncing_started = None;
661+
}
622662

623663
// go over all connected peers and check if any of them have been idle for a while. Idle
624664
// in this case means that we haven't sent or received block announcements to/from this
@@ -647,8 +687,6 @@ where
647687
self.evicted.insert(*id);
648688
}
649689
}
650-
651-
self.tick_timeout.reset(TICK_TIMEOUT);
652690
}
653691

654692
while let Poll::Ready(Some(event)) = self.service_rx.poll_next_unpin(cx) {

0 commit comments

Comments
 (0)