Skip to content

Commit ce9a161

Browse files
committed
fix: (1) connect_begin() now succeeds when at least one replica is connected. Calls to connect_begin() in subsequent passes will connect to, and stay connected to, more and more peers. (2) Evict peers periodically so we get some churn. (3) Always, always, always try to send up to request_capacity messages (doing a full cycle through the push schedule).
1 parent d4a0619 commit ce9a161

File tree

1 file changed

+57
-12
lines changed

1 file changed

+57
-12
lines changed

stackslib/src/net/stackerdb/sync.rs

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
7979
num_attempted_connections: 0,
8080
rounds: 0,
8181
push_round: 0,
82+
last_eviction_time: get_epoch_time_secs(),
8283
};
8384
dbsync.reset(None, config);
8485
dbsync
@@ -217,10 +218,32 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
217218
self.expected_versions.clear();
218219
self.downloaded_chunks.clear();
219220

220-
// reset comms, but keep all connected replicas pinned
221+
// reset comms, but keep all connected replicas pinned.
222+
// Randomly evict one every so often.
221223
self.comms.reset();
222224
if let Some(network) = network {
223-
for naddr in self.replicas.iter() {
225+
let mut eviction_index = None;
226+
if self.last_eviction_time + 60 < get_epoch_time_secs() {
227+
self.last_eviction_time = get_epoch_time_secs();
228+
if self.replicas.len() > 0 {
229+
eviction_index = Some(thread_rng().gen::<usize>() % self.replicas.len());
230+
}
231+
}
232+
233+
let mut remove_naddr = None;
234+
for (i, naddr) in self.replicas.iter().enumerate() {
235+
if let Some(eviction_index) = eviction_index.as_ref() {
236+
if *eviction_index == i {
237+
debug!(
238+
"{:?}: {}: don't reuse connection for replica {:?}",
239+
network.get_local_peer(),
240+
&self.smart_contract_id,
241+
&naddr,
242+
);
243+
remove_naddr = Some(naddr.clone());
244+
continue;
245+
}
246+
}
224247
if let Some(event_id) = network.get_event_id(&naddr.to_neighbor_key(network)) {
225248
self.comms.pin_connection(event_id);
226249
debug!(
@@ -232,6 +255,9 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
232255
);
233256
}
234257
}
258+
if let Some(naddr) = remove_naddr.take() {
259+
self.replicas.remove(&naddr);
260+
}
235261
}
236262

237263
// reload from config
@@ -668,7 +694,8 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
668694
/// We might not be connected to any yet.
669695
/// Clears self.replicas, and fills in self.connected_replicas with already-connected neighbors
670696
/// Returns Ok(true) if we can proceed to sync
671-
/// Returns Ok(false) if we have no known peers
697+
/// Returns Ok(false) if we should try this again
698+
/// Returns Err(NoSuchNeighbor) if we don't have anyone to talk to
672699
/// Returns Err(..) on DB query error
673700
pub fn connect_begin(&mut self, network: &mut PeerNetwork) -> Result<bool, net_error> {
674701
if self.replicas.len() == 0 {
@@ -686,7 +713,7 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
686713
);
687714
if self.replicas.len() == 0 {
688715
// nothing to do
689-
return Ok(false);
716+
return Err(net_error::NoSuchNeighbor);
690717
}
691718

692719
let naddrs = mem::replace(&mut self.replicas, HashSet::new());
@@ -729,11 +756,12 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
729756
);
730757
self.num_attempted_connections += 1;
731758
self.num_connections += 1;
759+
self.connected_replicas.insert(naddr);
732760
}
733761
Ok(false) => {
734762
// need to retry
735-
self.replicas.insert(naddr);
736763
self.num_attempted_connections += 1;
764+
self.replicas.insert(naddr);
737765
}
738766
Err(_e) => {
739767
debug!(
@@ -746,7 +774,7 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
746774
}
747775
}
748776
}
749-
Ok(self.replicas.len() == 0)
777+
Ok(self.connected_replicas.len() > 0)
750778
}
751779

752780
/// Finish up connecting to our replicas.
@@ -1154,7 +1182,8 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
11541182
);
11551183

11561184
// fill up our comms with $capacity requests
1157-
for _i in 0..self.request_capacity {
1185+
let mut num_sent = 0;
1186+
for _i in 0..self.chunk_push_priorities.len() {
11581187
if self.comms.count_inflight() >= self.request_capacity {
11591188
break;
11601189
}
@@ -1173,6 +1202,9 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
11731202
chunk_push.chunk_data.slot_id,
11741203
chunk_push.chunk_data.slot_version,
11751204
);
1205+
1206+
// next-prioritized chunk
1207+
cur_priority = (cur_priority + 1) % self.chunk_push_priorities.len();
11761208
continue;
11771209
};
11781210

@@ -1213,6 +1245,11 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
12131245

12141246
// next-prioritized chunk
12151247
cur_priority = (cur_priority + 1) % self.chunk_push_priorities.len();
1248+
1249+
num_sent += 1;
1250+
if num_sent > self.request_capacity {
1251+
break;
1252+
}
12161253
}
12171254
self.next_chunk_push_priority = cur_priority;
12181255
Ok(self
@@ -1370,14 +1407,22 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
13701407
let mut blocked = true;
13711408
match self.state {
13721409
StackerDBSyncState::ConnectBegin => {
1373-
let done = self.connect_begin(network)?;
1410+
let done = match self.connect_begin(network) {
1411+
Ok(done) => done,
1412+
Err(net_error::NoSuchNeighbor) => {
1413+
// nothing to do
1414+
self.state = StackerDBSyncState::Finished;
1415+
blocked = false;
1416+
false
1417+
}
1418+
Err(e) => {
1419+
return Err(e);
1420+
}
1421+
};
13741422
if done {
13751423
self.state = StackerDBSyncState::ConnectFinish;
1376-
} else {
1377-
// no replicas; try again
1378-
self.state = StackerDBSyncState::Finished;
1424+
blocked = false;
13791425
}
1380-
blocked = false;
13811426
}
13821427
StackerDBSyncState::ConnectFinish => {
13831428
let done = self.connect_try_finish(network)?;

0 commit comments

Comments
 (0)