
Commit 80ba0b1

Backfill peer attribution (#7762)

Partly addresses #7744. Implements peer sync attribution for backfill sync, similar to #7733.
1 parent 122f167 commit 80ba0b1

7 files changed: +620 additions, -144 deletions


beacon_node/lighthouse_network/src/peer_manager/peerdb.rs

Lines changed: 29 additions & 11 deletions
@@ -253,15 +253,17 @@ impl<E: EthSpec> PeerDB<E> {
     ///
     /// If `earliest_available_slot` info is not available, then return peer anyway assuming it has the
     /// required data.
+    ///
+    /// If `allowed_peers` is `Some`, then filters for the epoch only for those peers.
     pub fn synced_peers_for_epoch<'a>(
         &'a self,
         epoch: Epoch,
-        allowed_peers: &'a HashSet<PeerId>,
+        allowed_peers: Option<&'a HashSet<PeerId>>,
     ) -> impl Iterator<Item = &'a PeerId> {
         self.peers
             .iter()
             .filter(move |(peer_id, info)| {
-                allowed_peers.contains(peer_id)
+                allowed_peers.is_none_or(|allowed| allowed.contains(peer_id))
                     && info.is_connected()
                     && match info.sync_status() {
                         SyncStatus::Synced { info } => {
@@ -270,7 +272,9 @@ impl<E: EthSpec> PeerDB<E> {
                         SyncStatus::Advanced { info } => {
                             info.has_slot(epoch.end_slot(E::slots_per_epoch()))
                         }
-                        _ => false,
+                        SyncStatus::IrrelevantPeer
+                        | SyncStatus::Behind { .. }
+                        | SyncStatus::Unknown => false,
                     }
             })
             .map(|(peer_id, _)| peer_id)
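With this change, `synced_peers_for_epoch` takes `allowed_peers: Option<&HashSet<PeerId>>`: passing `None` considers every connected, sufficiently synced peer, while `Some(set)` restricts the result to peers in the set. A minimal, self-contained sketch of that `Option`-gated filtering pattern (plain strings stand in for `PeerId`; `Option::is_none_or` needs Rust 1.82+):

    use std::collections::HashSet;

    // `None` admits every peer; `Some(set)` admits only members of the set.
    fn admitted<'a>(
        peers: &'a [&'a str],
        allowed: Option<&'a HashSet<&'a str>>,
    ) -> impl Iterator<Item = &'a &'a str> {
        peers
            .iter()
            .filter(move |p| allowed.is_none_or(|set| set.contains(*p)))
    }

    fn main() {
        let peers = ["alice", "bob", "carol"];
        let allowed: HashSet<_> = ["bob"].into_iter().collect();
        assert_eq!(admitted(&peers, None).count(), 3);
        assert_eq!(admitted(&peers, Some(&allowed)).count(), 1);
    }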
@@ -320,22 +324,36 @@ impl<E: EthSpec> PeerDB<E> {
     }

     /// Returns an iterator of all peers that are supposed to be custodying
-    /// the given subnet id that also belong to `allowed_peers`.
-    pub fn good_range_sync_custody_subnet_peer<'a>(
-        &'a self,
+    /// the given subnet id.
+    pub fn good_range_sync_custody_subnet_peers(
+        &self,
         subnet: DataColumnSubnetId,
-        allowed_peers: &'a HashSet<PeerId>,
-    ) -> impl Iterator<Item = &'a PeerId> {
+    ) -> impl Iterator<Item = &PeerId> {
         self.peers
             .iter()
-            .filter(move |(peer_id, info)| {
+            .filter(move |(_, info)| {
                 // The custody_subnets hashset can be populated via enr or metadata
-                let is_custody_subnet_peer = info.is_assigned_to_custody_subnet(&subnet);
-                allowed_peers.contains(peer_id) && info.is_connected() && is_custody_subnet_peer
+                info.is_connected() && info.is_assigned_to_custody_subnet(&subnet)
             })
             .map(|(peer_id, _)| peer_id)
     }

+    /// Returns `true` if the given peer is assigned to the given subnet.
+    /// else returns `false`
+    ///
+    /// Returns `false` if peer doesn't exist in peerdb.
+    pub fn is_good_range_sync_custody_subnet_peer(
+        &self,
+        subnet: DataColumnSubnetId,
+        peer: &PeerId,
+    ) -> bool {
+        if let Some(info) = self.peers.get(peer) {
+            info.is_connected() && info.is_assigned_to_custody_subnet(&subnet)
+        } else {
+            false
+        }
+    }
+
     /// Gives the ids of all known disconnected peers.
     pub fn disconnected_peers(&self) -> impl Iterator<Item = &PeerId> {
         self.peers
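Alongside the iterator form, the new `is_good_range_sync_custody_subnet_peer` answers the same question for a single peer, and a peer that is absent from the peerdb is treated as not good. A small, self-contained model of that lookup contract (a plain map stands in for the peerdb):

    use std::collections::HashMap;

    fn main() {
        // Hypothetical stand-in for the peerdb: peer id -> (is_connected, assigned_to_subnet).
        let peers: HashMap<&str, (bool, bool)> =
            HashMap::from([("peer-a", (true, true)), ("peer-b", (true, false))]);

        // Same contract as is_good_range_sync_custody_subnet_peer: an unknown
        // peer is simply reported as not good for the subnet.
        let is_good = |peer: &str| -> bool {
            peers
                .get(peer)
                .map(|(connected, assigned)| *connected && *assigned)
                .unwrap_or(false)
        };

        assert!(is_good("peer-a"));
        assert!(!is_good("peer-b"));
        assert!(!is_good("peer-unknown"));
    }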

beacon_node/network/src/sync/backfill_sync/mod.rs

Lines changed: 132 additions & 3 deletions
@@ -9,6 +9,7 @@
 //! sync as failed, log an error and attempt to retry once a new peer joins the node.

 use crate::network_beacon_processor::ChainSegmentProcessId;
+use crate::sync::block_sidecar_coupling::CouplingError;
 use crate::sync::manager::BatchProcessResult;
 use crate::sync::network_context::{
     RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext,
@@ -28,7 +29,7 @@ use std::collections::{
 };
 use std::sync::Arc;
 use tracing::{debug, error, info, warn};
-use types::{Epoch, EthSpec};
+use types::{ColumnIndex, Epoch, EthSpec};

 /// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
 /// blocks per batch are requested _at most_. A batch may request less blocks to account for
@@ -209,9 +210,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
             .network_globals
             .peers
             .read()
-            .synced_peers()
+            .synced_peers_for_epoch(self.to_be_downloaded, None)
             .next()
             .is_some()
+            // backfill can't progress if we do not have peers in the required subnets post peerdas.
+            && self.good_peers_on_sampling_subnets(self.to_be_downloaded, network)
         {
             // If there are peers to resume with, begin the resume.
             debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync");
@@ -305,6 +308,46 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
         err: RpcResponseError,
     ) -> Result<(), BackFillError> {
         if let Some(batch) = self.batches.get_mut(&batch_id) {
+            if let RpcResponseError::BlockComponentCouplingError(coupling_error) = &err {
+                match coupling_error {
+                    CouplingError::DataColumnPeerFailure {
+                        error,
+                        faulty_peers,
+                        action,
+                        exceeded_retries,
+                    } => {
+                        debug!(?batch_id, error, "Block components coupling error");
+                        // Note: we don't fail the batch here because a `CouplingError` is
+                        // recoverable by requesting from other honest peers.
+                        let mut failed_columns = HashSet::new();
+                        let mut failed_peers = HashSet::new();
+                        for (column, peer) in faulty_peers {
+                            failed_columns.insert(*column);
+                            failed_peers.insert(*peer);
+                        }
+                        for peer in failed_peers.iter() {
+                            network.report_peer(*peer, *action, "failed to return columns");
+                        }
+
+                        // Only retry if peer failure **and** retries have been exceeded
+                        if !*exceeded_retries {
+                            return self.retry_partial_batch(
+                                network,
+                                batch_id,
+                                request_id,
+                                failed_columns,
+                                failed_peers,
+                            );
+                        }
+                    }
+                    CouplingError::BlobPeerFailure(msg) => {
+                        tracing::debug!(?batch_id, msg, "Blob peer failure");
+                    }
+                    CouplingError::InternalError(msg) => {
+                        error!(?batch_id, msg, "Block components coupling internal error");
+                    }
+                }
+            }
             // A batch could be retried without the peer failing the request (disconnecting/
             // sending an error /timeout) if the peer is removed from the chain for other
             // reasons. Check that this block belongs to the expected peer
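The `DataColumnPeerFailure` arm above splits the reported `(column, peer)` pairs into a set of failed columns and a set of failed peers before penalising the peers and retrying. A hypothetical, self-contained equivalent of that split using `Iterator::unzip` (u64 and &str stand in for `ColumnIndex` and `PeerId`):

    use std::collections::HashSet;

    fn main() {
        // Hypothetical faulty (column, peer) pairs, as reported by a coupling error.
        let faulty: Vec<(u64, &str)> = vec![(3, "peer-a"), (7, "peer-a"), (9, "peer-b")];

        // Collect the pairs into two sets in one pass.
        let (failed_columns, failed_peers): (HashSet<u64>, HashSet<&str>) =
            faulty.iter().copied().unzip();

        assert_eq!(failed_columns.len(), 3); // columns 3, 7, 9
        assert_eq!(failed_peers.len(), 2); // peer-a, peer-b
    }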
@@ -834,12 +877,16 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
         network: &mut SyncNetworkContext<T>,
         batch_id: BatchId,
     ) -> Result<(), BackFillError> {
+        if matches!(self.state(), BackFillState::Paused) {
+            return Err(BackFillError::Paused);
+        }
         if let Some(batch) = self.batches.get_mut(&batch_id) {
+            debug!(?batch_id, "Sending backfill batch");
             let synced_peers = self
                 .network_globals
                 .peers
                 .read()
-                .synced_peers()
+                .synced_peers_for_epoch(batch_id, None)
                 .cloned()
                 .collect::<HashSet<_>>();

@@ -898,6 +945,53 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
         Ok(())
     }

+    /// Retries partial column requests within the batch by creating new requests for the failed columns.
+    pub fn retry_partial_batch(
+        &mut self,
+        network: &mut SyncNetworkContext<T>,
+        batch_id: BatchId,
+        id: Id,
+        failed_columns: HashSet<ColumnIndex>,
+        mut failed_peers: HashSet<PeerId>,
+    ) -> Result<(), BackFillError> {
+        if let Some(batch) = self.batches.get_mut(&batch_id) {
+            failed_peers.extend(&batch.failed_peers());
+            let req = batch.to_blocks_by_range_request().0;
+
+            let synced_peers = network
+                .network_globals()
+                .peers
+                .read()
+                .synced_peers_for_epoch(batch_id, None)
+                .cloned()
+                .collect::<HashSet<_>>();
+
+            match network.retry_columns_by_range(
+                id,
+                &synced_peers,
+                &failed_peers,
+                req,
+                &failed_columns,
+            ) {
+                Ok(_) => {
+                    debug!(
+                        ?batch_id,
+                        id, "Retried column requests from different peers"
+                    );
+                    return Ok(());
+                }
+                Err(e) => {
+                    debug!(?batch_id, id, e, "Failed to retry partial batch");
+                }
+            }
+        } else {
+            return Err(BackFillError::InvalidSyncState(
+                "Batch should exist to be retried".to_string(),
+            ));
+        }
+        Ok(())
+    }
+
     /// When resuming a chain, this function searches for batches that need to be re-downloaded and
     /// transitions their state to redownload the batch.
     fn resume_batches(&mut self, network: &mut SyncNetworkContext<T>) -> Result<(), BackFillError> {
@@ -973,6 +1067,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
             return None;
         }

+        if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
+            debug!("Waiting for peers to be available on custody column subnets");
+            return None;
+        }
+
         let batch_id = self.to_be_downloaded;
         // this batch could have been included already being an optimistic batch
         match self.batches.entry(batch_id) {
@@ -1005,6 +1104,36 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
         }
     }

+    /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in
+    /// every sampling column subnet.
+    ///
+    /// Returns `true` if peerdas isn't enabled for the epoch.
+    fn good_peers_on_sampling_subnets(
+        &self,
+        epoch: Epoch,
+        network: &SyncNetworkContext<T>,
+    ) -> bool {
+        if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) {
+            // Require peers on all sampling column subnets before sending batches
+            let peers_on_all_custody_subnets = network
+                .network_globals()
+                .sampling_subnets()
+                .iter()
+                .all(|subnet_id| {
+                    let peer_count = network
+                        .network_globals()
+                        .peers
+                        .read()
+                        .good_range_sync_custody_subnet_peers(*subnet_id)
+                        .count();
+                    peer_count > 0
+                });
+            peers_on_all_custody_subnets
+        } else {
+            true
+        }
+    }
+
     /// Resets the start epoch based on the beacon chain.
     ///
     /// This errors if the beacon chain indicates that backfill sync has already completed or is
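`good_peers_on_sampling_subnets` gates backfill batches on having at least one good custody peer in every sampling column subnet, and short-circuits to `true` when PeerDAS is not yet enabled for the epoch. A simplified, self-contained model of that readiness check (a plain map from subnet id to its connected peers stands in for the peerdb):

    use std::collections::{HashMap, HashSet};

    fn main() {
        // Hypothetical view: sampling subnet id -> connected peers assigned to it.
        let mut peers_by_subnet: HashMap<u64, HashSet<&str>> = HashMap::new();
        peers_by_subnet.entry(0).or_default().insert("peer-a");
        peers_by_subnet.entry(1).or_default().insert("peer-b");

        let sampling_subnets = [0u64, 1, 2];

        // Same shape as the check above: every sampling subnet needs at least one peer.
        let ready = sampling_subnets.iter().all(|subnet| {
            peers_by_subnet
                .get(subnet)
                .map(|peers| !peers.is_empty())
                .unwrap_or(false)
        });

        assert!(!ready); // subnet 2 has no peers, so backfill would wait
    }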
