Skip to content

Commit e733978

Browse files
collator-protocol: don't disconnect on slot end when we have pending collations (#10446)
We pre-connect one slot in advance, and we disconnect when our slot ends even when we still have pending collations, which is wrong. This change fixes the issue by keeping the connections open until the collation is requested or the relay parent expires. --------- Signed-off-by: Andrei Sandu <[email protected]> Co-authored-by: cmd[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 3f69cfc commit e733978

File tree

3 files changed

+112
-17
lines changed

3 files changed

+112
-17
lines changed

polkadot/node/network/collator-protocol/src/collator_side/mod.rs

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -697,10 +697,14 @@ fn has_assigned_cores(
697697
false
698698
}
699699

700+
/// Get a list of backing validators at all allowed relay parents.
701+
/// If `pending_collation` is true, we will only return the validators
702+
/// that have a collation pending.
700703
fn list_of_backing_validators_in_view(
701704
implicit_view: &Option<ImplicitView>,
702705
per_relay_parent: &HashMap<Hash, PerRelayParent>,
703706
para_id: ParaId,
707+
pending_collation: bool,
704708
) -> Vec<AuthorityDiscoveryId> {
705709
let mut backing_validators = HashSet::new();
706710
let Some(implicit_view) = implicit_view else { return vec![] };
@@ -711,7 +715,17 @@ fn list_of_backing_validators_in_view(
711715
.unwrap_or_default();
712716

713717
for allowed_relay_parent in allowed_ancestry {
714-
if let Some(relay_parent) = per_relay_parent.get(allowed_relay_parent) {
718+
let Some(relay_parent) = per_relay_parent.get(allowed_relay_parent) else { continue };
719+
720+
if pending_collation {
721+
// Check if there is any collation for this relay parent.
722+
for collation_data in relay_parent.collations.values() {
723+
let core_index = collation_data.core_index();
724+
if let Some(group) = relay_parent.validator_group.get(core_index) {
725+
backing_validators.extend(group.validators.iter().cloned());
726+
}
727+
}
728+
} else {
715729
for group in relay_parent.validator_group.values() {
716730
backing_validators.extend(group.validators.iter().cloned());
717731
}
@@ -743,7 +757,7 @@ async fn update_validator_connections<Context>(
743757
// to the network bridge passing an empty list of validator ids. Otherwise, it will keep
744758
// connecting to the last requested validators until a new request is issued.
745759
let validator_ids = if cores_assigned {
746-
list_of_backing_validators_in_view(implicit_view, per_relay_parent, para_id)
760+
list_of_backing_validators_in_view(implicit_view, per_relay_parent, para_id, false)
747761
} else {
748762
Vec::new()
749763
};
@@ -764,15 +778,18 @@ async fn update_validator_connections<Context>(
764778
return
765779
}
766780

781+
let validator_ids =
782+
list_of_backing_validators_in_view(implicit_view, per_relay_parent, para_id, true);
783+
767784
gum::trace!(
768785
target: LOG_TARGET,
769-
"Disconnecting from validators: {:?}",
770-
peer_ids.keys(),
786+
"Keeping connections to validators with pending collations: {:?}",
787+
validator_ids,
771788
);
772789

773-
// Disconnect from all connected validators on the `Collation` protocol.
790+
// Disconnect from all validators with no pending collations.
774791
NetworkBridgeTxMessage::ConnectToValidators {
775-
validator_ids: vec![],
792+
validator_ids,
776793
peer_set: PeerSet::Collation,
777794
failed,
778795
}

polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs

Lines changed: 80 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,7 @@ async fn distribute_collation(
415415
expected_connected: Vec<AuthorityDiscoveryId>,
416416
test_state: &TestState,
417417
relay_parent: Hash,
418+
core_index: CoreIndex,
418419
) -> DistributeCollation {
419420
// Now we want to distribute a `PoVBlock`
420421
let pov_block = PoV { block_data: BlockData(vec![42, 43, 44]) };
@@ -426,6 +427,7 @@ async fn distribute_collation(
426427
para_id: test_state.para_id,
427428
relay_parent,
428429
pov_hash,
430+
core_index,
429431
..Default::default()
430432
}
431433
.build();
@@ -484,10 +486,9 @@ async fn expect_advertise_collation_msg(
484486
virtual_overseer: &mut VirtualOverseer,
485487
any_peers: &[PeerId],
486488
expected_relay_parent: Hash,
487-
expected_candidate_hashes: Vec<CandidateHash>,
489+
mut expected_candidate_hashes: Vec<CandidateHash>,
488490
) {
489-
let mut candidate_hashes: HashSet<_> = expected_candidate_hashes.into_iter().collect();
490-
let iter_num = candidate_hashes.len();
491+
let iter_num = expected_candidate_hashes.len();
491492

492493
for _ in 0..iter_num {
493494
assert_matches!(
@@ -511,10 +512,12 @@ async fn expect_advertise_collation_msg(
511512
..
512513
} => {
513514
assert_eq!(relay_parent, expected_relay_parent);
514-
assert!(candidate_hashes.contains(&candidate_hash));
515+
assert!(expected_candidate_hashes.contains(&candidate_hash));
515516

516517
// Drop the hash we've already seen.
517-
candidate_hashes.remove(&candidate_hash);
518+
if let Some(pos) = expected_candidate_hashes.iter().position(|h| h == &candidate_hash) {
519+
expected_candidate_hashes.remove(pos);
520+
}
518521
}
519522
);
520523
},
@@ -584,6 +587,7 @@ fn v1_protocol_rejected() {
584587
test_state.current_group_validator_authority_ids(),
585588
&test_state,
586589
test_state.relay_parent,
590+
CoreIndex(0),
587591
)
588592
.await;
589593

@@ -644,6 +648,7 @@ fn advertise_and_send_collation() {
644648
test_state.current_group_validator_authority_ids(),
645649
&test_state,
646650
test_state.relay_parent,
651+
CoreIndex(0),
647652
)
648653
.await;
649654

@@ -786,6 +791,7 @@ fn advertise_and_send_collation() {
786791
test_state.current_group_validator_authority_ids(),
787792
&test_state,
788793
test_state.relay_parent,
794+
CoreIndex(0),
789795
)
790796
.await;
791797

@@ -848,6 +854,7 @@ fn delay_reputation_change() {
848854
test_state.current_group_validator_authority_ids(),
849855
&test_state,
850856
test_state.relay_parent,
857+
CoreIndex(0),
851858
)
852859
.await;
853860

@@ -1066,6 +1073,7 @@ fn collations_are_only_advertised_to_validators_with_correct_view() {
10661073
test_state.current_group_validator_authority_ids(),
10671074
&test_state,
10681075
test_state.relay_parent,
1076+
CoreIndex(0),
10691077
)
10701078
.await;
10711079

@@ -1140,6 +1148,7 @@ fn collate_on_two_different_relay_chain_blocks() {
11401148
test_state.current_group_validator_authority_ids(),
11411149
&test_state,
11421150
test_state.relay_parent,
1151+
CoreIndex(0),
11431152
)
11441153
.await;
11451154

@@ -1162,6 +1171,7 @@ fn collate_on_two_different_relay_chain_blocks() {
11621171
test_state.current_group_validator_authority_ids(),
11631172
&test_state,
11641173
test_state.relay_parent,
1174+
CoreIndex(0),
11651175
)
11661176
.await;
11671177

@@ -1228,6 +1238,7 @@ fn validator_reconnect_does_not_advertise_a_second_time() {
12281238
test_state.current_group_validator_authority_ids(),
12291239
&test_state,
12301240
test_state.relay_parent,
1241+
CoreIndex(0),
12311242
)
12321243
.await;
12331244

@@ -1358,6 +1369,7 @@ where
13581369
test_state.current_group_validator_authority_ids(),
13591370
&test_state,
13601371
test_state.relay_parent,
1372+
CoreIndex(0),
13611373
)
13621374
.await;
13631375

@@ -1519,6 +1531,7 @@ fn connect_to_group_in_view() {
15191531
test_state.current_group_validator_authority_ids(),
15201532
&test_state,
15211533
test_state.relay_parent,
1534+
CoreIndex(0),
15221535
)
15231536
.await;
15241537

@@ -1604,6 +1617,7 @@ fn connect_to_group_in_view() {
16041617
expected_group,
16051618
&test_state,
16061619
test_state.relay_parent,
1620+
CoreIndex(0),
16071621
)
16081622
.await;
16091623

@@ -1652,6 +1666,7 @@ fn connect_with_no_cores_assigned() {
16521666
test_state.current_group_validator_authority_ids(),
16531667
&test_state,
16541668
test_state.relay_parent,
1669+
CoreIndex(0),
16551670
)
16561671
.await;
16571672

@@ -1803,6 +1818,7 @@ fn distribute_collation_forces_connect() {
18031818
test_state.current_group_validator_authority_ids(),
18041819
&test_state,
18051820
test_state.relay_parent,
1821+
CoreIndex(0),
18061822
)
18071823
.await;
18081824

@@ -1911,7 +1927,7 @@ fn connect_advertise_disconnect_three_backing_groups() {
19111927
let validator_peer_ids: Vec<_> =
19121928
(0..expected_validators.len()).map(|_| PeerId::random()).sorted().collect();
19131929

1914-
for (auth_id, peer_id) in expected_validators.iter().zip(validator_peer_ids.iter()) {
1930+
for (peer_id, auth_id) in validator_peer_ids.iter().zip(expected_validators.iter()) {
19151931
overseer_send(
19161932
&mut virtual_overseer,
19171933
CollatorProtocolMessage::NetworkBridgeUpdate(
@@ -1926,32 +1942,85 @@ fn connect_advertise_disconnect_three_backing_groups() {
19261942
.await;
19271943
}
19281944

1929-
// Expect collation advertisement for each validator
1945+
// Expect declare messages for each validator
19301946
for peer_id in validator_peer_ids.iter() {
19311947
expect_declare_msg(&mut virtual_overseer, &test_state, peer_id).await;
19321948
}
19331949

1950+
// Distribute collations for first 2 cores
1951+
let mut candidate_hashes = HashMap::new();
1952+
for core_idx in [0, 1] {
1953+
let DistributeCollation { candidate, .. } = distribute_collation(
1954+
&mut virtual_overseer,
1955+
expected_validators.clone(),
1956+
&test_state,
1957+
test_state.relay_parent,
1958+
CoreIndex(core_idx),
1959+
)
1960+
.await;
1961+
1962+
// Add the same candidate hash twice we remove them once per validator.
1963+
candidate_hashes.insert(core_idx as usize, vec![candidate.hash(); 2]);
1964+
}
1965+
1966+
// Send peer view changes for all validators to trigger advertisements
1967+
for peer_id in validator_peer_ids.iter() {
1968+
send_peer_view_change(
1969+
&mut virtual_overseer,
1970+
peer_id,
1971+
vec![test_state.relay_parent],
1972+
)
1973+
.await;
1974+
}
1975+
1976+
// Expect advertisements for 2 collations to each validator
1977+
for (idx, peer_ids) in
1978+
validator_peer_ids.iter().take(4).chunks(2).into_iter().enumerate()
1979+
{
1980+
let peer_ids_vec: Vec<PeerId> = peer_ids.copied().collect();
1981+
expect_advertise_collation_msg(
1982+
&mut virtual_overseer,
1983+
&peer_ids_vec,
1984+
test_state.relay_parent,
1985+
candidate_hashes[&idx].clone(),
1986+
)
1987+
.await;
1988+
}
1989+
19341990
// Send the disconnect message
19351991
overseer_send(
19361992
&mut virtual_overseer,
19371993
CollatorProtocolMessage::DisconnectFromBackingGroups,
19381994
)
19391995
.await;
19401996

1941-
// Expect a DisconnectPeers for all connected validators
1997+
// We should disconnect from validator of core 2, but keep the other validators
1998+
// connected
19421999
assert_matches!(
19432000
overseer_recv(&mut virtual_overseer).await,
19442001
AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::ConnectToValidators{
1945-
validator_ids,
2002+
mut validator_ids,
19462003
peer_set,
19472004
failed: _,
19482005
}) => {
1949-
// We should disconnect from all validators we were connected to
1950-
assert_eq!(validator_ids, vec![], "Expected to disconnect from all validators");
2006+
let mut expected: Vec<_> = expected_validators.into_iter().take(4).collect();
2007+
validator_ids.sort();
2008+
expected.sort();
2009+
assert_eq!(validator_ids, expected, "Expected to disconnect validator assigned to core 2");
19512010
assert_eq!(peer_set, PeerSet::Collation);
19522011
}
19532012
);
19542013

2014+
// Update view and expect connections to all validators to be dropped.
2015+
update_view(
2016+
Some(vec![]),
2017+
&test_state,
2018+
&mut virtual_overseer,
2019+
vec![(Hash::random(), 11)],
2020+
1,
2021+
)
2022+
.await;
2023+
19552024
TestHarness { virtual_overseer, req_v2_cfg: req_cfg }
19562025
},
19572026
);

prdoc/pr_10446.prdoc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
title: 'collator-protocol: pre-connect fix'
2+
doc:
3+
- audience: Node Dev
4+
description: |-
5+
Keep the connections to backers open until the relay parent of the advertised
6+
collation expires.
7+
crates:
8+
- name: polkadot-collator-protocol
9+
bump: patch

0 commit comments

Comments
 (0)