approval-distribution: Update topology if authorities are discovered later (#2981)

Fixes: https://github.com/paritytech/polkadot-sdk/issues/2138.

Especially on restart AuthorithyDiscovery cache is not populated so we
create an invalid topology and messages won't be routed correctly for
the entire session. This PR proposes to try to fix this by updating the
topology as soon as we now the Authority/PeerId mapping, that should
impact the situation dramatically.


[This issue was hit
yesterday](https://grafana.teleport.parity.io/goto/o9q2625Sg?orgId=1),
on Westend and resulted in stalling the finality.


# TODO

- [x] Unit tests
- [x] Test impact on versi

---------

Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>
This commit is contained in:
Alexandru Gheorghe
2024-01-25 12:58:37 +02:00
committed by GitHub
parent b57e53dc13
commit a6952c7469
6 changed files with 473 additions and 42 deletions
@@ -667,9 +667,12 @@ impl State {
rng: &mut (impl CryptoRng + Rng),
) {
match event {
NetworkBridgeEvent::PeerConnected(peer_id, role, version, _) => {
NetworkBridgeEvent::PeerConnected(peer_id, role, version, authority_ids) => {
gum::trace!(target: LOG_TARGET, ?peer_id, ?role, ?authority_ids, "Peer connected");
if let Some(authority_ids) = authority_ids {
self.topologies.update_authority_ids(peer_id, &authority_ids);
}
// insert a blank view if none already present
gum::trace!(target: LOG_TARGET, ?peer_id, ?role, "Peer connected");
self.peer_views
.entry(peer_id)
.or_insert(PeerEntry { view: Default::default(), version });
@@ -716,8 +719,41 @@ impl State {
NetworkBridgeEvent::PeerMessage(peer_id, message) => {
self.process_incoming_peer_message(ctx, metrics, peer_id, message, rng).await;
},
NetworkBridgeEvent::UpdatedAuthorityIds { .. } => {
// The approval-distribution subsystem doesn't deal with `AuthorityDiscoveryId`s.
NetworkBridgeEvent::UpdatedAuthorityIds(peer_id, authority_ids) => {
gum::debug!(target: LOG_TARGET, ?peer_id, ?authority_ids, "Update Authority Ids");
// If we learn about a new PeerId for an authority ids we need to try to route the
// messages that should have sent to that validator according to the topology.
if self.topologies.update_authority_ids(peer_id, &authority_ids) {
if let Some(PeerEntry { view, version }) = self.peer_views.get(&peer_id) {
let intersection = self
.blocks_by_number
.iter()
.filter(|(block_number, _)| *block_number > &view.finalized_number)
.flat_map(|(_, hashes)| {
hashes.iter().filter(|hash| {
self.blocks
.get(&hash)
.map(|block| block.known_by.get(&peer_id).is_some())
.unwrap_or_default()
})
});
let view_intersection =
View::new(intersection.cloned(), view.finalized_number);
Self::unify_with_peer(
ctx.sender(),
metrics,
&mut self.blocks,
&self.topologies,
self.peer_views.len(),
peer_id,
*version,
view_intersection,
rng,
true,
)
.await;
}
}
},
}
}
@@ -789,6 +825,7 @@ impl State {
*version,
view_intersection,
rng,
false,
)
.await;
}
@@ -1101,6 +1138,7 @@ impl State {
protocol_version,
view,
rng,
false,
)
.await;
}
@@ -1838,6 +1876,7 @@ impl State {
protocol_version: ProtocolVersion,
view: View,
rng: &mut (impl CryptoRng + Rng),
retry_known_blocks: bool,
) {
metrics.on_unify_with_peer();
let _timer = metrics.time_unify_with_peer();
@@ -1856,10 +1895,12 @@ impl State {
_ => break,
};
// Any peer which is in the `known_by` set has already been
// sent all messages it's meant to get for that block and all
// in-scope prior blocks.
if entry.known_by.contains_key(&peer_id) {
// Any peer which is in the `known_by` see and we know its peer_id authorithy id
// mapping has already been sent all messages it's meant to get for that block and
// all in-scope prior blocks. In case, we just learnt about its peer_id
// authorithy-id mapping we have to retry sending the messages that should be sent
// to it for all un-finalized blocks.
if entry.known_by.contains_key(&peer_id) && !retry_known_blocks {
break
}