Log info about low connectivity and unreachable validators (#3916)

* Attempt to add log stats to gossip-support.

* WIP: Keep track of connected validators.

* Clarify metric.

* WIP: Make gossip support report connectivity.

* WIP: Fixing tests.

* Fix network bridge + integrate in overseer.

* Consistent naming.

* Fix logic error

* cargo fmt

* Pretty logs.

* cargo fmt

* Use `Delay` to trigger periodic checks.

* fmt

* Fix warning for authority set size of 1.

* More correct ratio report if there are no resolved validators.

* Prettier rendering of empty set.

* Fix typo.

* Another typo.

* Don't check on every leaf update.

* Make compatible with older rustc.

* Fix tests.

* Demote warning.
This commit is contained in:
Robert Klotzner
2021-09-27 16:30:02 +02:00
committed by GitHub
parent fdebbbf4b3
commit 7c3b3c4a59
11 changed files with 635 additions and 315 deletions
+22 -1
View File
@@ -198,7 +198,7 @@ impl metrics::Metrics for Metrics {
prometheus::GaugeVec::new(
prometheus::Opts::new(
"parachain_desired_peer_count",
"The number of peers that the local node is expected to connect to on a parachain-related peer-set",
"The number of peers that the local node is expected to connect to on a parachain-related peer-set (either including or not including unresolvable authorities, depending on whether `ConnectToValidators` or `ConnectToValidatorsResolved` was used.)",
),
&["protocol"]
)?,
@@ -552,6 +552,27 @@ where
network_service = ns;
authority_discovery_service = ads;
}
NetworkBridgeMessage::ConnectToResolvedValidators {
validator_addrs,
peer_set,
} => {
tracing::trace!(
target: LOG_TARGET,
action = "ConnectToPeers",
peer_set = ?peer_set,
?validator_addrs,
"Received a resolved validator connection request",
);
metrics.note_desired_peer_count(peer_set, validator_addrs.len());
let all_addrs = validator_addrs.into_iter().flatten().collect();
network_service = validator_discovery.on_resolved_request(
all_addrs,
peer_set,
network_service,
).await;
}
NetworkBridgeMessage::NewGossipTopology {
our_neighbors,
} => {
+13 -3
View File
@@ -37,7 +37,8 @@ use polkadot_primitives::v1::AuthorityDiscoveryId;
use polkadot_subsystem::{
jaeger,
messages::{
ApprovalDistributionMessage, BitfieldDistributionMessage, StatementDistributionMessage,
ApprovalDistributionMessage, BitfieldDistributionMessage, GossipSupportMessage,
StatementDistributionMessage,
},
ActiveLeavesUpdate, FromOverseer, LeafStatus, OverseerSignal,
};
@@ -337,6 +338,13 @@ async fn assert_sends_validation_event_to_all(
ApprovalDistributionMessage::NetworkBridgeUpdateV1(e)
) if e == event.focus().expect("could not focus message")
);
assert_matches!(
virtual_overseer.recv().await,
AllMessages::GossipSupport(
GossipSupportMessage::NetworkBridgeUpdateV1(e)
) if e == event.focus().expect("could not focus message")
);
}
async fn assert_sends_collation_event_to_all(
@@ -1189,7 +1197,7 @@ fn send_messages_to_peers() {
fn spread_event_to_subsystems_is_up_to_date() {
// Number of subsystems expected to be interested in a network event,
// and hence the network event broadcasted to.
const EXPECTED_COUNT: usize = 3;
const EXPECTED_COUNT: usize = 4;
let mut cnt = 0_usize;
for msg in AllMessages::dispatch_iter(NetworkBridgeEvent::PeerDisconnected(PeerId::random())) {
@@ -1219,7 +1227,9 @@ fn spread_event_to_subsystems_is_up_to_date() {
AllMessages::ApprovalDistribution(_) => {
cnt += 1;
},
AllMessages::GossipSupport(_) => unreachable!("Not interested in network events"),
AllMessages::GossipSupport(_) => {
cnt += 1;
},
AllMessages::DisputeCoordinator(_) => unreachable!("Not interested in network events"),
AllMessages::DisputeParticipation(_) =>
unreachable!("Not interested in network events"),
@@ -47,6 +47,44 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
Self { state: Default::default(), _phantom: PhantomData }
}
/// Issue a peer set update for addresses that have already been resolved.
///
/// The given set is compared against the addresses stored from the previous
/// request for `peer_set`: addresses only present in the new set are added to
/// the network's peer set, addresses only present in the old set are removed
/// from it. The new set then replaces the stored one.
///
/// Takes ownership of `network_service` and hands it back once both updates
/// have been issued.
pub async fn on_resolved_request(
	&mut self,
	newly_requested: HashSet<Multiaddr>,
	peer_set: PeerSet,
	mut network_service: N,
) -> N {
	let peer_set_state = &mut self.state[peer_set];

	// Work out the delta between the previous request and this one.
	let (to_add, to_remove) = {
		let previous = &peer_set_state.previously_requested;
		let to_add = newly_requested.difference(previous).cloned().collect::<HashSet<_>>();
		// revoked requests: asked for last time, but not anymore
		let to_remove = previous.difference(&newly_requested).cloned().collect::<HashSet<_>>();
		(to_add, to_remove)
	};
	peer_set_state.previously_requested = newly_requested;

	tracing::debug!(
		target: LOG_TARGET,
		?peer_set,
		added = to_add.len(),
		removed = to_remove.len(),
		"New ConnectToValidators resolved request",
	);

	// Ask the network to connect to the new nodes and to stay connected
	// to them until they are removed from the set again.
	let add_result = network_service
		.add_to_peers_set(peer_set.into_protocol_name(), to_add)
		.await;
	if let Err(e) = add_result {
		tracing::warn!(target: LOG_TARGET, err = ?e, "AuthorityDiscoveryService returned an invalid multiaddress");
	}

	// The addresses are known to be valid, hence the result is ignored here.
	let _ = network_service
		.remove_from_peers_set(peer_set.into_protocol_name(), to_remove)
		.await;

	network_service
}
/// On a new connection request, a peer set update will be issued.
/// It will ask the network to connect to the validators and not disconnect
/// from them at least until the next request is issued for the same peer set.
@@ -59,7 +97,7 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
validator_ids: Vec<AuthorityDiscoveryId>,
peer_set: PeerSet,
failed: oneshot::Sender<usize>,
mut network_service: N,
network_service: N,
mut authority_discovery_service: AD,
) -> (N, AD) {
// collect multiaddress of validators
@@ -82,39 +120,19 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
}
}
let state = &mut self.state[peer_set];
// clean up revoked requests
let multiaddr_to_remove: HashSet<_> =
state.previously_requested.difference(&newly_requested).cloned().collect();
let multiaddr_to_add: HashSet<_> =
newly_requested.difference(&state.previously_requested).cloned().collect();
state.previously_requested = newly_requested;
tracing::debug!(
target: LOG_TARGET,
?peer_set,
?requested,
added = multiaddr_to_add.len(),
removed = multiaddr_to_remove.len(),
?failed_to_resolve,
"New ConnectToValidators request",
);
// ask the network to connect to these nodes and not disconnect
// from them until removed from the set
if let Err(e) = network_service
.add_to_peers_set(peer_set.into_protocol_name(), multiaddr_to_add)
.await
{
tracing::warn!(target: LOG_TARGET, err = ?e, "AuthorityDiscoveryService returned an invalid multiaddress");
}
// the addresses are known to be valid
let _ = network_service
.remove_from_peers_set(peer_set.into_protocol_name(), multiaddr_to_remove)
.await;
let r = self.on_resolved_request(newly_requested, peer_set, network_service).await;
let _ = failed.send(failed_to_resolve);
(network_service, authority_discovery_service)
(r, authority_discovery_service)
}
}