validator-discovery: basic retrying logic (#3059)

* validator_discovery: less flexible, but simpler design

* fix test

* remove unused struct

* smol optimization

* validator_discovery: basic retrying logic

* add a test

* add more tests

* update the guide

* more test logic

* Require at least 2/3 connectivity.

* Fix test.

* Update node/network/gossip-support/src/lib.rs

Co-authored-by: André Silva <123550+andresilva@users.noreply.github.com>

* Update node/network/gossip-support/src/lib.rs

Co-authored-by: André Silva <123550+andresilva@users.noreply.github.com>

Co-authored-by: Robert Klotzner <robert.klotzner@gmx.at>
Co-authored-by: Robert Klotzner <eskimor@users.noreply.github.com>
Co-authored-by: André Silva <123550+andresilva@users.noreply.github.com>
This commit is contained in:
Andronik Ordian
2021-05-20 12:05:44 +02:00
committed by GitHub
parent 933c9ac2bf
commit 2e70f4ea08
9 changed files with 432 additions and 27 deletions
+5
View File
@@ -5755,12 +5755,17 @@ dependencies = [
name = "polkadot-gossip-support" name = "polkadot-gossip-support"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"assert_matches",
"futures 0.3.14", "futures 0.3.14",
"polkadot-node-network-protocol", "polkadot-node-network-protocol",
"polkadot-node-subsystem", "polkadot-node-subsystem",
"polkadot-node-subsystem-test-helpers",
"polkadot-node-subsystem-util", "polkadot-node-subsystem-util",
"polkadot-primitives", "polkadot-primitives",
"sc-keystore",
"sp-application-crypto", "sp-application-crypto",
"sp-core",
"sp-keyring",
"sp-keystore", "sp-keystore",
"tracing", "tracing",
] ]
+2
View File
@@ -509,6 +509,7 @@ where
NetworkBridgeMessage::ConnectToValidators { NetworkBridgeMessage::ConnectToValidators {
validator_ids, validator_ids,
peer_set, peer_set,
failed,
} => { } => {
tracing::trace!( tracing::trace!(
target: LOG_TARGET, target: LOG_TARGET,
@@ -521,6 +522,7 @@ where
let (ns, ads) = validator_discovery.on_request( let (ns, ads) = validator_discovery.on_request(
validator_ids, validator_ids,
peer_set, peer_set,
failed,
network_service, network_service,
authority_discovery_service, authority_discovery_service,
).await; ).await;
@@ -22,6 +22,7 @@ use core::marker::PhantomData;
use std::collections::HashSet; use std::collections::HashSet;
use async_trait::async_trait; use async_trait::async_trait;
use futures::channel::oneshot;
use sc_network::multiaddr::Multiaddr; use sc_network::multiaddr::Multiaddr;
use sc_authority_discovery::Service as AuthorityDiscoveryService; use sc_authority_discovery::Service as AuthorityDiscoveryService;
@@ -81,16 +82,19 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
&mut self, &mut self,
validator_ids: Vec<AuthorityDiscoveryId>, validator_ids: Vec<AuthorityDiscoveryId>,
peer_set: PeerSet, peer_set: PeerSet,
failed: oneshot::Sender<usize>,
mut network_service: N, mut network_service: N,
mut authority_discovery_service: AD, mut authority_discovery_service: AD,
) -> (N, AD) { ) -> (N, AD) {
// collect multiaddress of validators // collect multiaddress of validators
let mut failed_to_resolve: usize = 0;
let mut newly_requested = HashSet::new(); let mut newly_requested = HashSet::new();
for authority in validator_ids.into_iter() { for authority in validator_ids.into_iter() {
let result = authority_discovery_service.get_addresses_by_authority_id(authority.clone()).await; let result = authority_discovery_service.get_addresses_by_authority_id(authority.clone()).await;
if let Some(addresses) = result { if let Some(addresses) = result {
newly_requested.extend(addresses); newly_requested.extend(addresses);
} else { } else {
failed_to_resolve += 1;
tracing::debug!(target: LOG_TARGET, "Authority Discovery couldn't resolve {:?}", authority); tracing::debug!(target: LOG_TARGET, "Authority Discovery couldn't resolve {:?}", authority);
} }
} }
@@ -120,6 +124,8 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
multiaddr_to_remove multiaddr_to_remove
).await; ).await;
let _ = failed.send(failed_to_resolve);
(network_service, authority_discovery_service) (network_service, authority_discovery_service)
} }
} }
@@ -237,22 +243,55 @@ mod tests {
let authority_ids: Vec<_> = ads.by_peer_id.values().cloned().collect(); let authority_ids: Vec<_> = ads.by_peer_id.values().cloned().collect();
futures::executor::block_on(async move { futures::executor::block_on(async move {
let (failed, _) = oneshot::channel();
let (ns, ads) = service.on_request( let (ns, ads) = service.on_request(
vec![authority_ids[0].clone()], vec![authority_ids[0].clone()],
PeerSet::Validation, PeerSet::Validation,
failed,
ns, ns,
ads, ads,
).await; ).await;
let _ = service.on_request( let (failed, _) = oneshot::channel();
let (_, ads) = service.on_request(
vec![authority_ids[1].clone()], vec![authority_ids[1].clone()],
PeerSet::Validation, PeerSet::Validation,
failed,
ns, ns,
ads, ads,
).await; ).await;
let state = &service.state[PeerSet::Validation]; let state = &service.state[PeerSet::Validation];
assert_eq!(state.previously_requested.len(), 1); assert_eq!(state.previously_requested.len(), 1);
assert!(state.previously_requested.contains(ads.by_authority_id.get(&authority_ids[1]).unwrap()));
});
}
#[test]
fn failed_resolution_is_reported_properly() {
let mut service = new_service();
let (ns, ads) = new_network();
let authority_ids: Vec<_> = ads.by_peer_id.values().cloned().collect();
futures::executor::block_on(async move {
let (failed, failed_rx) = oneshot::channel();
let unknown = Sr25519Keyring::Ferdie.public().into();
let (_, ads) = service.on_request(
vec![authority_ids[0].clone(), unknown],
PeerSet::Validation,
failed,
ns,
ads,
).await;
let state = &service.state[PeerSet::Validation];
assert_eq!(state.previously_requested.len(), 1);
assert!(state.previously_requested.contains(ads.by_authority_id.get(&authority_ids[0]).unwrap()));
let failed = failed_rx.await.unwrap();
assert_eq!(failed, 1);
}); });
} }
} }
@@ -455,8 +455,11 @@ async fn connect_to_validators(
ctx: &mut impl SubsystemContext, ctx: &mut impl SubsystemContext,
validator_ids: Vec<AuthorityDiscoveryId>, validator_ids: Vec<AuthorityDiscoveryId>,
) { ) {
// ignore address resolution failure
// will reissue a new request on new collation
let (failed, _) = oneshot::channel();
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators { ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids, peer_set: PeerSet::Collation, validator_ids, peer_set: PeerSet::Collation, failed,
})).await; })).await;
} }
@@ -15,3 +15,12 @@ polkadot-primitives = { path = "../../../primitives" }
futures = "0.3.8" futures = "0.3.8"
tracing = "0.1.25" tracing = "0.1.25"
[dev-dependencies]
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master", features = ["std"] }
sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" }
sc-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" }
polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" }
assert_matches = "1.4.0"
@@ -18,7 +18,10 @@
//! and issuing a connection request to the validators relevant to //! and issuing a connection request to the validators relevant to
//! the gossiping subsystems on every new session. //! the gossiping subsystems on every new session.
use futures::FutureExt as _; #[cfg(test)]
mod tests;
use futures::{channel::oneshot, FutureExt as _};
use polkadot_node_subsystem::{ use polkadot_node_subsystem::{
messages::{ messages::{
AllMessages, GossipSupportMessage, NetworkBridgeMessage, AllMessages, GossipSupportMessage, NetworkBridgeMessage,
@@ -44,6 +47,7 @@ pub struct GossipSupport {
#[derive(Default)] #[derive(Default)]
struct State { struct State {
last_session_index: Option<SessionIndex>, last_session_index: Option<SessionIndex>,
force_request: bool,
} }
impl GossipSupport { impl GossipSupport {
@@ -54,12 +58,18 @@ impl GossipSupport {
} }
} }
#[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))] async fn run<Context>(self, ctx: Context)
async fn run<Context>(self, mut ctx: Context)
where where
Context: SubsystemContext<Message = GossipSupportMessage>, Context: SubsystemContext<Message = GossipSupportMessage>,
{ {
let mut state = State::default(); let mut state = State::default();
self.run_inner(ctx, &mut state).await;
}
async fn run_inner<Context>(self, mut ctx: Context, state: &mut State)
where
Context: SubsystemContext<Message = GossipSupportMessage>,
{
let Self { keystore } = self; let Self { keystore } = self;
loop { loop {
let message = match ctx.recv().await { let message = match ctx.recv().await {
@@ -128,13 +138,16 @@ pub async fn connect_to_authorities(
ctx: &mut impl SubsystemContext, ctx: &mut impl SubsystemContext,
validator_ids: Vec<AuthorityDiscoveryId>, validator_ids: Vec<AuthorityDiscoveryId>,
peer_set: PeerSet, peer_set: PeerSet,
) { ) -> oneshot::Receiver<usize> {
let (failed, failed_rx) = oneshot::channel();
ctx.send_message(AllMessages::NetworkBridge( ctx.send_message(AllMessages::NetworkBridge(
NetworkBridgeMessage::ConnectToValidators { NetworkBridgeMessage::ConnectToValidators {
validator_ids, validator_ids,
peer_set, peer_set,
failed,
} }
)).await; )).await;
failed_rx
} }
impl State { impl State {
@@ -150,7 +163,7 @@ impl State {
for leaf in leaves { for leaf in leaves {
let current_index = util::request_session_index_for_child(leaf, ctx.sender()).await.await??; let current_index = util::request_session_index_for_child(leaf, ctx.sender()).await.await??;
let maybe_new_session = match self.last_session_index { let maybe_new_session = match self.last_session_index {
Some(i) if i >= current_index => None, Some(i) if current_index <= i && !self.force_request => None,
_ => Some((current_index, leaf)), _ => Some((current_index, leaf)),
}; };
@@ -158,15 +171,22 @@ impl State {
tracing::debug!(target: LOG_TARGET, %new_session, "New session detected"); tracing::debug!(target: LOG_TARGET, %new_session, "New session detected");
let authorities = determine_relevant_authorities(ctx, relay_parent).await?; let authorities = determine_relevant_authorities(ctx, relay_parent).await?;
ensure_i_am_an_authority(keystore, &authorities).await?; ensure_i_am_an_authority(keystore, &authorities).await?;
tracing::debug!(target: LOG_TARGET, num = ?authorities.len(), "Issuing a connection request"); let num = authorities.len();
tracing::debug!(target: LOG_TARGET, %num, "Issuing a connection request");
connect_to_authorities( let failures = connect_to_authorities(
ctx, ctx,
authorities, authorities,
PeerSet::Validation, PeerSet::Validation,
).await; ).await;
// we await for the request to be processed
// this is fine, it should take much less time than one session
let failures = failures.await.unwrap_or(num);
self.last_session_index = Some(new_session); self.last_session_index = Some(new_session);
// issue another request if at least a third of the authorities were not resolved
self.force_request = failures >= num / 3;
} }
} }
@@ -0,0 +1,319 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Unit tests for Gossip Support Subsystem.
use super::*;
use polkadot_node_subsystem::{
jaeger, ActivatedLeaf,
messages::{RuntimeApiMessage, RuntimeApiRequest},
};
use polkadot_node_subsystem_test_helpers as test_helpers;
use polkadot_node_subsystem_util::TimeoutExt as _;
use sc_keystore::LocalKeystore;
use sp_keyring::Sr25519Keyring;
use sp_keystore::SyncCryptoStore;
use std::sync::Arc;
use std::time::Duration;
use assert_matches::assert_matches;
use futures::{Future, executor, future};
type VirtualOverseer = test_helpers::TestSubsystemContextHandle<GossipSupportMessage>;
fn test_harness<T: Future<Output = VirtualOverseer>>(
mut state: State,
test_fn: impl FnOnce(VirtualOverseer) -> T,
) -> State {
let pool = sp_core::testing::TaskExecutor::new();
let (context, virtual_overseer) = test_helpers::make_subsystem_context(pool.clone());
let keystore = make_ferdie_keystore();
let subsystem = GossipSupport::new(keystore);
{
let subsystem = subsystem.run_inner(context, &mut state);
let test_fut = test_fn(virtual_overseer);
futures::pin_mut!(test_fut);
futures::pin_mut!(subsystem);
executor::block_on(future::join(async move {
let mut overseer = test_fut.await;
overseer
.send(FromOverseer::Signal(OverseerSignal::Conclude))
.timeout(TIMEOUT)
.await
.expect("Conclude send timeout");
}, subsystem));
}
state
}
const TIMEOUT: Duration = Duration::from_millis(100);
async fn overseer_signal_active_leaves(
overseer: &mut VirtualOverseer,
leaf: Hash,
) {
let leaf = ActivatedLeaf {
hash: leaf,
number: 0xdeadcafe,
span: Arc::new(jaeger::Span::Disabled),
};
overseer
.send(FromOverseer::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(leaf))))
.timeout(TIMEOUT)
.await
.expect("signal send timeout");
}
async fn overseer_recv(
overseer: &mut VirtualOverseer,
) -> AllMessages {
let msg = overseer
.recv()
.timeout(TIMEOUT)
.await
.expect("msg recv timeout");
msg
}
fn make_ferdie_keystore() -> SyncCryptoStorePtr {
let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory());
SyncCryptoStore::sr25519_generate_new(
&*keystore,
AuthorityDiscoveryId::ID,
Some(&Sr25519Keyring::Ferdie.to_seed()),
)
.expect("Insert key into keystore");
keystore
}
fn authorities() -> Vec<AuthorityDiscoveryId> {
vec![
Sr25519Keyring::Alice.public().into(),
Sr25519Keyring::Bob.public().into(),
Sr25519Keyring::Charlie.public().into(),
Sr25519Keyring::Ferdie.public().into(),
Sr25519Keyring::Eve.public().into(),
Sr25519Keyring::One.public().into(),
]
}
#[test]
fn issues_a_connection_request_on_new_session() {
let hash = Hash::repeat_byte(0xAA);
let state = test_harness(State::default(), |mut virtual_overseer| async move {
let overseer = &mut virtual_overseer;
overseer_signal_active_leaves(overseer, hash).await;
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(1)).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::Authorities(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(authorities())).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids,
peer_set,
failed,
}) => {
assert_eq!(validator_ids, authorities());
assert_eq!(peer_set, PeerSet::Validation);
failed.send(0).unwrap();
}
);
virtual_overseer
});
assert_eq!(state.last_session_index, Some(1));
assert!(!state.force_request);
// does not issue on the same session
let hash = Hash::repeat_byte(0xBB);
let state = test_harness(state, |mut virtual_overseer| async move {
let overseer = &mut virtual_overseer;
overseer_signal_active_leaves(overseer, hash).await;
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(1)).unwrap();
}
);
virtual_overseer
});
assert_eq!(state.last_session_index, Some(1));
assert!(!state.force_request);
// does on the new one
let hash = Hash::repeat_byte(0xCC);
let state = test_harness(state, |mut virtual_overseer| async move {
let overseer = &mut virtual_overseer;
overseer_signal_active_leaves(overseer, hash).await;
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(2)).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::Authorities(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(authorities())).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids,
peer_set,
failed,
}) => {
assert_eq!(validator_ids, authorities());
assert_eq!(peer_set, PeerSet::Validation);
failed.send(0).unwrap();
}
);
virtual_overseer
});
assert_eq!(state.last_session_index, Some(2));
assert!(!state.force_request);
}
#[test]
fn issues_a_connection_request_when_last_request_was_mostly_unresolved() {
let hash = Hash::repeat_byte(0xAA);
let state = test_harness(State::default(), |mut virtual_overseer| async move {
let overseer = &mut virtual_overseer;
overseer_signal_active_leaves(overseer, hash).await;
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(1)).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::Authorities(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(authorities())).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids,
peer_set,
failed,
}) => {
assert_eq!(validator_ids, authorities());
assert_eq!(peer_set, PeerSet::Validation);
failed.send(2).unwrap();
}
);
virtual_overseer
});
assert_eq!(state.last_session_index, Some(1));
assert!(state.force_request);
let hash = Hash::repeat_byte(0xBB);
let state = test_harness(state, |mut virtual_overseer| async move {
let overseer = &mut virtual_overseer;
overseer_signal_active_leaves(overseer, hash).await;
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(1)).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::Authorities(tx),
)) => {
assert_eq!(relay_parent, hash);
tx.send(Ok(authorities())).unwrap();
}
);
assert_matches!(
overseer_recv(overseer).await,
AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids,
peer_set,
failed,
}) => {
assert_eq!(validator_ids, authorities());
assert_eq!(peer_set, PeerSet::Validation);
failed.send(1).unwrap();
}
);
virtual_overseer
});
assert_eq!(state.last_session_index, Some(1));
assert!(!state.force_request);
}
+3
View File
@@ -253,6 +253,9 @@ pub enum NetworkBridgeMessage {
validator_ids: Vec<AuthorityDiscoveryId>, validator_ids: Vec<AuthorityDiscoveryId>,
/// The underlying protocol to use for this request. /// The underlying protocol to use for this request.
peer_set: PeerSet, peer_set: PeerSet,
/// Sends back the number of `AuthorityDiscoveryId`s which
/// authority discovery has failed to resolve.
failed: oneshot::Sender<usize>,
}, },
} }
@@ -156,18 +156,18 @@ enum AvailabilityDistributionMessage {
/// ///
/// NOTE: The result of this fetch is not yet locally validated and could be bogus. /// NOTE: The result of this fetch is not yet locally validated and could be bogus.
FetchPoV { FetchPoV {
/// The relay parent giving the necessary context. /// The relay parent giving the necessary context.
relay_parent: Hash, relay_parent: Hash,
/// Validator to fetch the PoV from. /// Validator to fetch the PoV from.
from_validator: ValidatorIndex, from_validator: ValidatorIndex,
/// Candidate hash to fetch the PoV for. /// Candidate hash to fetch the PoV for.
candidate_hash: CandidateHash, candidate_hash: CandidateHash,
/// Expected hash of the PoV, a PoV not matching this hash will be rejected. /// Expected hash of the PoV, a PoV not matching this hash will be rejected.
pov_hash: Hash, pov_hash: Hash,
/// Sender for getting back the result of this fetch. /// Sender for getting back the result of this fetch.
/// ///
/// The sender will be canceled if the fetching failed for some reason. /// The sender will be canceled if the fetching failed for some reason.
tx: oneshot::Sender<PoV>, tx: oneshot::Sender<PoV>,
}, },
} }
``` ```
@@ -446,17 +446,22 @@ enum NetworkBridgeMessage {
/// Connect to peers who represent the given `validator_ids`. /// Connect to peers who represent the given `validator_ids`.
/// ///
/// Also ask the network to stay connected to these peers at least /// Also ask the network to stay connected to these peers at least
/// until the request is revoked. /// until a new request is issued.
/// This can be done by dropping the receiver. ///
/// Because it overrides the previous request, it must be ensured
/// that `validator_ids` include all peers the subsystems
/// are interested in (per `PeerSet`).
///
/// A caller can learn about validator connections by listening to the
/// `PeerConnected` events from the network bridge.
ConnectToValidators { ConnectToValidators {
/// Ids of the validators to connect to. /// Ids of the validators to connect to.
validator_ids: Vec<AuthorityDiscoveryId>, validator_ids: Vec<AuthorityDiscoveryId>,
/// The underlying protocol to use for this request. /// The underlying protocol to use for this request.
peer_set: PeerSet, peer_set: PeerSet,
/// Response sender by which the issuer can learn the `PeerId`s of /// Sends back the number of `AuthorityDiscoveryId`s which
/// the validators as they are connected. /// authority discovery has failed to resolve.
/// The response is sent immediately for already connected peers. failed: oneshot::Sender<usize>,
connected: ResponseStream<(AuthorityDiscoveryId, PeerId)>,
}, },
} }
``` ```