mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-23 02:41:08 +00:00
improved gossip topology (#3270)
* gossip-support: gossip topology * some fixes * handle view update for newly added gossip peers * fix neighbors calculation * fix test * resolve TODOs * typo * guide updates * spaces in the guide * sneaky spaces * hash randomness * address some review nits * use unbounded in bridge for subsystem msg
This commit is contained in:
@@ -15,14 +15,23 @@
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! This subsystem is responsible for keeping track of session changes
|
||||
//! and issuing a connection request to the validators relevant to
|
||||
//! the gossiping subsystems on every new session.
|
||||
//! and issuing a connection request to the relevant validators
|
||||
//! on every new session.
|
||||
//!
|
||||
//! In addition to that, it creates a gossip overlay topology
|
||||
//! which limits the amount of messages sent and received
|
||||
//! to be on the order of the square root of the number of validators. Our neighbors
|
||||
//! in this graph will be forwarded to the network bridge with
|
||||
//! the `NetworkBridgeMessage::NewGossipTopology` message.
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
use futures::{channel::oneshot, FutureExt as _};
|
||||
use rand::{SeedableRng, seq::SliceRandom as _};
|
||||
use rand_chacha::ChaCha20Rng;
|
||||
use polkadot_node_subsystem::{
|
||||
messages::{
|
||||
AllMessages, GossipSupportMessage, NetworkBridgeMessage,
|
||||
RuntimeApiMessage, RuntimeApiRequest,
|
||||
},
|
||||
ActiveLeavesUpdate, FromOverseer, OverseerSignal,
|
||||
Subsystem, SpawnedSubsystem, SubsystemContext,
|
||||
@@ -39,8 +48,8 @@ use sp_application_crypto::{Public, AppKey};
|
||||
mod tests;
|
||||
|
||||
const LOG_TARGET: &str = "parachain::gossip-support";
|
||||
// How much time should we wait since the last
|
||||
// authority discovery resolution failure.
|
||||
// How much time should we wait to reissue a connection request
|
||||
// since the last authority discovery resolution failure.
|
||||
const BACKOFF_DURATION: Duration = Duration::from_secs(5);
|
||||
|
||||
/// The Gossip Support subsystem.
|
||||
@@ -85,7 +94,7 @@ impl GossipSupport {
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
err = ?e,
|
||||
"Failed to receive a message from Overseer, exiting"
|
||||
"Failed to receive a message from Overseer, exiting",
|
||||
);
|
||||
return;
|
||||
},
|
||||
@@ -120,28 +129,30 @@ async fn determine_relevant_authorities(
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
authority_count = ?authorities.len(),
|
||||
"Determined relevant authorities"
|
||||
"Determined relevant authorities",
|
||||
);
|
||||
Ok(authorities)
|
||||
}
|
||||
|
||||
/// Return an error if we're not a validator in the given set (do not have keys).
|
||||
/// Otherwise, returns the index of our keys in `authorities`.
|
||||
async fn ensure_i_am_an_authority(
|
||||
keystore: &SyncCryptoStorePtr,
|
||||
authorities: &[AuthorityDiscoveryId],
|
||||
) -> Result<(), util::Error> {
|
||||
for v in authorities {
|
||||
if CryptoStore::has_keys(&**keystore, &[(v.to_raw_vec(), AuthorityDiscoveryId::ID)])
|
||||
.await
|
||||
{
|
||||
return Ok(());
|
||||
) -> Result<usize, util::Error> {
|
||||
for (i, v) in authorities.iter().enumerate() {
|
||||
if CryptoStore::has_keys(
|
||||
&**keystore,
|
||||
&[(v.to_raw_vec(), AuthorityDiscoveryId::ID)]
|
||||
).await {
|
||||
return Ok(i);
|
||||
}
|
||||
}
|
||||
Err(util::Error::NotAValidator)
|
||||
}
|
||||
|
||||
/// A helper function for making a `ConnectToValidators` request.
|
||||
pub async fn connect_to_authorities(
|
||||
async fn connect_to_authorities(
|
||||
ctx: &mut impl SubsystemContext,
|
||||
validator_ids: Vec<AuthorityDiscoveryId>,
|
||||
peer_set: PeerSet,
|
||||
@@ -157,6 +168,79 @@ pub async fn connect_to_authorities(
|
||||
failed_rx
|
||||
}
|
||||
|
||||
/// We partition the list of all sorted `authorities` into sqrt(len) groups of sqrt(len) size
|
||||
/// and form a matrix where each validator is connected to all validators in its row and column.
|
||||
/// This is similar to [web3] research proposed topology, except for the groups are not parachain
|
||||
/// groups (because not all validators are parachain validators and the group size is small),
|
||||
/// but formed randomly via BABE randomness from two epochs ago.
|
||||
/// This limits the amount of gossip peers to 2 * sqrt(len) and ensures the diameter of 2.
|
||||
///
|
||||
/// [web3]: https://research.web3.foundation/en/latest/polkadot/networking/3-avail-valid.html#topology
|
||||
async fn update_gossip_topology(
|
||||
ctx: &mut impl SubsystemContext,
|
||||
our_index: usize,
|
||||
authorities: Vec<AuthorityDiscoveryId>,
|
||||
relay_parent: Hash,
|
||||
) -> Result<(), util::Error> {
|
||||
// retrieve BABE randomness
|
||||
let random_seed = {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
ctx.send_message(RuntimeApiMessage::Request(
|
||||
relay_parent,
|
||||
RuntimeApiRequest::CurrentBabeEpoch(tx),
|
||||
).into()).await;
|
||||
|
||||
let randomness = rx.await??.randomness;
|
||||
let mut subject = [0u8; 40];
|
||||
subject[..8].copy_from_slice(b"gossipsu");
|
||||
subject[8..].copy_from_slice(&randomness);
|
||||
sp_core::blake2_256(&subject)
|
||||
};
|
||||
|
||||
// shuffle the indices
|
||||
let mut rng: ChaCha20Rng = SeedableRng::from_seed(random_seed);
|
||||
let len = authorities.len();
|
||||
let mut indices: Vec<usize> = (0..len).collect();
|
||||
indices.shuffle(&mut rng);
|
||||
let our_shuffled_position = indices.iter()
|
||||
.position(|i| *i == our_index)
|
||||
.expect("our_index < len; indices contains it; qed");
|
||||
|
||||
let neighbors = matrix_neighbors(our_shuffled_position, len);
|
||||
let our_neighbors = neighbors.map(|i| authorities[indices[i]].clone()).collect();
|
||||
|
||||
ctx.send_message(AllMessages::NetworkBridge(
|
||||
NetworkBridgeMessage::NewGossipTopology {
|
||||
our_neighbors,
|
||||
}
|
||||
)).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute our row and column neighbors in a matrix.
///
/// The indices `0..len` are laid out row by row in a matrix that is `floor(sqrt(len))`
/// columns wide; the last row may be only partially filled. The returned iterator yields
/// every index that shares a row or a column with `our_index`, excluding `our_index` itself
/// (row neighbors first, then column neighbors, each in ascending order).
///
/// # Panics
///
/// Panics if `our_index >= len`, which also covers `len == 0`.
fn matrix_neighbors(our_index: usize, len: usize) -> impl Iterator<Item=usize> {
	assert!(our_index < len, "our_index is computed using `enumerate`; qed");

	// e.g. for size 11 the matrix would be
	//
	// 0  1  2
	// 3  4  5
	// 6  7  8
	// 9 10
	//
	// and for index 10, the neighbors would be 1, 4, 7, 9

	// The assertion above guarantees `len >= 1`, hence `width >= 1`: neither the
	// remainder below nor `step_by` is ever handed a zero.
	let width = floor_sqrt(len);
	let our_column = our_index % width;
	let row_start = our_index - our_column;
	// The last row may be shorter than `width`, so clamp its end to `len`.
	let row_end = row_start.saturating_add(width).min(len);

	let row_neighbors = row_start..row_end;
	let column_neighbors = (our_column..len).step_by(width);

	// Both ranges contain `our_index` itself; drop it.
	row_neighbors.chain(column_neighbors).filter(move |i| *i != our_index)
}

/// Largest `r` such that `r * r <= n`, computed with integer arithmetic only
/// (no lossy round trip through `f64`).
fn floor_sqrt(n: usize) -> usize {
	if n < 2 {
		return n;
	}

	// Newton's iteration started from an over-estimate decreases monotonically
	// until it reaches the floor of the square root.
	let mut root = n / 2 + 1;
	loop {
		let next = (root + n / root) / 2;
		if next >= root {
			return root;
		}
		root = next;
	}
}
|
||||
|
||||
impl State {
|
||||
/// 1. Determine if the current session index has changed.
|
||||
/// 2. If it has, determine relevant validators
|
||||
@@ -171,46 +255,72 @@ impl State {
|
||||
let current_index = util::request_session_index_for_child(leaf, ctx.sender()).await.await??;
|
||||
let since_failure = self.last_failure.map(|i| i.elapsed()).unwrap_or_default();
|
||||
let force_request = since_failure >= BACKOFF_DURATION;
|
||||
let leaf_session = Some((current_index, leaf));
|
||||
let maybe_new_session = match self.last_session_index {
|
||||
Some(i) if current_index <= i && !force_request => None,
|
||||
_ => Some((current_index, leaf)),
|
||||
Some(i) if current_index <= i => None,
|
||||
_ => leaf_session,
|
||||
};
|
||||
|
||||
if let Some((new_session, relay_parent)) = maybe_new_session {
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
%new_session,
|
||||
%force_request,
|
||||
"New session detected",
|
||||
);
|
||||
let maybe_issue_connection = if force_request {
|
||||
leaf_session
|
||||
} else {
|
||||
maybe_new_session
|
||||
};
|
||||
|
||||
if let Some((session_index, relay_parent)) = maybe_issue_connection {
|
||||
let is_new_session = maybe_new_session.is_some();
|
||||
if is_new_session {
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
%session_index,
|
||||
"New session detected",
|
||||
);
|
||||
}
|
||||
|
||||
let authorities = determine_relevant_authorities(ctx, relay_parent).await?;
|
||||
ensure_i_am_an_authority(keystore, &authorities).await?;
|
||||
let num = authorities.len();
|
||||
tracing::debug!(target: LOG_TARGET, %num, "Issuing a connection request");
|
||||
let our_index = ensure_i_am_an_authority(keystore, &authorities).await?;
|
||||
|
||||
let failures = connect_to_authorities(
|
||||
ctx,
|
||||
authorities,
|
||||
PeerSet::Validation,
|
||||
).await;
|
||||
self.issue_connection_request(ctx, authorities.clone()).await?;
|
||||
|
||||
// we await for the request to be processed
|
||||
// this is fine, it should take much less time than one session
|
||||
let failures = failures.await.unwrap_or(num);
|
||||
|
||||
self.last_session_index = Some(new_session);
|
||||
// issue another request for the same session
|
||||
// if at least a third of the authorities were not resolved
|
||||
self.last_failure = if failures >= num / 3 {
|
||||
Some(Instant::now())
|
||||
} else {
|
||||
None
|
||||
if is_new_session {
|
||||
self.last_session_index = Some(session_index);
|
||||
update_gossip_topology(ctx, our_index, authorities, relay_parent).await?;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn issue_connection_request(
|
||||
&mut self,
|
||||
ctx: &mut impl SubsystemContext,
|
||||
authorities: Vec<AuthorityDiscoveryId>,
|
||||
) -> Result<(), util::Error> {
|
||||
let num = authorities.len();
|
||||
tracing::debug!(target: LOG_TARGET, %num, "Issuing a connection request");
|
||||
|
||||
let failures = connect_to_authorities(
|
||||
ctx,
|
||||
authorities,
|
||||
PeerSet::Validation,
|
||||
).await;
|
||||
|
||||
// we await for the request to be processed
|
||||
// this is fine, it should take much less time than one session
|
||||
let failures = failures.await.unwrap_or(num);
|
||||
|
||||
// issue another request for the same session
|
||||
// if at least a third of the authorities were not resolved
|
||||
self.last_failure = if failures >= num / 3 {
|
||||
Some(Instant::now())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Context> Subsystem<Context> for GossipSupport
|
||||
|
||||
@@ -26,6 +26,9 @@ use polkadot_node_subsystem_util::TimeoutExt as _;
|
||||
use sc_keystore::LocalKeystore;
|
||||
use sp_keyring::Sr25519Keyring;
|
||||
use sp_keystore::SyncCryptoStore;
|
||||
use sp_consensus_babe::{
|
||||
Epoch as BabeEpoch, BabeEpochConfiguration, AllowedSlots,
|
||||
};
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
@@ -117,6 +120,47 @@ fn authorities() -> Vec<AuthorityDiscoveryId> {
|
||||
]
|
||||
}
|
||||
|
||||
fn neighbors() -> Vec<AuthorityDiscoveryId> {
|
||||
vec![
|
||||
Sr25519Keyring::One.public().into(),
|
||||
Sr25519Keyring::Alice.public().into(),
|
||||
Sr25519Keyring::Eve.public().into(),
|
||||
]
|
||||
}
|
||||
|
||||
async fn test_neighbors(overseer: &mut VirtualOverseer) {
|
||||
assert_matches!(
|
||||
overseer_recv(overseer).await,
|
||||
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
|
||||
_,
|
||||
RuntimeApiRequest::CurrentBabeEpoch(tx),
|
||||
)) => {
|
||||
let _ = tx.send(Ok(BabeEpoch {
|
||||
epoch_index: 2 as _,
|
||||
start_slot: 0.into(),
|
||||
duration: 200,
|
||||
authorities: vec![(Sr25519Keyring::Alice.public().into(), 1)],
|
||||
randomness: [0u8; 32],
|
||||
config: BabeEpochConfiguration {
|
||||
c: (1, 4),
|
||||
allowed_slots: AllowedSlots::PrimarySlots,
|
||||
},
|
||||
})).unwrap();
|
||||
}
|
||||
);
|
||||
|
||||
assert_matches!(
|
||||
overseer_recv(overseer).await,
|
||||
AllMessages::NetworkBridge(NetworkBridgeMessage::NewGossipTopology {
|
||||
our_neighbors,
|
||||
}) => {
|
||||
let mut got: Vec<_> = our_neighbors.into_iter().collect();
|
||||
got.sort();
|
||||
assert_eq!(got, neighbors());
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn issues_a_connection_request_on_new_session() {
|
||||
let hash = Hash::repeat_byte(0xAA);
|
||||
@@ -157,6 +201,8 @@ fn issues_a_connection_request_on_new_session() {
|
||||
}
|
||||
);
|
||||
|
||||
test_neighbors(overseer).await;
|
||||
|
||||
virtual_overseer
|
||||
});
|
||||
|
||||
@@ -223,6 +269,8 @@ fn issues_a_connection_request_on_new_session() {
|
||||
}
|
||||
);
|
||||
|
||||
test_neighbors(overseer).await;
|
||||
|
||||
virtual_overseer
|
||||
});
|
||||
assert_eq!(state.last_session_index, Some(2));
|
||||
@@ -268,6 +316,9 @@ fn issues_a_connection_request_when_last_request_was_mostly_unresolved() {
|
||||
failed.send(2).unwrap();
|
||||
}
|
||||
);
|
||||
|
||||
test_neighbors(overseer).await;
|
||||
|
||||
virtual_overseer
|
||||
});
|
||||
|
||||
@@ -312,6 +363,7 @@ fn issues_a_connection_request_when_last_request_was_mostly_unresolved() {
|
||||
failed.send(1).unwrap();
|
||||
}
|
||||
);
|
||||
|
||||
virtual_overseer
|
||||
});
|
||||
|
||||
@@ -319,3 +371,18 @@ fn issues_a_connection_request_when_last_request_was_mostly_unresolved() {
|
||||
assert!(state.last_failure.is_none());
|
||||
}
|
||||
|
||||
/// Checks `matrix_neighbors` against hand-computed neighbor sets, including
/// matrices whose last row is only partially filled.
#[test]
fn test_matrix_neighbors() {
	// (our_index, len, expected neighbors in ascending order)
	let cases: Vec<(usize, usize, Vec<usize>)> = vec![
		(0, 1, vec![]),
		(1, 2, vec![0]),
		(0, 9, vec![1, 2, 3, 6]),
		(9, 10, vec![0, 3, 6]),
		(10, 11, vec![1, 4, 7, 9]),
		(7, 11, vec![1, 4, 6, 8, 10]),
	];

	for (our_index, len, expected) in cases {
		let mut actual = matrix_neighbors(our_index, len).collect::<Vec<_>>();
		actual.sort();
		assert_eq!(actual, expected);
	}
}
|
||||
|
||||
Reference in New Issue
Block a user