Reduce network bandwidth, improve parablock times: optimize approval-distribution (#5164)

* gossip-support: be explicit about dimensions

* some guide updates

* update network-bridge to distinguish x and y dimensions

* get everything to compile

* beginnings

* some TODOs

* polkadot runtime: use relevant_authorities

* make gossip topologies per-session

* better formatting

* gossip support: use current session validators

* expand in comment

* adjust tests and fix index bug

* add past/present/future connection test and clean up code

* fmt

* network bridge: updated types

* update protocols to new gossip topology message

* guide updates

* add session to BlockApprovalMeta

* add session to block info

* refactor knowledge and remove most unify logic

* start replacing gossip_peers with new SessionTopologies

* add routing information to message state

* add some utilities to SessionTopology

* implement new gossip topology logic

* re-implement unify_with_peer

* distribute assignments according to topology

* finish grid topology implementation

* refactor network bridge slightly

* issue connection requests on all past/present/future

* fmt

* address grumbles

* tighten invariants in unify_with_peer

* implement random propagation

* refactor: extract required routing adjustment logic

* some block-age logic

* aggressively propagate messages when finality is slow

* overhaul aggression system to have 3 levels

* add aggression metrics

* remove aggression L3

* reduce random circulation

* remove PeerData

* get approval tests compiling

* use btree_map in known_by to make deterministic

* Revert "use btree_map in known_by to make deterministic"

This reverts commit 330d65343a7bb6fe4dd0f24bd8dbc15c0cbdbd9d.

* test XY grid propagation

* remove stray println

* test unshared dimension propagation

* add random gossip check

* test unify_with_peer better

* test sending after getting gossip topology

* test L1 aggression on originator

* test L1 aggression for non-originators

* test non-originator aggression L2

* fnt

* ~spellcheck

* fix statement-distribution tests

* fix flaky test

* fix metrics typo

* re-send periodically

* test resending

* typo

Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>

* add more metrics about apd messages

* add back unify_with_peer logs

* make Resend an enum

* be more explicit when resending

* fmt

* fix error

* add a TODO for refactoring

* remove debug metrics

* add some guide stuff

* fmt

* update runtime API in test-runtim

Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>
This commit is contained in:
asynchronous rob
2022-04-19 13:26:55 -05:00
committed by GitHub
parent edfa24bbc5
commit 79ecc53801
25 changed files with 2563 additions and 499 deletions
+125 -52
View File
@@ -49,10 +49,12 @@ use polkadot_node_subsystem::{
RuntimeApiRequest,
},
overseer, ActiveLeavesUpdate, FromOverseer, OverseerSignal, SpawnedSubsystem, SubsystemContext,
SubsystemError, SubsystemSender,
SubsystemError,
};
use polkadot_node_subsystem_util as util;
use polkadot_primitives::v2::{AuthorityDiscoveryId, Hash, SessionIndex};
use polkadot_primitives::v2::{
AuthorityDiscoveryId, Hash, SessionIndex, SessionInfo, ValidatorIndex,
};
#[cfg(test)]
mod tests;
@@ -213,6 +215,24 @@ where
if force_request { leaf_session } else { maybe_new_session };
if let Some((session_index, relay_parent)) = maybe_issue_connection {
let session_info =
util::request_session_info(leaf, session_index, ctx.sender()).await.await??;
let session_info = match session_info {
Some(s) => s,
None => {
gum::warn!(
relay_parent = ?leaf,
session_index = self.last_session_index,
"Failed to get session info.",
);
continue
},
};
// Note: we only update `last_session_index` once we've
// successfully gotten the `SessionInfo`.
let is_new_session = maybe_new_session.is_some();
if is_new_session {
gum::debug!(
@@ -223,45 +243,52 @@ where
self.last_session_index = Some(session_index);
}
let all_authorities = determine_relevant_authorities(ctx, relay_parent).await?;
let our_index = ensure_i_am_an_authority(&self.keystore, &all_authorities).await?;
let other_authorities = {
let mut authorities = all_authorities.clone();
authorities.swap_remove(our_index);
authorities
};
// Connect to authorities from the past/present/future.
//
// This is maybe not the right place for this logic to live,
// but at the moment we're limited by the network bridge's ability
// to handle connection requests (it only allows one, globally).
//
// Certain network protocols - mostly req/res, but some gossip,
// will require being connected to past/future validators as well
// as current. That is, the old authority sets are not made obsolete
// by virtue of a new session being entered. Therefore we maintain
// connections to a much broader set of validators.
{
let mut connections = authorities_past_present_future(ctx, leaf).await?;
self.issue_connection_request(ctx, other_authorities).await;
// Remove all of our locally controlled validator indices so we don't connect to ourself.
// If we control none of them, don't issue connection requests - we're outside
// of the 'clique' of recent validators.
if remove_all_controlled(&self.keystore, &mut connections).await != 0 {
self.issue_connection_request(ctx, connections).await;
}
}
// Gossip topology is only relevant for authorities in the current session.
let our_index =
ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys).await?;
if is_new_session {
update_gossip_topology(ctx, our_index, all_authorities, relay_parent).await?;
self.update_authority_status_metrics(leaf, ctx.sender()).await?;
self.update_authority_status_metrics(&session_info).await;
update_gossip_topology(
ctx,
our_index,
session_info.discovery_keys,
relay_parent,
session_index,
)
.await?;
}
}
}
Ok(())
}
async fn update_authority_status_metrics(
&mut self,
leaf: Hash,
sender: &mut impl SubsystemSender,
) -> Result<(), util::Error> {
if let Some(session_info) = util::request_session_info(
leaf,
self.last_session_index
.expect("Last session index is always set on every session index change"),
sender,
)
.await
.await??
{
let maybe_index = match ensure_i_am_an_authority(
&self.keystore,
&session_info.discovery_keys,
)
.await
{
async fn update_authority_status_metrics(&mut self, session_info: &SessionInfo) {
let maybe_index =
match ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys).await {
Ok(index) => {
self.metrics.on_is_authority();
Some(index)
@@ -275,21 +302,19 @@ where
Err(_) => None,
};
if let Some(validator_index) = maybe_index {
// The subset of authorities participating in parachain consensus.
let parachain_validators_this_session = session_info.validators;
if let Some(validator_index) = maybe_index {
// The subset of authorities participating in parachain consensus.
let parachain_validators_this_session = session_info.validators.len();
// First `maxValidators` entries are the parachain validators. We'll check
// if our index is in this set to avoid searching for the keys.
// https://github.com/paritytech/polkadot/blob/a52dca2be7840b23c19c153cf7e110b1e3e475f8/runtime/parachains/src/configuration.rs#L148
if validator_index < parachain_validators_this_session.len() {
self.metrics.on_is_parachain_validator();
} else {
self.metrics.on_is_not_parachain_validator();
}
// First `maxValidators` entries are the parachain validators. We'll check
// if our index is in this set to avoid searching for the keys.
// https://github.com/paritytech/polkadot/blob/a52dca2be7840b23c19c153cf7e110b1e3e475f8/runtime/parachains/src/configuration.rs#L148
if validator_index < parachain_validators_this_session {
self.metrics.on_is_parachain_validator();
} else {
self.metrics.on_is_not_parachain_validator();
}
}
Ok(())
}
async fn issue_connection_request<Context>(
@@ -378,7 +403,7 @@ where
},
NetworkBridgeEvent::OurViewChange(_) => {},
NetworkBridgeEvent::PeerViewChange(_, _) => {},
NetworkBridgeEvent::NewGossipTopology(_) => {},
NetworkBridgeEvent::NewGossipTopology { .. } => {},
NetworkBridgeEvent::PeerMessage(_, v) => {
match v {};
},
@@ -416,7 +441,8 @@ where
}
}
async fn determine_relevant_authorities<Context>(
// Get the authorities of the past, present, and future.
async fn authorities_past_present_future<Context>(
ctx: &mut Context,
relay_parent: Hash,
) -> Result<Vec<AuthorityDiscoveryId>, util::Error>
@@ -428,7 +454,7 @@ where
gum::debug!(
target: LOG_TARGET,
authority_count = ?authorities.len(),
"Determined relevant authorities",
"Determined past/present/future authorities",
);
Ok(authorities)
}
@@ -447,6 +473,25 @@ async fn ensure_i_am_an_authority(
Err(util::Error::NotAValidator)
}
/// Filter out all controlled keys in the given set. Returns the number of keys removed.
async fn remove_all_controlled(
keystore: &SyncCryptoStorePtr,
authorities: &mut Vec<AuthorityDiscoveryId>,
) -> usize {
let mut to_remove = Vec::new();
for (i, v) in authorities.iter().enumerate() {
if CryptoStore::has_keys(&**keystore, &[(v.to_raw_vec(), AuthorityDiscoveryId::ID)]).await {
to_remove.push(i);
}
}
for i in to_remove.iter().rev().copied() {
authorities.remove(i);
}
to_remove.len()
}
/// We partition the list of all sorted `authorities` into `sqrt(len)` groups of `sqrt(len)` size
/// and form a matrix where each validator is connected to all validators in its row and column.
/// This is similar to `[web3]` research proposed topology, except for the groups are not parachain
@@ -460,6 +505,7 @@ async fn update_gossip_topology<Context>(
our_index: usize,
authorities: Vec<AuthorityDiscoveryId>,
relay_parent: Hash,
session_index: SessionIndex,
) -> Result<(), util::Error>
where
Context: SubsystemContext<Message = GossipSupportMessage>,
@@ -469,6 +515,8 @@ where
let random_seed = {
let (tx, rx) = oneshot::channel();
// TODO https://github.com/paritytech/polkadot/issues/5316:
// get the random seed from the `SessionInfo` instead.
ctx.send_message(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::CurrentBabeEpoch(tx),
@@ -493,16 +541,38 @@ where
.expect("our_index < len; indices contains it; qed");
let neighbors = matrix_neighbors(our_shuffled_position, len);
let our_neighbors = neighbors.map(|i| authorities[indices[i]].clone()).collect();
let row_neighbors = neighbors
.row_neighbors
.map(|i| indices[i])
.map(|i| (authorities[i].clone(), ValidatorIndex::from(i as u32)))
.collect();
ctx.send_message(NetworkBridgeMessage::NewGossipTopology { our_neighbors })
.await;
let column_neighbors = neighbors
.column_neighbors
.map(|i| indices[i])
.map(|i| (authorities[i].clone(), ValidatorIndex::from(i as u32)))
.collect();
ctx.send_message(NetworkBridgeMessage::NewGossipTopology {
session: session_index,
our_neighbors_x: row_neighbors,
our_neighbors_y: column_neighbors,
})
.await;
Ok(())
}
struct MatrixNeighbors<R, C> {
row_neighbors: R,
column_neighbors: C,
}
/// Compute our row and column neighbors in a matrix
fn matrix_neighbors(our_index: usize, len: usize) -> impl Iterator<Item = usize> {
fn matrix_neighbors(
our_index: usize,
len: usize,
) -> MatrixNeighbors<impl Iterator<Item = usize>, impl Iterator<Item = usize>> {
assert!(our_index < len, "our_index is computed using `enumerate`; qed");
// e.g. for size 11 the matrix would be
@@ -520,7 +590,10 @@ fn matrix_neighbors(our_index: usize, len: usize) -> impl Iterator<Item = usize>
let row_neighbors = our_row * sqrt..std::cmp::min(our_row * sqrt + sqrt, len);
let column_neighbors = (our_column..len).step_by(sqrt);
row_neighbors.chain(column_neighbors).filter(move |i| *i != our_index)
MatrixNeighbors {
row_neighbors: row_neighbors.filter(move |i| *i != our_index),
column_neighbors: column_neighbors.filter(move |i| *i != our_index),
}
}
impl<Context, AD> overseer::Subsystem<Context, SubsystemError> for GossipSupport<AD>