Reduce network bandwidth, improve parablock times: optimize approval-distribution (#5164)

* gossip-support: be explicit about dimensions * some guide updates * update network-bridge to distinguish x and y dimensions * get everything to compile * beginnings * some TODOs * polkadot runtime: use relevant_authorities * make gossip topologies per-session * better formatting * gossip support: use current session validators * expand in comment * adjust tests and fix index bug * add past/present/future connection test and clean up code * fmt * network bridge: updated types * update protocols to new gossip topology message * guide updates * add session to BlockApprovalMeta * add session to block info * refactor knowledge and remove most unify logic * start replacing gossip_peers with new SessionTopologies * add routing information to message state * add some utilities to SessionTopology * implement new gossip topology logic * re-implement unify_with_peer * distribute assignments according to topology * finish grid topology implementation * refactor network bridge slightly * issue connection requests on all past/present/future * fmt * address grumbles * tighten invariants in unify_with_peer * implement random propagation * refactor: extract required routing adjustment logic * some block-age logic * aggressively propagate messages when finality is slow * overhaul aggression system to have 3 levels * add aggression metrics * remove aggression L3 * reduce random circulation * remove PeerData * get approval tests compiling * use btree_map in known_by to make deterministic * Revert "use btree_map in known_by to make deterministic" This reverts commit 330d65343a7bb6fe4dd0f24bd8dbc15c0cbdbd9d. 
* test XY grid propagation * remove stray println * test unshared dimension propagation * add random gossip check * test unify_with_peer better * test sending after getting gossip topology * test L1 aggression on originator * test L1 aggression for non-originators * test non-originator aggression L2 * fmt * ~spellcheck * fix statement-distribution tests * fix flaky test * fix metrics typo * re-send periodically * test resending * typo Co-authored-by: Bernhard Schuster <bernhard@ahoi.io> * add more metrics about apd messages * add back unify_with_peer logs * make Resend an enum * be more explicit when resending * fmt * fix error * add a TODO for refactoring * remove debug metrics * add some guide stuff * fmt * update runtime API in test-runtime Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>
2026-04-26 07:37:57 +00:00 · 2022-04-19 13:26:55 -05:00
parent edfa24bbc5
commit 79ecc53801
25 changed files with 2563 additions and 499 deletions
@@ -49,10 +49,12 @@ use polkadot_node_subsystem::{
 		RuntimeApiRequest,
 	},
 	overseer, ActiveLeavesUpdate, FromOverseer, OverseerSignal, SpawnedSubsystem, SubsystemContext,
-	SubsystemError, SubsystemSender,
+	SubsystemError,
 };
 use polkadot_node_subsystem_util as util;
-use polkadot_primitives::v2::{AuthorityDiscoveryId, Hash, SessionIndex};
+use polkadot_primitives::v2::{
+	AuthorityDiscoveryId, Hash, SessionIndex, SessionInfo, ValidatorIndex,
+};

 #[cfg(test)]
 mod tests;
@@ -213,6 +215,24 @@ where
 				if force_request { leaf_session } else { maybe_new_session };

 			if let Some((session_index, relay_parent)) = maybe_issue_connection {
+				let session_info =
+					util::request_session_info(leaf, session_index, ctx.sender()).await.await??;
+
+				let session_info = match session_info {
+					Some(s) => s,
+					None => {
+						gum::warn!(
+							relay_parent = ?leaf,
+							session_index = self.last_session_index,
+							"Failed to get session info.",
+						);
+
+						continue
+					},
+				};
+
+				// Note: we only update `last_session_index` once we've
+				// successfully gotten the `SessionInfo`.
 				let is_new_session = maybe_new_session.is_some();
 				if is_new_session {
 					gum::debug!(
@@ -223,45 +243,52 @@ where
 					self.last_session_index = Some(session_index);
 				}

-				let all_authorities = determine_relevant_authorities(ctx, relay_parent).await?;
-				let our_index = ensure_i_am_an_authority(&self.keystore, &all_authorities).await?;
-				let other_authorities = {
-					let mut authorities = all_authorities.clone();
-					authorities.swap_remove(our_index);
-					authorities
-				};
+				// Connect to authorities from the past/present/future.
+				//
+				// This is maybe not the right place for this logic to live,
+				// but at the moment we're limited by the network bridge's ability
+				// to handle connection requests (it only allows one, globally).
+				//
+				// Certain network protocols - mostly req/res, but some gossip,
+				// will require being connected to past/future validators as well
+				// as current. That is, the old authority sets are not made obsolete
+				// by virtue of a new session being entered. Therefore we maintain
+				// connections to a much broader set of validators.
+				{
+					let mut connections = authorities_past_present_future(ctx, leaf).await?;

-				self.issue_connection_request(ctx, other_authorities).await;
+					// Remove all of our locally controlled authority keys so we don't connect to ourselves.
+					// If we control none of them, don't issue connection requests - we're outside
+					// of the 'clique' of recent validators.
+					if remove_all_controlled(&self.keystore, &mut connections).await != 0 {
+						self.issue_connection_request(ctx, connections).await;
+					}
+				}
+
+				// Gossip topology is only relevant for authorities in the current session.
+				let our_index =
+					ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys).await?;

 				if is_new_session {
-					update_gossip_topology(ctx, our_index, all_authorities, relay_parent).await?;
-					self.update_authority_status_metrics(leaf, ctx.sender()).await?;
+					self.update_authority_status_metrics(&session_info).await;
+
+					update_gossip_topology(
+						ctx,
+						our_index,
+						session_info.discovery_keys,
+						relay_parent,
+						session_index,
+					)
+					.await?;
 				}
 			}
 		}
 		Ok(())
 	}

-	async fn update_authority_status_metrics(
-		&mut self,
-		leaf: Hash,
-		sender: &mut impl SubsystemSender,
-	) -> Result<(), util::Error> {
-		if let Some(session_info) = util::request_session_info(
-			leaf,
-			self.last_session_index
-				.expect("Last session index is always set on every session index change"),
-			sender,
-		)
-		.await
-		.await??
-		{
-			let maybe_index = match ensure_i_am_an_authority(
-				&self.keystore,
-				&session_info.discovery_keys,
-			)
-			.await
-			{
+	async fn update_authority_status_metrics(&mut self, session_info: &SessionInfo) {
+		let maybe_index =
+			match ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys).await {
 				Ok(index) => {
 					self.metrics.on_is_authority();
 					Some(index)
@@ -275,21 +302,19 @@ where
 				Err(_) => None,
 			};

-			if let Some(validator_index) = maybe_index {
-				// The subset of authorities participating in parachain consensus.
-				let parachain_validators_this_session = session_info.validators;
+		if let Some(validator_index) = maybe_index {
+			// The subset of authorities participating in parachain consensus.
+			let parachain_validators_this_session = session_info.validators.len();

-				// First `maxValidators` entries are the parachain validators. We'll check
-				// if our index is in this set to avoid searching for the keys.
-				// https://github.com/paritytech/polkadot/blob/a52dca2be7840b23c19c153cf7e110b1e3e475f8/runtime/parachains/src/configuration.rs#L148
-				if validator_index < parachain_validators_this_session.len() {
-					self.metrics.on_is_parachain_validator();
-				} else {
-					self.metrics.on_is_not_parachain_validator();
-				}
+			// First `maxValidators` entries are the parachain validators. We'll check
+			// if our index is in this set to avoid searching for the keys.
+			// https://github.com/paritytech/polkadot/blob/a52dca2be7840b23c19c153cf7e110b1e3e475f8/runtime/parachains/src/configuration.rs#L148
+			if validator_index < parachain_validators_this_session {
+				self.metrics.on_is_parachain_validator();
+			} else {
+				self.metrics.on_is_not_parachain_validator();
 			}
 		}
-		Ok(())
 	}

 	async fn issue_connection_request<Context>(
@@ -378,7 +403,7 @@ where
 			},
 			NetworkBridgeEvent::OurViewChange(_) => {},
 			NetworkBridgeEvent::PeerViewChange(_, _) => {},
-			NetworkBridgeEvent::NewGossipTopology(_) => {},
+			NetworkBridgeEvent::NewGossipTopology { .. } => {},
 			NetworkBridgeEvent::PeerMessage(_, v) => {
 				match v {};
 			},
@@ -416,7 +441,8 @@ where
 	}
 }

-async fn determine_relevant_authorities<Context>(
+// Get the authorities of the past, present, and future.
+async fn authorities_past_present_future<Context>(
 	ctx: &mut Context,
 	relay_parent: Hash,
 ) -> Result<Vec<AuthorityDiscoveryId>, util::Error>
@@ -428,7 +454,7 @@ where
 	gum::debug!(
 		target: LOG_TARGET,
 		authority_count = ?authorities.len(),
-		"Determined relevant authorities",
+		"Determined past/present/future authorities",
 	);
 	Ok(authorities)
 }
@@ -447,6 +473,25 @@ async fn ensure_i_am_an_authority(
 	Err(util::Error::NotAValidator)
 }

+/// Remove from `authorities` every key that `keystore` reports having (i.e. the controlled keys). Returns the number of keys removed.
+async fn remove_all_controlled(
+	keystore: &SyncCryptoStorePtr,
+	authorities: &mut Vec<AuthorityDiscoveryId>,
+) -> usize {
+	let mut to_remove = Vec::new(); // positions in `authorities` to drop, in ascending order
+	for (i, v) in authorities.iter().enumerate() {
+		if CryptoStore::has_keys(&**keystore, &[(v.to_raw_vec(), AuthorityDiscoveryId::ID)]).await {
+			to_remove.push(i);
+		}
+	}
+
+	for i in to_remove.iter().rev().copied() { // back to front, so the pending indices stay valid
+		authorities.remove(i);
+	}
+
+	to_remove.len()
+}
+
 /// We partition the list of all sorted `authorities` into `sqrt(len)` groups of `sqrt(len)` size
 /// and form a matrix where each validator is connected to all validators in its row and column.
 /// This is similar to `[web3]` research proposed topology, except for the groups are not parachain
@@ -460,6 +505,7 @@ async fn update_gossip_topology<Context>(
 	our_index: usize,
 	authorities: Vec<AuthorityDiscoveryId>,
 	relay_parent: Hash,
+	session_index: SessionIndex,
 ) -> Result<(), util::Error>
 where
 	Context: SubsystemContext<Message = GossipSupportMessage>,
@@ -469,6 +515,8 @@ where
 	let random_seed = {
 		let (tx, rx) = oneshot::channel();

+		// TODO https://github.com/paritytech/polkadot/issues/5316:
+		// get the random seed from the `SessionInfo` instead.
 		ctx.send_message(RuntimeApiMessage::Request(
 			relay_parent,
 			RuntimeApiRequest::CurrentBabeEpoch(tx),
@@ -493,16 +541,38 @@ where
 		.expect("our_index < len; indices contains it; qed");

 	let neighbors = matrix_neighbors(our_shuffled_position, len);
-	let our_neighbors = neighbors.map(|i| authorities[indices[i]].clone()).collect();
+	let row_neighbors = neighbors
+		.row_neighbors
+		.map(|i| indices[i])
+		.map(|i| (authorities[i].clone(), ValidatorIndex::from(i as u32)))
+		.collect();

-	ctx.send_message(NetworkBridgeMessage::NewGossipTopology { our_neighbors })
-		.await;
+	let column_neighbors = neighbors
+		.column_neighbors
+		.map(|i| indices[i])
+		.map(|i| (authorities[i].clone(), ValidatorIndex::from(i as u32)))
+		.collect();
+
+	ctx.send_message(NetworkBridgeMessage::NewGossipTopology {
+		session: session_index,
+		our_neighbors_x: row_neighbors,
+		our_neighbors_y: column_neighbors,
+	})
+	.await;

 	Ok(())
 }

+struct MatrixNeighbors<R, C> { // the two neighbor sets returned by `matrix_neighbors`, kept separate per dimension
+	row_neighbors: R, // positions sharing our row, our own position excluded
+	column_neighbors: C, // positions sharing our column, our own position excluded
+}
+
 /// Compute our row and column neighbors in a matrix
-fn matrix_neighbors(our_index: usize, len: usize) -> impl Iterator<Item = usize> {
+fn matrix_neighbors(
+	our_index: usize,
+	len: usize,
+) -> MatrixNeighbors<impl Iterator<Item = usize>, impl Iterator<Item = usize>> {
 	assert!(our_index < len, "our_index is computed using `enumerate`; qed");

 	// e.g. for size 11 the matrix would be
@@ -520,7 +590,10 @@ fn matrix_neighbors(our_index: usize, len: usize) -> impl Iterator<Item = usize>
 	let row_neighbors = our_row * sqrt..std::cmp::min(our_row * sqrt + sqrt, len);
 	let column_neighbors = (our_column..len).step_by(sqrt);

-	row_neighbors.chain(column_neighbors).filter(move |i| *i != our_index)
+	MatrixNeighbors {
+		row_neighbors: row_neighbors.filter(move |i| *i != our_index),
+		column_neighbors: column_neighbors.filter(move |i| *i != our_index),
+	}
 }

 impl<Context, AD> overseer::Subsystem<Context, SubsystemError> for GossipSupport<AD>