Vstaging statement distribution omnibus (#1436)

In-progress PR adding new tests and fixing bugs. --------- Co-authored-by: Bradley Olson <34992650+BradleyOlson64@users.noreply.github.com> Co-authored-by: eskimor <eskimor@no-such-url.com> Co-authored-by: eskimor <eskimor@users.noreply.github.com> Co-authored-by: Andrei Sandu <54316454+sandreim@users.noreply.github.com>
2026-04-26 05:17:58 +00:00 · 2023-10-21 04:01:14 -05:00
parent 76994356fc
commit a46183c706
14 changed files with 655 additions and 128 deletions
@@ -93,13 +93,18 @@ const COST_APPARENT_FLOOD: Rep =
 /// For considerations on this value, see: https://github.com/paritytech/polkadot/issues/4386
 const MAX_UNSHARED_UPLOAD_TIME: Duration = Duration::from_millis(150);

-/// Ensure that collator issues a connection request at least once every this many seconds.
-/// Usually it's done when advertising new collation. However, if the core stays occupied or
-/// it's not our turn to produce a candidate, it's important to disconnect from previous
-/// peers.
+/// Ensure that collator updates its connection requests to validators
+/// this long after the most recent leaf.
+///
+/// The timeout is designed for substreams to be properly closed if they need to be
+/// reopened shortly after the next leaf.
+///
+/// Collators also update their connection requests on every new collation.
+/// This timeout is mostly about removing stale connections while avoiding races
+/// with new collations which may want to reactivate them.
 ///
 /// Validators are obtained from [`ValidatorGroupsBuffer::validators_to_connect`].
-const RECONNECT_TIMEOUT: Duration = Duration::from_secs(12);
+const RECONNECT_AFTER_LEAF_TIMEOUT: Duration = Duration::from_secs(4);

 /// Future that when resolved indicates that we should update reserved peer-set
 /// of validators we want to be connected to.
@@ -108,6 +113,13 @@ const RECONNECT_TIMEOUT: Duration = Duration::from_secs(12);
 /// connected.
 type ReconnectTimeout = Fuse<futures_timer::Delay>;

+#[derive(Debug)]
+enum ShouldAdvertiseTo {
+	Yes,
+	NotAuthority,
+	AlreadyAdvertised,
+}
+
 /// Info about validators we are currently connected to.
 ///
 /// It keeps track to which validators we advertised our collation.
@@ -129,10 +141,10 @@ impl ValidatorGroup {
 		candidate_hash: &CandidateHash,
 		peer_ids: &HashMap<PeerId, HashSet<AuthorityDiscoveryId>>,
 		peer: &PeerId,
-	) -> bool {
+	) -> ShouldAdvertiseTo {
 		let authority_ids = match peer_ids.get(peer) {
 			Some(authority_ids) => authority_ids,
-			None => return false,
+			None => return ShouldAdvertiseTo::NotAuthority,
 		};

 		for id in authority_ids {
@@ -151,11 +163,13 @@ impl ValidatorGroup {
 				.get(candidate_hash)
 				.map_or(true, |advertised| !advertised[validator_index])
 			{
-				return true
+				return ShouldAdvertiseTo::Yes
+			} else {
+				return ShouldAdvertiseTo::AlreadyAdvertised
 			}
 		}

-		false
+		ShouldAdvertiseTo::NotAuthority
 	}

 	/// Should be called after we advertised our collation to the given `peer` to keep track of it.
@@ -255,8 +269,8 @@ struct State {
 	/// Tracks which validators we want to stay connected to.
 	validator_groups_buf: ValidatorGroupsBuffer,

-	/// Timeout-future that enforces collator to update the peer-set at least once
-	/// every [`RECONNECT_TIMEOUT`] seconds.
+	/// Timeout-future which is reset to [`RECONNECT_AFTER_LEAF_TIMEOUT`] after every newly activated leaf.
+	/// When it fires, we update our reserved peers.
 	reconnect_timeout: ReconnectTimeout,

 	/// Metrics.
@@ -443,7 +457,7 @@ async fn distribute_collation<Context>(
 	}

 	// Update a set of connected validators if necessary.
-	state.reconnect_timeout = connect_to_validators(ctx, &state.validator_groups_buf).await;
+	connect_to_validators(ctx, &state.validator_groups_buf).await;

 	if let Some(result_sender) = result_sender {
 		state.collation_result_senders.insert(candidate_hash, result_sender);
@@ -619,15 +633,12 @@ async fn declare<Context>(

 /// Updates a set of connected validators based on their advertisement-bits
 /// in a validators buffer.
-///
-/// Should be called again once a returned future resolves.
 #[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
 async fn connect_to_validators<Context>(
 	ctx: &mut Context,
 	validator_groups_buf: &ValidatorGroupsBuffer,
-) -> ReconnectTimeout {
+) {
 	let validator_ids = validator_groups_buf.validators_to_connect();
-	let is_disconnect = validator_ids.is_empty();

 	// ignore address resolution failure
 	// will reissue a new request on new collation
@@ -638,14 +649,6 @@ async fn connect_to_validators<Context>(
 		failed,
 	})
 	.await;
-
-	if is_disconnect {
-		gum::trace!(target: LOG_TARGET, "Disconnecting from all peers");
-		// Never resolves.
-		Fuse::terminated()
-	} else {
-		futures_timer::Delay::new(RECONNECT_TIMEOUT).fuse()
-	}
 }

 /// Advertise collation to the given `peer`.
@@ -685,22 +688,29 @@ async fn advertise_collation<Context>(
 				.validator_group
 				.should_advertise_to(candidate_hash, peer_ids, &peer);

-		if !should_advertise {
-			gum::debug!(
-				target: LOG_TARGET,
-				?relay_parent,
-				peer_id = %peer,
-				"Not advertising collation since validator is not interested",
-			);
-			continue
+		match should_advertise {
+			ShouldAdvertiseTo::Yes => {},
+			ShouldAdvertiseTo::NotAuthority | ShouldAdvertiseTo::AlreadyAdvertised => {
+				gum::trace!(
+					target: LOG_TARGET,
+					?relay_parent,
+					?candidate_hash,
+					peer_id = %peer,
+					reason = ?should_advertise,
+					"Not advertising collation"
+				);
+				continue
+			},
 		}

 		gum::debug!(
 			target: LOG_TARGET,
 			?relay_parent,
+			?candidate_hash,
 			peer_id = %peer,
 			"Advertising collation.",
 		);
+
 		collation.status.advance_to_advertised();

 		let collation_message = match protocol_version {
@@ -1149,7 +1159,7 @@ async fn handle_network_msg<Context>(
 		PeerConnected(peer_id, observed_role, protocol_version, maybe_authority) => {
 			// If it is possible that a disconnected validator would attempt a reconnect
 			// it should be handled here.
-			gum::trace!(target: LOG_TARGET, ?peer_id, ?observed_role, "Peer connected");
+			gum::trace!(target: LOG_TARGET, ?peer_id, ?observed_role, ?maybe_authority, "Peer connected");

 			let version = match protocol_version.try_into() {
 				Ok(version) => version,
@@ -1200,7 +1210,11 @@ async fn handle_network_msg<Context>(
 		},
 		UpdatedAuthorityIds(peer_id, authority_ids) => {
 			gum::trace!(target: LOG_TARGET, ?peer_id, ?authority_ids, "Updated authority ids");
-			state.peer_ids.insert(peer_id, authority_ids);
+			if let Some(version) = state.peer_data.get(&peer_id).map(|d| d.version) {
+				if state.peer_ids.insert(peer_id, authority_ids).is_none() {
+					declare(ctx, state, &peer_id, version).await;
+				}
+			}
 		},
 		NewGossipTopology { .. } => {
 			// impossible!
@@ -1369,7 +1383,11 @@ async fn run_inner<Context>(
 						"Failed to process message"
 					)?;
 				},
-				FromOrchestra::Signal(ActiveLeaves(_update)) => {}
+				FromOrchestra::Signal(ActiveLeaves(update)) => {
+					if update.activated.is_some() {
+						*reconnect_timeout = futures_timer::Delay::new(RECONNECT_AFTER_LEAF_TIMEOUT).fuse();
+					}
+				}
 				FromOrchestra::Signal(BlockFinalized(..)) => {}
 				FromOrchestra::Signal(Conclude) => return Ok(()),
 			},
@@ -1390,7 +1408,7 @@ async fn run_inner<Context>(
 						// The request is still alive; it should be kept in the waiting queue.
 					} else {
 						for authority_id in state.peer_ids.get(&peer_id).into_iter().flatten() {
-							// Timeout not hit, this peer is no longer interested in this relay parent.
+							// This peer has received the candidate. Not interested anymore.
 							state.validator_groups_buf.reset_validator_interest(candidate_hash, authority_id);
 						}
 						waiting.waiting_peers.remove(&(peer_id, candidate_hash));
@@ -1446,12 +1464,11 @@ async fn run_inner<Context>(
 				}
 			}
 			_ = reconnect_timeout => {
-				state.reconnect_timeout =
-					connect_to_validators(&mut ctx, &state.validator_groups_buf).await;
+				connect_to_validators(&mut ctx, &state.validator_groups_buf).await;

 				gum::trace!(
 					target: LOG_TARGET,
-					timeout = ?RECONNECT_TIMEOUT,
+					timeout = ?RECONNECT_AFTER_LEAF_TIMEOUT,
 					"Peer-set updated due to a timeout"
 				);
 			},
@@ -133,7 +133,7 @@ impl ValidatorGroupsBuffer {
 		}
 	}

-	/// Note that a validator is no longer interested in a given relay parent.
+	/// Note that a validator is no longer interested in a given candidate.
 	pub fn reset_validator_interest(
 		&mut self,
 		candidate_hash: CandidateHash,