Vstaging statement distribution omnibus (#1436)

in-progress PR adding new tests and solving bugs

---------

Co-authored-by: Bradley Olson <34992650+BradleyOlson64@users.noreply.github.com>
Co-authored-by: eskimor <eskimor@no-such-url.com>
Co-authored-by: eskimor <eskimor@users.noreply.github.com>
Co-authored-by: Andrei Sandu <54316454+sandreim@users.noreply.github.com>
This commit is contained in:
asynchronous rob
2023-10-21 04:01:14 -05:00
committed by GitHub
parent 76994356fc
commit a46183c706
14 changed files with 655 additions and 128 deletions
@@ -93,13 +93,18 @@ const COST_APPARENT_FLOOD: Rep =
/// For considerations on this value, see: https://github.com/paritytech/polkadot/issues/4386
const MAX_UNSHARED_UPLOAD_TIME: Duration = Duration::from_millis(150);
/// Ensure that collator issues a connection request at least once every this many seconds.
/// Usually it's done when advertising new collation. However, if the core stays occupied or
/// it's not our turn to produce a candidate, it's important to disconnect from previous
/// peers.
/// Ensure that collator updates its connection requests to validators
/// this long after the most recent leaf.
///
/// The timeout is designed for substreams to be properly closed if they need to be
/// reopened shortly after the next leaf.
///
/// Collators also update their connection requests on every new collation.
/// This timeout is mostly about removing stale connections while avoiding races
/// with new collations which may want to reactivate them.
///
/// Validators are obtained from [`ValidatorGroupsBuffer::validators_to_connect`].
const RECONNECT_TIMEOUT: Duration = Duration::from_secs(12);
const RECONNECT_AFTER_LEAF_TIMEOUT: Duration = Duration::from_secs(4);
/// Future that when resolved indicates that we should update reserved peer-set
/// of validators we want to be connected to.
@@ -108,6 +113,13 @@ const RECONNECT_TIMEOUT: Duration = Duration::from_secs(12);
/// connected.
type ReconnectTimeout = Fuse<futures_timer::Delay>;
/// Outcome of checking whether a collation should be advertised to a given peer.
///
/// Returned by `ValidatorGroup::should_advertise_to`. Only [`ShouldAdvertiseTo::Yes`]
/// leads to an advertisement; for the other variants the caller logs the variant
/// as the reason and skips the peer.
#[derive(Debug)]
enum ShouldAdvertiseTo {
/// The candidate has no advertisement record yet, or the record's bit for this
/// validator is unset, so the collation should be advertised.
Yes,
/// The peer has no entry in the peer-id to authority-id map. Also the fall-through
/// result when iterating the peer's authority ids produces no other decision.
NotAuthority,
/// The advertisement bit for this validator is already set for the candidate.
AlreadyAdvertised,
}
/// Info about validators we are currently connected to.
///
/// It keeps track of which validators we have advertised our collation to.
@@ -129,10 +141,10 @@ impl ValidatorGroup {
candidate_hash: &CandidateHash,
peer_ids: &HashMap<PeerId, HashSet<AuthorityDiscoveryId>>,
peer: &PeerId,
) -> bool {
) -> ShouldAdvertiseTo {
let authority_ids = match peer_ids.get(peer) {
Some(authority_ids) => authority_ids,
None => return false,
None => return ShouldAdvertiseTo::NotAuthority,
};
for id in authority_ids {
@@ -151,11 +163,13 @@ impl ValidatorGroup {
.get(candidate_hash)
.map_or(true, |advertised| !advertised[validator_index])
{
return true
return ShouldAdvertiseTo::Yes
} else {
return ShouldAdvertiseTo::AlreadyAdvertised
}
}
false
ShouldAdvertiseTo::NotAuthority
}
/// Should be called after we advertised our collation to the given `peer` to keep track of it.
@@ -255,8 +269,8 @@ struct State {
/// Tracks which validators we want to stay connected to.
validator_groups_buf: ValidatorGroupsBuffer,
/// Timeout-future that enforces collator to update the peer-set at least once
/// every [`RECONNECT_TIMEOUT`] seconds.
/// Timeout-future which is reset after every leaf to [`RECONNECT_AFTER_LEAF_TIMEOUT`] seconds.
/// When it fires, we update our reserved peers.
reconnect_timeout: ReconnectTimeout,
/// Metrics.
@@ -443,7 +457,7 @@ async fn distribute_collation<Context>(
}
// Update a set of connected validators if necessary.
state.reconnect_timeout = connect_to_validators(ctx, &state.validator_groups_buf).await;
connect_to_validators(ctx, &state.validator_groups_buf).await;
if let Some(result_sender) = result_sender {
state.collation_result_senders.insert(candidate_hash, result_sender);
@@ -619,15 +633,12 @@ async fn declare<Context>(
/// Updates a set of connected validators based on their advertisement-bits
/// in a validators buffer.
///
/// Should be called again once a returned future resolves.
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
async fn connect_to_validators<Context>(
ctx: &mut Context,
validator_groups_buf: &ValidatorGroupsBuffer,
) -> ReconnectTimeout {
) {
let validator_ids = validator_groups_buf.validators_to_connect();
let is_disconnect = validator_ids.is_empty();
// ignore address resolution failure
// will reissue a new request on new collation
@@ -638,14 +649,6 @@ async fn connect_to_validators<Context>(
failed,
})
.await;
if is_disconnect {
gum::trace!(target: LOG_TARGET, "Disconnecting from all peers");
// Never resolves.
Fuse::terminated()
} else {
futures_timer::Delay::new(RECONNECT_TIMEOUT).fuse()
}
}
/// Advertise collation to the given `peer`.
@@ -685,22 +688,29 @@ async fn advertise_collation<Context>(
.validator_group
.should_advertise_to(candidate_hash, peer_ids, &peer);
if !should_advertise {
gum::debug!(
target: LOG_TARGET,
?relay_parent,
peer_id = %peer,
"Not advertising collation since validator is not interested",
);
continue
match should_advertise {
ShouldAdvertiseTo::Yes => {},
ShouldAdvertiseTo::NotAuthority | ShouldAdvertiseTo::AlreadyAdvertised => {
gum::trace!(
target: LOG_TARGET,
?relay_parent,
?candidate_hash,
peer_id = %peer,
reason = ?should_advertise,
"Not advertising collation"
);
continue
},
}
gum::debug!(
target: LOG_TARGET,
?relay_parent,
?candidate_hash,
peer_id = %peer,
"Advertising collation.",
);
collation.status.advance_to_advertised();
let collation_message = match protocol_version {
@@ -1149,7 +1159,7 @@ async fn handle_network_msg<Context>(
PeerConnected(peer_id, observed_role, protocol_version, maybe_authority) => {
// If it is possible that a disconnected validator would attempt a reconnect
// it should be handled here.
gum::trace!(target: LOG_TARGET, ?peer_id, ?observed_role, "Peer connected");
gum::trace!(target: LOG_TARGET, ?peer_id, ?observed_role, ?maybe_authority, "Peer connected");
let version = match protocol_version.try_into() {
Ok(version) => version,
@@ -1200,7 +1210,11 @@ async fn handle_network_msg<Context>(
},
UpdatedAuthorityIds(peer_id, authority_ids) => {
gum::trace!(target: LOG_TARGET, ?peer_id, ?authority_ids, "Updated authority ids");
state.peer_ids.insert(peer_id, authority_ids);
if let Some(version) = state.peer_data.get(&peer_id).map(|d| d.version) {
if state.peer_ids.insert(peer_id, authority_ids).is_none() {
declare(ctx, state, &peer_id, version).await;
}
}
},
NewGossipTopology { .. } => {
// impossible!
@@ -1369,7 +1383,11 @@ async fn run_inner<Context>(
"Failed to process message"
)?;
},
FromOrchestra::Signal(ActiveLeaves(_update)) => {}
FromOrchestra::Signal(ActiveLeaves(update)) => {
if update.activated.is_some() {
*reconnect_timeout = futures_timer::Delay::new(RECONNECT_AFTER_LEAF_TIMEOUT).fuse();
}
}
FromOrchestra::Signal(BlockFinalized(..)) => {}
FromOrchestra::Signal(Conclude) => return Ok(()),
},
@@ -1390,7 +1408,7 @@ async fn run_inner<Context>(
// The request is still alive, it should be kept in a waiting queue.
} else {
for authority_id in state.peer_ids.get(&peer_id).into_iter().flatten() {
// Timeout not hit, this peer is no longer interested in this relay parent.
// This peer has received the candidate. Not interested anymore.
state.validator_groups_buf.reset_validator_interest(candidate_hash, authority_id);
}
waiting.waiting_peers.remove(&(peer_id, candidate_hash));
@@ -1446,12 +1464,11 @@ async fn run_inner<Context>(
}
}
_ = reconnect_timeout => {
state.reconnect_timeout =
connect_to_validators(&mut ctx, &state.validator_groups_buf).await;
connect_to_validators(&mut ctx, &state.validator_groups_buf).await;
gum::trace!(
target: LOG_TARGET,
timeout = ?RECONNECT_TIMEOUT,
timeout = ?RECONNECT_AFTER_LEAF_TIMEOUT,
"Peer-set updated due to a timeout"
);
},
@@ -133,7 +133,7 @@ impl ValidatorGroupsBuffer {
}
}
/// Note that a validator is no longer interested in a given relay parent.
/// Note that a validator is no longer interested in a given candidate.
pub fn reset_validator_interest(
&mut self,
candidate_hash: CandidateHash,