Delay reputation updates (#7214)

* Add futures-timer

* Make cost_or_benefit public

* Update ReportPeer message format

* Add delay to reputation updates (dirtywork)

* Update ReputationAggregator

* Update tests

* Fix flucky tests

* Move reputation to state

* Use the main loop for handling reputation sendings

* Update

* Move reputation to utils

* Update reputation sending

* Fix arguments order

* Update state

* Remove new from state

* Add constant

* Add failing test for delay

* Change mocking approach

* Fix type errors

* Fix comments

* Add message handling to select

* Fix bitfields-distribution tests

* Add docs to reputation aggregator

* Replace .into_base_rep

* Use one REPUTATION_CHANGE_INTERVAL by default

* Add reputation change to statement-distribution

* Update polkadot-availability-bitfield-distribution

* Update futures selecting in subsystems

* Update reputation adding

* Send malicious changes right away without adding to state

* Add reputation to StatementDistributionSubsystem

* Handle reputation in statement distribution

* Add delay test for polkadot-statement-distribution

* Fix collator-protocol tests before applying reputation delay

* Remove into_base_rep

* Add reputation to State

* Fix failed tests

* Add reputation delay

* Update tests

* Add batched network message for peer reporting

* Update approval-distribution tests

* Update bitfield-distribution tests

* Update statement-distribution tests

* Update collator-protocol tests

* Remove levels in matching

* Address clippy errors

* Fix overseer test

* Add a metric for original count of rep changes

* Update Reputation

* Revert "Add a metric for original count of rep changes"

This reverts commit 6c9b0c1ec34491d16e562bdcba8db6b9dcf484db.

* Update node/subsystem-util/src/reputation.rs

Co-authored-by: Vsevolod Stakhov <vsevolod.stakhov@parity.io>

* Remove redundant vec

---------

Co-authored-by: Vsevolod Stakhov <vsevolod.stakhov@parity.io>
This commit is contained in:
Andrei Eres
2023-06-15 15:46:06 +02:00
committed by GitHub
parent d3d9d4ae66
commit 0a1bc654d9
27 changed files with 2231 additions and 805 deletions
@@ -35,11 +35,18 @@ use polkadot_node_subsystem::{
jaeger, messages::*, overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, PerLeafSpan,
SpawnedSubsystem, SubsystemError, SubsystemResult,
};
use polkadot_node_subsystem_util::{self as util};
use polkadot_node_subsystem_util::{
self as util,
reputation::{ReputationAggregator, REPUTATION_CHANGE_INTERVAL},
};
use futures::select;
use polkadot_primitives::{Hash, SignedAvailabilityBitfield, SigningContext, ValidatorId};
use rand::{CryptoRng, Rng, SeedableRng};
use std::collections::{HashMap, HashSet};
use std::{
collections::{HashMap, HashSet},
time::Duration,
};
use self::metrics::Metrics;
@@ -97,6 +104,9 @@ struct ProtocolState {
/// Additional data particular to a relay parent.
per_relay_parent: HashMap<Hash, PerRelayParentData>,
/// Aggregated reputation change
reputation: ReputationAggregator,
}
/// Data for a particular relay parent.
@@ -178,95 +188,107 @@ impl BitfieldDistribution {
async fn run<Context>(self, ctx: Context) {
let mut state = ProtocolState::default();
let mut rng = rand::rngs::StdRng::from_entropy();
self.run_inner(ctx, &mut state, &mut rng).await
self.run_inner(ctx, &mut state, REPUTATION_CHANGE_INTERVAL, &mut rng).await
}
async fn run_inner<Context>(
self,
mut ctx: Context,
state: &mut ProtocolState,
reputation_interval: Duration,
rng: &mut (impl CryptoRng + Rng),
) {
// work: process incoming messages from the overseer and process accordingly.
let new_reputation_delay = || futures_timer::Delay::new(reputation_interval).fuse();
let mut reputation_delay = new_reputation_delay();
loop {
let message = match ctx.recv().await {
Ok(message) => message,
Err(err) => {
gum::error!(
target: LOG_TARGET,
?err,
"Failed to receive a message from Overseer, exiting"
);
return
select! {
_ = reputation_delay => {
state.reputation.send(ctx.sender()).await;
reputation_delay = new_reputation_delay();
},
};
match message {
FromOrchestra::Communication {
msg:
BitfieldDistributionMessage::DistributeBitfield(
relay_parent,
signed_availability,
),
} => {
gum::trace!(target: LOG_TARGET, ?relay_parent, "Processing DistributeBitfield");
handle_bitfield_distribution(
&mut ctx,
state,
&self.metrics,
relay_parent,
signed_availability,
rng,
)
.await;
},
FromOrchestra::Communication {
msg: BitfieldDistributionMessage::NetworkBridgeUpdate(event),
} => {
gum::trace!(target: LOG_TARGET, "Processing NetworkMessage");
// a network message was received
handle_network_msg(&mut ctx, state, &self.metrics, event, rng).await;
},
FromOrchestra::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated,
..
})) => {
let _timer = self.metrics.time_active_leaves_update();
if let Some(activated) = activated {
let relay_parent = activated.hash;
gum::trace!(target: LOG_TARGET, ?relay_parent, "activated");
let span = PerLeafSpan::new(activated.span, "bitfield-distribution");
let _span = span.child("query-basics");
// query validator set and signing context per relay_parent once only
match query_basics(&mut ctx, relay_parent).await {
Ok(Some((validator_set, signing_context))) => {
// If our runtime API fails, we don't take down the node,
// but we might alter peers' reputations erroneously as a result
// of not having the correct bookkeeping. If we have lost a race
// with state pruning, it is unlikely that peers will be sending
// us anything to do with this relay-parent anyway.
let _ = state.per_relay_parent.insert(
message = ctx.recv().fuse() => {
let message = match message {
Ok(message) => message,
Err(err) => {
gum::error!(
target: LOG_TARGET,
?err,
"Failed to receive a message from Overseer, exiting"
);
return
},
};
match message {
FromOrchestra::Communication {
msg:
BitfieldDistributionMessage::DistributeBitfield(
relay_parent,
PerRelayParentData::new(signing_context, validator_set, span),
);
},
Err(err) => {
gum::warn!(target: LOG_TARGET, ?err, "query_basics has failed");
},
_ => {},
}
signed_availability,
),
} => {
gum::trace!(target: LOG_TARGET, ?relay_parent, "Processing DistributeBitfield");
handle_bitfield_distribution(
&mut ctx,
state,
&self.metrics,
relay_parent,
signed_availability,
rng,
)
.await;
},
FromOrchestra::Communication {
msg: BitfieldDistributionMessage::NetworkBridgeUpdate(event),
} => {
gum::trace!(target: LOG_TARGET, "Processing NetworkMessage");
// a network message was received
handle_network_msg(&mut ctx, state, &self.metrics, event, rng).await;
},
FromOrchestra::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated,
..
})) => {
let _timer = self.metrics.time_active_leaves_update();
if let Some(activated) = activated {
let relay_parent = activated.hash;
gum::trace!(target: LOG_TARGET, ?relay_parent, "activated");
let span = PerLeafSpan::new(activated.span, "bitfield-distribution");
let _span = span.child("query-basics");
// query validator set and signing context per relay_parent once only
match query_basics(&mut ctx, relay_parent).await {
Ok(Some((validator_set, signing_context))) => {
// If our runtime API fails, we don't take down the node,
// but we might alter peers' reputations erroneously as a result
// of not having the correct bookkeeping. If we have lost a race
// with state pruning, it is unlikely that peers will be sending
// us anything to do with this relay-parent anyway.
let _ = state.per_relay_parent.insert(
relay_parent,
PerRelayParentData::new(signing_context, validator_set, span),
);
},
Err(err) => {
gum::warn!(target: LOG_TARGET, ?err, "query_basics has failed");
},
_ => {},
}
}
},
FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, number)) => {
gum::trace!(target: LOG_TARGET, ?hash, %number, "block finalized");
},
FromOrchestra::Signal(OverseerSignal::Conclude) => {
gum::info!(target: LOG_TARGET, "Conclude");
return
},
}
},
FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, number)) => {
gum::trace!(target: LOG_TARGET, ?hash, %number, "block finalized");
},
FromOrchestra::Signal(OverseerSignal::Conclude) => {
gum::info!(target: LOG_TARGET, "Conclude");
return
},
}
}
}
}
@@ -274,6 +296,7 @@ impl BitfieldDistribution {
/// Modify the reputation of a peer based on its behavior.
async fn modify_reputation(
reputation: &mut ReputationAggregator,
sender: &mut impl overseer::BitfieldDistributionSenderTrait,
relay_parent: Hash,
peer: PeerId,
@@ -281,7 +304,7 @@ async fn modify_reputation(
) {
gum::trace!(target: LOG_TARGET, ?relay_parent, ?rep, %peer, "reputation change");
sender.send_message(NetworkBridgeTxMessage::ReportPeer(peer, rep)).await
reputation.modify(sender, peer, rep).await;
}
/// Distribute a given valid and signature checked bitfield message.
///
@@ -454,7 +477,14 @@ async fn process_incoming_peer_message<Context>(
);
// we don't care about this, not part of our view.
if !state.view.contains(&relay_parent) {
modify_reputation(ctx.sender(), relay_parent, origin, COST_NOT_IN_VIEW).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
COST_NOT_IN_VIEW,
)
.await;
return
}
@@ -463,7 +493,14 @@ async fn process_incoming_peer_message<Context>(
let job_data: &mut _ = if let Some(ref mut job_data) = job_data {
job_data
} else {
modify_reputation(ctx.sender(), relay_parent, origin, COST_NOT_IN_VIEW).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
COST_NOT_IN_VIEW,
)
.await;
return
};
@@ -480,7 +517,14 @@ async fn process_incoming_peer_message<Context>(
let validator_set = &job_data.validator_set;
if validator_set.is_empty() {
gum::trace!(target: LOG_TARGET, ?relay_parent, ?origin, "Validator set is empty",);
modify_reputation(ctx.sender(), relay_parent, origin, COST_MISSING_PEER_SESSION_KEY).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
COST_MISSING_PEER_SESSION_KEY,
)
.await;
return
}
@@ -490,7 +534,14 @@ async fn process_incoming_peer_message<Context>(
let validator = if let Some(validator) = validator_set.get(validator_index.0 as usize) {
validator.clone()
} else {
modify_reputation(ctx.sender(), relay_parent, origin, COST_VALIDATOR_INDEX_INVALID).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
COST_VALIDATOR_INDEX_INVALID,
)
.await;
return
};
@@ -503,7 +554,14 @@ async fn process_incoming_peer_message<Context>(
received_set.insert(validator.clone());
} else {
gum::trace!(target: LOG_TARGET, ?validator_index, ?origin, "Duplicate message");
modify_reputation(ctx.sender(), relay_parent, origin, COST_PEER_DUPLICATE_MESSAGE).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
COST_PEER_DUPLICATE_MESSAGE,
)
.await;
return
};
@@ -517,13 +575,27 @@ async fn process_incoming_peer_message<Context>(
"already received a message for validator",
);
if old_message.signed_availability.as_unchecked() == &bitfield {
modify_reputation(ctx.sender(), relay_parent, origin, BENEFIT_VALID_MESSAGE).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
BENEFIT_VALID_MESSAGE,
)
.await;
}
return
}
let signed_availability = match bitfield.try_into_checked(&signing_context, &validator) {
Err(_) => {
modify_reputation(ctx.sender(), relay_parent, origin, COST_SIGNATURE_INVALID).await;
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
COST_SIGNATURE_INVALID,
)
.await;
return
},
Ok(bitfield) => bitfield,
@@ -552,7 +624,14 @@ async fn process_incoming_peer_message<Context>(
)
.await;
modify_reputation(ctx.sender(), relay_parent, origin, BENEFIT_VALID_MESSAGE_FIRST).await
modify_reputation(
&mut state.reputation,
ctx.sender(),
relay_parent,
origin,
BENEFIT_VALID_MESSAGE_FIRST,
)
.await
}
/// Deal with network bridge updates and track what needs to be tracked