mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-04-26 08:47:57 +00:00
Req/res optimization for statement distribution (#2803)
* Wip * Increase proposer timeout. * WIP. * Better timeout values now that we are going to be connected to all nodes. (#2778) * Better timeout values. * Fix typo. * Fix validator bandwidth. * Fix compilation. * Better and more consistent sizes. Most importantly code size is now 5 Meg, which is the limit we currently want to support in statement distribution. * Introduce statement fetching request. * WIP * Statement cache retrieval logic. * Review remarks by @rphmeier * Fixes. * Better requester logic. * WIP: Handle requester messages. * Missing dep. * Fix request launching logic. * Finish fetching logic. * Sending logic. * Redo code size calculations. Now that max code size is compressed size. * Update Cargo.lock (new dep) * Get request receiver to statement distribution. * Expose new functionality for responding to requests. * Cleanup. * Responder logic. * Fixes + Cleanup. * Cargo.lock * Whitespace. * Add lost copyright. * Launch responder task. * Typo. * info -> warn * Typo. * Fix. * Fix. * Update comment. * Doc fix. * Better large statement heuristics. * Fix tests. * Fix network bridge tests. * Add test for size estimate. * Very simple tests that checks we get LargeStatement. * Basic check, that fetching of large candidates is performed. * More tests. * Basic metrics for responder. * More metrics. * Use Encode::encoded_size(). * Some useful spans. * Get rid of redundant metrics. * Don't add peer on duplicate. * Properly check hash instead of relying on signatures alone. * Preserve ordering + better flood protection. * Get rid of redundant clone. * Don't shutdown responder on failed query. And add test for this. * Smaller fixes. * Quotes. * Better queue size calculation. * A bit saner response sizes. * Fixes.
This commit is contained in:
@@ -291,10 +291,7 @@ pub mod v1 {
|
||||
use parity_scale_codec::{Encode, Decode};
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use polkadot_primitives::v1::{
|
||||
CandidateIndex, CollatorId, Hash, Id as ParaId, SignedAvailabilityBitfield,
|
||||
CollatorSignature,
|
||||
};
|
||||
use polkadot_primitives::v1::{CandidateHash, CandidateIndex, CollatorId, CollatorSignature, CompactStatement, Hash, Id as ParaId, SignedAvailabilityBitfield, ValidatorIndex, ValidatorSignature};
|
||||
use polkadot_node_primitives::{
|
||||
approval::{IndirectAssignmentCert, IndirectSignedApprovalVote},
|
||||
SignedFullStatement,
|
||||
@@ -313,7 +310,68 @@ pub mod v1 {
|
||||
pub enum StatementDistributionMessage {
|
||||
/// A signed full statement under a given relay-parent.
|
||||
#[codec(index = 0)]
|
||||
Statement(Hash, SignedFullStatement)
|
||||
Statement(Hash, SignedFullStatement),
|
||||
/// Seconded statement with large payload (e.g. containing a runtime upgrade).
|
||||
///
|
||||
/// We only gossip the hash in that case, actual payloads can be fetched from sending node
|
||||
/// via req/response.
|
||||
#[codec(index = 1)]
|
||||
LargeStatement(StatementMetadata),
|
||||
}
|
||||
|
||||
/// Data that makes a statement unique.
|
||||
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq, Hash)]
|
||||
pub struct StatementMetadata {
|
||||
/// Relay parent this statement is relevant under.
|
||||
pub relay_parent: Hash,
|
||||
/// Hash of the candidate that got validated.
|
||||
pub candidate_hash: CandidateHash,
|
||||
/// Validator that attested the validity.
|
||||
pub signed_by: ValidatorIndex,
|
||||
/// Signature of seconding validator.
|
||||
pub signature: ValidatorSignature,
|
||||
}
|
||||
|
||||
impl StatementDistributionMessage {
|
||||
/// Get the metadata of this message: built from the full statement, or cloned from a `LargeStatement`.
|
||||
pub fn get_metadata(&self) -> StatementMetadata {
|
||||
match self {
|
||||
Self::Statement(relay_parent, statement) => StatementMetadata {
|
||||
relay_parent: *relay_parent,
|
||||
candidate_hash: statement.payload().candidate_hash(),
|
||||
signed_by: statement.validator_index(),
|
||||
signature: statement.signature().clone(),
|
||||
},
|
||||
Self::LargeStatement(metadata) => metadata.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get fingerprint describing the contained statement uniquely; a `LargeStatement` is always reported as `Seconded`.
|
||||
pub fn get_fingerprint(&self) -> (CompactStatement, ValidatorIndex) {
|
||||
match self {
|
||||
Self::Statement(_, statement) =>
|
||||
(statement.payload().to_compact(), statement.validator_index()),
|
||||
Self::LargeStatement(meta) =>
|
||||
(CompactStatement::Seconded(meta.candidate_hash), meta.signed_by),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the relay parent contained in this message.
|
||||
pub fn get_relay_parent(&self) -> Hash {
|
||||
match self {
|
||||
Self::Statement(r, _) => *r,
|
||||
Self::LargeStatement(meta) => meta.relay_parent,
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether or not this message is the `LargeStatement` variant.
// NOTE(review): the `if let … { true } else { false }` below is equivalent to `matches!(self, Self::LargeStatement(_))`.
|
||||
pub fn is_large_statement(&self) -> bool {
|
||||
if let Self::LargeStatement(_) = self {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Network messages used by the approval distribution subsystem.
|
||||
|
||||
@@ -32,11 +32,12 @@
|
||||
//!
|
||||
//! Versioned (v1 module): The actual requests and responses as sent over the network.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::{borrow::Cow, u64};
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::channel::mpsc;
|
||||
use polkadot_node_primitives::MAX_POV_SIZE;
|
||||
use polkadot_primitives::v1::MAX_CODE_SIZE;
|
||||
use strum::EnumIter;
|
||||
|
||||
pub use sc_network::config as network;
|
||||
@@ -64,8 +65,15 @@ pub enum Protocol {
|
||||
PoVFetching,
|
||||
/// Protocol for fetching available data.
|
||||
AvailableDataFetching,
|
||||
/// Fetching of statements that are too large for gossip.
|
||||
StatementFetching,
|
||||
}
|
||||
|
||||
|
||||
/// Minimum bandwidth we expect for validators - 500Mbit/s is the recommendation, so approximately
|
||||
/// 50Meg bytes per second:
|
||||
const MIN_BANDWIDTH_BYTES: u64 = 50 * 1024 * 1024;
|
||||
|
||||
/// Default request timeout in seconds.
|
||||
///
|
||||
/// When decreasing this value, take into account that the very first request might need to open a
|
||||
@@ -78,14 +86,22 @@ const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3);
|
||||
/// peer set as well).
|
||||
const DEFAULT_REQUEST_TIMEOUT_CONNECTED: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Minimum bandwidth we expect for validators - 500Mbit/s is the recommendation, so approximately
|
||||
/// 50Meg bytes per second:
|
||||
const MIN_BANDWIDTH_BYTES: u64 = 50 * 1024 * 1024;
|
||||
/// Timeout for PoV like data, 2 times what it should take, assuming we can fully utilize the
|
||||
/// bandwidth. This amounts to two seconds right now.
|
||||
const POV_REQUEST_TIMEOUT_CONNECTED: Duration =
|
||||
Duration::from_millis(2 * 1000 * (MAX_POV_SIZE as u64) / MIN_BANDWIDTH_BYTES);
|
||||
|
||||
/// We want to time out statement requests fast, so we don't waste time on slow nodes. Responders will
|
||||
/// try their best to either serve within that timeout or return an error immediately. (We need to
|
||||
/// fit statement distribution within a block of 6 seconds.)
|
||||
const STATEMENTS_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
|
||||
/// We don't want a slow peer to slow down all the others, at the same time we want to get out the
|
||||
/// data quickly in full to at least some peers (as this will reduce load on us as they then can
|
||||
/// start serving the data). So this value is a tradeoff. 3 seems to be sensible. So we would need
|
||||
/// to have 3 slow nodes connected, to delay transfer for others by `STATEMENTS_TIMEOUT`.
|
||||
pub const MAX_PARALLEL_STATEMENT_REQUESTS: u32 = 3;
|
||||
|
||||
impl Protocol {
|
||||
/// Get a configuration for a given Request response protocol.
|
||||
///
|
||||
@@ -105,16 +121,16 @@ impl Protocol {
|
||||
let cfg = match self {
|
||||
Protocol::ChunkFetching => RequestResponseConfig {
|
||||
name: p_name,
|
||||
max_request_size: 10_000,
|
||||
max_response_size: 10_000_000,
|
||||
max_request_size: 1_000,
|
||||
max_response_size: MAX_POV_SIZE as u64 / 10,
|
||||
// We are connected to all validators:
|
||||
request_timeout: DEFAULT_REQUEST_TIMEOUT_CONNECTED,
|
||||
inbound_queue: Some(tx),
|
||||
},
|
||||
Protocol::CollationFetching => RequestResponseConfig {
|
||||
name: p_name,
|
||||
max_request_size: 10_000,
|
||||
max_response_size: MAX_POV_SIZE as u64,
|
||||
max_request_size: 1_000,
|
||||
max_response_size: MAX_POV_SIZE as u64 + 1000,
|
||||
// Taken from initial implementation in collator protocol:
|
||||
request_timeout: POV_REQUEST_TIMEOUT_CONNECTED,
|
||||
inbound_queue: Some(tx),
|
||||
@@ -130,10 +146,28 @@ impl Protocol {
|
||||
name: p_name,
|
||||
max_request_size: 1_000,
|
||||
// Available data size is dominated by the PoV size.
|
||||
max_response_size: MAX_POV_SIZE as u64,
|
||||
max_response_size: MAX_POV_SIZE as u64 + 1000,
|
||||
request_timeout: POV_REQUEST_TIMEOUT_CONNECTED,
|
||||
inbound_queue: Some(tx),
|
||||
},
|
||||
Protocol::StatementFetching => RequestResponseConfig {
|
||||
name: p_name,
|
||||
max_request_size: 1_000,
|
||||
// Response size is dominated by code size.
|
||||
// + 1000 to account for protocol overhead (should be way less).
|
||||
max_response_size: MAX_CODE_SIZE as u64 + 1000,
|
||||
// We need statement fetching to be fast and will try our best at the responding
|
||||
// side to answer requests within that timeout, assuming a bandwidth of 500Mbit/s
|
||||
// - which is the recommended minimum bandwidth for nodes on Kusama as of April
|
||||
// 2021.
|
||||
// Responders will reject requests, if it is unlikely they can serve them within
|
||||
// the timeout, so the requester can immediately try another node, instead of
|
||||
// waiting for timeout on an overloaded node. Fetches from slow nodes will likely
|
||||
// fail, but this is desired, so we can quickly move on to a faster one - we should
|
||||
// also decrease its reputation.
|
||||
request_timeout: Duration::from_secs(1),
|
||||
inbound_queue: Some(tx),
|
||||
},
|
||||
};
|
||||
(rx, cfg)
|
||||
}
|
||||
@@ -154,6 +188,26 @@ impl Protocol {
|
||||
// Validators are constantly self-selecting to request available data which may lead
|
||||
// to constant load and occasional burstiness.
|
||||
Protocol::AvailableDataFetching => 100,
|
||||
// Our queue size approximation is how many blocks of the size of
|
||||
// a runtime we can transfer within a statements timeout, minus the requests we handle
|
||||
// in parallel.
|
||||
Protocol::StatementFetching => {
|
||||
// We assume we can utilize up to 70% of the available bandwidth for statements.
|
||||
// This is just a guess/estimate, with the following considerations: If we are
|
||||
// faster than that, queue size will stay low anyway, even if not - requesters will
|
||||
// get an immediate error, but if we are slower, requesters will run in a timeout -
|
||||
// wasting precious time.
|
||||
let available_bandwidth = 7 * MIN_BANDWIDTH_BYTES / 10;
|
||||
let size = u64::saturating_sub(
|
||||
STATEMENTS_TIMEOUT.as_millis() as u64 * available_bandwidth / (1000 * MAX_CODE_SIZE as u64),
|
||||
MAX_PARALLEL_STATEMENT_REQUESTS as u64
|
||||
);
|
||||
debug_assert!(
|
||||
size > 0,
|
||||
"We should have a channel size greater zero, otherwise we won't accept any requests."
|
||||
);
|
||||
size as usize
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,6 +223,7 @@ impl Protocol {
|
||||
Protocol::CollationFetching => "/polkadot/req_collation/1",
|
||||
Protocol::PoVFetching => "/polkadot/req_pov/1",
|
||||
Protocol::AvailableDataFetching => "/polkadot/req_available_data/1",
|
||||
Protocol::StatementFetching => "/polkadot/req_statement/1",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,8 @@ use sc_network::PeerId;
|
||||
|
||||
use polkadot_primitives::v1::AuthorityDiscoveryId;
|
||||
|
||||
use crate::UnifiedReputationChange;
|
||||
|
||||
use super::{v1, Protocol};
|
||||
|
||||
/// Common properties of any `Request`.
|
||||
@@ -47,6 +49,8 @@ pub enum Requests {
|
||||
PoVFetching(OutgoingRequest<v1::PoVFetchingRequest>),
|
||||
/// Request full available data from a node.
|
||||
AvailableDataFetching(OutgoingRequest<v1::AvailableDataFetchingRequest>),
|
||||
/// Requests for fetching large statements as part of statement distribution.
|
||||
StatementFetching(OutgoingRequest<v1::StatementFetchingRequest>),
|
||||
}
|
||||
|
||||
impl Requests {
|
||||
@@ -57,6 +61,7 @@ impl Requests {
|
||||
Self::CollationFetching(_) => Protocol::CollationFetching,
|
||||
Self::PoVFetching(_) => Protocol::PoVFetching,
|
||||
Self::AvailableDataFetching(_) => Protocol::AvailableDataFetching,
|
||||
Self::StatementFetching(_) => Protocol::StatementFetching,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,6 +78,7 @@ impl Requests {
|
||||
Self::CollationFetching(r) => r.encode_request(),
|
||||
Self::PoVFetching(r) => r.encode_request(),
|
||||
Self::AvailableDataFetching(r) => r.encode_request(),
|
||||
Self::StatementFetching(r) => r.encode_request(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -199,6 +205,22 @@ pub struct IncomingRequest<Req> {
|
||||
pending_response: oneshot::Sender<netconfig::OutgoingResponse>,
|
||||
}
|
||||
|
||||
/// Typed variant of [`netconfig::OutgoingResponse`].
|
||||
///
|
||||
/// Responses to `IncomingRequest`s.
|
||||
pub struct OutgoingResponse<Response> {
|
||||
/// The payload of the response.
|
||||
pub result: Result<Response, ()>,
|
||||
|
||||
/// Reputation changes accrued while handling the request. To be applied to the reputation of
|
||||
/// the peer sending the request.
|
||||
pub reputation_changes: Vec<UnifiedReputationChange>,
|
||||
|
||||
/// If provided, the `oneshot::Sender` will be notified when the response has been sent to the
|
||||
/// peer.
|
||||
pub sent_feedback: Option<oneshot::Sender<()>>,
|
||||
}
|
||||
|
||||
impl<Req> IncomingRequest<Req>
|
||||
where
|
||||
Req: IsRequest,
|
||||
@@ -232,6 +254,31 @@ where
|
||||
})
|
||||
.map_err(|_| resp)
|
||||
}
|
||||
|
||||
/// Send response with additional options.
|
||||
///
|
||||
/// This variant allows for waiting for the response to be sent out, allows for changing peer's
|
||||
/// reputation and allows for not sending a response at all (for only changing the peer's
|
||||
/// reputation). Returns `Err(())` if sending on the `pending_response` channel fails.
|
||||
pub fn send_outgoing_response(self, resp: OutgoingResponse<<Req as IsRequest>::Response>)
|
||||
-> Result<(), ()> {
|
||||
let OutgoingResponse {
|
||||
result,
|
||||
reputation_changes,
|
||||
sent_feedback,
|
||||
} = resp;
|
||||
|
||||
let response = netconfig::OutgoingResponse {
|
||||
result: result.map(|v| v.encode()),
|
||||
reputation_changes: reputation_changes
|
||||
.into_iter()
|
||||
.map(|c| c.into_base_rep())
|
||||
.collect(),
|
||||
sent_feedback,
|
||||
};
|
||||
|
||||
self.pending_response.send(response).map_err(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
/// Future for actually receiving a typed response for an OutgoingRequest.
|
||||
|
||||
@@ -18,10 +18,7 @@
|
||||
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
|
||||
use polkadot_primitives::v1::{
|
||||
CandidateHash, CandidateReceipt, ValidatorIndex,
|
||||
Hash,
|
||||
};
|
||||
use polkadot_primitives::v1::{CandidateHash, CandidateReceipt, CommittedCandidateReceipt, Hash, ValidatorIndex};
|
||||
use polkadot_primitives::v1::Id as ParaId;
|
||||
use polkadot_node_primitives::{AvailableData, PoV, ErasureChunk};
|
||||
|
||||
@@ -169,3 +166,29 @@ impl IsRequest for AvailableDataFetchingRequest {
|
||||
type Response = AvailableDataFetchingResponse;
|
||||
const PROTOCOL: Protocol = Protocol::AvailableDataFetching;
|
||||
}
|
||||
|
||||
/// Request for fetching a large statement via request/response.
|
||||
#[derive(Debug, Clone, Encode, Decode)]
|
||||
pub struct StatementFetchingRequest {
|
||||
/// Relay parent of the requested statement; needed to locate and identify it.
|
||||
pub relay_parent: Hash,
|
||||
/// Hash of the candidate that was used to create the `CommittedCandidateReceipt`.
|
||||
pub candidate_hash: CandidateHash,
|
||||
}
|
||||
|
||||
/// Response carrying the requested full statement.
|
||||
///
|
||||
/// In this protocol the requester will only request data it was previously notified about,
|
||||
/// therefore not having the data is not really an option and would just result in a
|
||||
/// `RequestFailure`.
|
||||
#[derive(Debug, Clone, Encode, Decode)]
|
||||
pub enum StatementFetchingResponse {
|
||||
/// The committed candidate receipt the requester is missing to reconstruct the full signed statement.
|
||||
#[codec(index = 0)]
|
||||
Statement(CommittedCandidateReceipt),
|
||||
}
|
||||
|
||||
// Associates `StatementFetchingRequest` with its response type and the `StatementFetching` protocol.
impl IsRequest for StatementFetchingRequest {
|
||||
type Response = StatementFetchingResponse;
|
||||
const PROTOCOL: Protocol = Protocol::StatementFetching;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user